In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [3]:
data = pd.merge(transactions, customers, on="CustomerID", how="left")
data = pd.merge(data, products, on="ProductID", how="left")

In [17]:
#Aggregate features for each customer
customer_features = data.groupby('CustomerID').agg({'TotalValue': 'sum','TransactionID': 'count','Category': lambda x: x.mode()[0],}).reset_index()

In [5]:
#Encode categorical data
customer_features = pd.get_dummies(customer_features, columns=['Category'], prefix='Cat')

In [6]:
#Normalize numerical features
scaler = StandardScaler()
numeric_cols = ['TotalValue', 'TransactionID']
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])

In [7]:
#Compute similarity using cosine similarity
customer_ids = customer_features['CustomerID']
feature_matrix = customer_features.drop(columns=['CustomerID'])
similarity_matrix = cosine_similarity(feature_matrix)

In [8]:
#Find top 3 similar customers for each target customer
def get_top_similar_customers(similarity_matrix, customer_ids, top_n=3):
    similar_customers = {}
    for idx, cust_id in enumerate(customer_ids):
        scores = list(enumerate(similarity_matrix[idx]))
        scores = sorted(scores, key=lambda x: x[1], reverse=True)
        top_customers = [(customer_ids[i], round(score, 3)) for i, score in scores[1:top_n+1]]
        similar_customers[cust_id] = top_customers
    return similar_customers

In [9]:
#Generate lookalikes for the first 20 customers
target_customers = customer_ids[customer_ids.isin([f'C{i:04d}' for i in range(1, 21)])]
lookalikes = get_top_similar_customers(similarity_matrix, customer_ids)

In [10]:
#Filter lookalikes for the target customers
lookalikes_filtered = {cust_id: lookalikes[cust_id] for cust_id in target_customers}

In [11]:
#Save the results as CSV file
lookalike_df = pd.DataFrame({
    'CustomerID': lookalikes_filtered.keys(),
    'Lookalikes': [str(value) for value in lookalikes_filtered.values()]
})
lookalike_df.to_csv('Lokesh_A_Lookalike.csv', index=False)

In [16]:
pd.set_option('display.max_colwidth', None)
lookalike_df.head(20)

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0072', np.float64(0.982)), ('C0190', np.float64(0.981)), ('C0069', np.float64(0.969))]"
1,C0002,"[('C0029', np.float64(1.0)), ('C0010', np.float64(0.999)), ('C0062', np.float64(0.97))]"
2,C0003,"[('C0178', np.float64(1.0)), ('C0052', np.float64(0.995)), ('C0166', np.float64(0.991))]"
3,C0004,"[('C0021', np.float64(1.0)), ('C0101', np.float64(0.999)), ('C0173', np.float64(0.995))]"
4,C0005,"[('C0112', np.float64(1.0)), ('C0197', np.float64(1.0)), ('C0186', np.float64(0.997))]"
5,C0006,"[('C0117', np.float64(0.996)), ('C0168', np.float64(0.954)), ('C0064', np.float64(0.919))]"
6,C0007,"[('C0120', np.float64(0.999)), ('C0005', np.float64(0.984)), ('C0050', np.float64(0.983))]"
7,C0008,"[('C0113', np.float64(0.951)), ('C0124', np.float64(0.908)), ('C0109', np.float64(0.853))]"
8,C0009,"[('C0077', np.float64(1.0)), ('C0083', np.float64(0.999)), ('C0062', np.float64(0.986))]"
9,C0010,"[('C0029', np.float64(1.0)), ('C0002', np.float64(0.999)), ('C0009', np.float64(0.971))]"
