In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [10]:
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')


In [11]:
merged_data = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(merged_data, products, on='ProductID')

In [12]:
customer_features = merged_data.groupby('CustomerID').agg({'TotalValue': 'sum', 'TransactionID': 'nunique',  'ProductID': 'nunique', }).reset_index()

In [13]:
customer_features['Region'] = customers['Region']
customer_features['SignupYear'] = pd.to_datetime(customers['SignupDate']).dt.year

In [14]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[['TotalValue', 'TransactionID', 'ProductID', 'SignupYear']])

In [16]:
similarity_matrix = cosine_similarity(scaled_features)

In [18]:
lookalike_dict = {}
for i in range(min(20, len(customers))):  
    similar_customers = np.argsort(similarity_matrix[i])[-4:-1]  
    
   
    similar_customers = [j for j in similar_customers if j != i]
    

    similar_scores = similarity_matrix[i][similar_customers]
    
    
    lookalike_dict[customers['CustomerID'][i]] = [
        (customers['CustomerID'][j], similar_scores[j]) for j in range(min(3, len(similar_scores)))
    ]

In [19]:
lookalike_list = []
for customer, recommendations in lookalike_dict.items():
    for rec in recommendations:
        lookalike_list.append([customer, rec[0], rec[1]])

lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

In [20]:
lookalike_df.to_csv('Harish_Karthik_Lookalike.csv', index=False)

In [21]:
lookalike_df.head()

Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0001,0.984819
1,C0001,C0002,0.987127
2,C0001,C0003,0.99991
3,C0002,C0001,0.999222
4,C0002,C0002,0.999452
