In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
dfCustomers = pd.read_csv('/content/Customers.csv')
dfProducts = pd.read_csv('/content/Products.csv')
dfTransactions = pd.read_csv('/content/Transactions.csv')

In [7]:
#for printing the csv files
dfProducts.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [9]:
dfTransactions = dfTransactions.merge(dfProducts, on='ProductID')
data = dfTransactions.merge(dfCustomers, on='CustomerID')
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName_x,Category_x,Price_y,ProductName_y,Category_y,Price,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [23]:
# Feature engineering
customer_features = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'Price': 'mean',
    'Category_x': lambda x: x.nunique(),
}).reset_index()

customer_features = customer_features.merge(dfCustomers[['CustomerID', 'Region']], on='CustomerID', how='left')

customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)
customer_features

Unnamed: 0,CustomerID,TotalValue,TransactionID,Price,Category_x,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,5,278.334000,3,False,False,True
1,C0002,1862.74,4,208.920000,2,False,False,False
2,C0003,2725.38,4,195.707500,3,False,False,True
3,C0004,5354.88,8,240.636250,3,False,False,True
4,C0005,2034.24,3,291.603333,2,False,False,False
...,...,...,...,...,...,...,...,...
194,C0196,4982.88,4,416.992500,3,True,False,False
195,C0197,1928.65,3,227.056667,2,True,False,False
196,C0198,931.83,2,239.705000,2,True,False,False
197,C0199,1979.28,4,250.610000,2,True,False,False


In [26]:
scaler = StandardScaler()
feature_matrix = scaler.fit_transform(customer_features.drop('CustomerID', axis=1))
feature_matrix
similarity_matrix = cosine_similarity(feature_matrix)
similarity_matrix

array([[ 1.        , -0.15972041,  0.78968254, ..., -0.34983286,
        -0.44423466, -0.05673672],
       [-0.15972041,  1.        ,  0.25591326, ...,  0.50219016,
         0.38867399, -0.28827427],
       [ 0.78968254,  0.25591326,  1.        , ..., -0.02103163,
        -0.19120634, -0.21856065],
       ...,
       [-0.34983286,  0.50219016, -0.02103163, ...,  1.        ,
         0.93829224, -0.48152458],
       [-0.44423466,  0.38867399, -0.19120634, ...,  0.93829224,
         1.        , -0.49344024],
       [-0.05673672, -0.28827427, -0.21856065, ..., -0.48152458,
        -0.49344024,  1.        ]])

In [27]:
lookalike_results = {}
for i, customer_id in enumerate(customer_features['CustomerID'][:20]):
    similarities = list(enumerate(similarity_matrix[i]))
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    top_3 = [(customer_features['CustomerID'][idx], score) for idx, score in similarities[1:4]]  # Exclude self
    lookalike_results[customer_id] = top_3


In [29]:
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results.keys()),
    'Lookalikes': [str(val) for val in lookalike_results.values()]
})
lookalike_df.to_csv('/content/FirstName_LastName_lookalike.csv', index=False)

print("Lookalike model completed. Results saved to 'Lookalike.csv'.")


Lookalike model completed. Results saved to 'Lookalike.csv'.
