In [1]:
# Import required libraries
import pandas as pd

# Read the three source tables; paths are relative to the working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Sanity check: preview the first rows of each table, one after another.
print(customers.head(), products.head(), transactions.head(), sep='\n')

# Attach customer and product attributes to every transaction.
# Both joins are left joins, so each transaction row is kept even when its
# CustomerID or ProductID has no match in the lookup table.
merged_data = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)

# Preview the enriched transaction table.
print(merged_data.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [2]:
# Feature engineering, part 1: one summary row per customer with the total
# and mean of TotalValue and the number of transactions.
customer_profile = merged_data.groupby('CustomerID', as_index=False).agg(
    total_spending=('TotalValue', 'sum'),
    avg_spending=('TotalValue', 'mean'),
    purchase_frequency=('TransactionID', 'count'),
)

# Feature engineering, part 2: total spend per (customer, product category).
category_spending = merged_data.groupby(
    ['CustomerID', 'Category'], as_index=False
).agg(category_spending=('TotalValue', 'sum'))

# Reshape to a customer x category matrix; categories a customer never
# bought from are filled with 0 so every row has the same feature columns.
category_matrix = category_spending.pivot_table(
    values='category_spending',
    index='CustomerID',
    columns='Category',
    fill_value=0,
)

# Preview both feature tables.
print(customer_profile.head(), category_matrix.head(), sep='\n')


  CustomerID  total_spending  avg_spending  purchase_frequency
0      C0001         3354.52       670.904                   5
1      C0002         1862.74       465.685                   4
2      C0003         2725.38       681.345                   4
3      C0004         5354.88       669.360                   8
4      C0005         2034.24       678.080                   3
Category      Books  Clothing  Electronics  Home Decor
CustomerID                                            
C0001        114.60      0.00      2827.30      412.62
C0002          0.00   1025.46         0.00      837.28
C0003          0.00    122.36      1385.20     1217.82
C0004       1888.48      0.00      1355.74     2110.66
C0005          0.00      0.00      1180.38      853.86


In [3]:
from sklearn.preprocessing import StandardScaler

# Standardize each category column before computing similarities, so that
# categories with larger spend do not dominate the comparison.
scaler = StandardScaler()
scaler.fit(category_matrix)
category_matrix_scaled = scaler.transform(category_matrix)

# Preview the first five standardized rows (a plain NumPy array, no labels).
print(category_matrix_scaled[:5])



[[-0.84236036 -0.89978761  1.99532205 -0.43563089]
 [-0.95580154  0.20520014 -0.94466953  0.09959218]
 [-0.95580154 -0.7679382   0.49574229  0.57920832]
 [ 0.91358246 -0.89978761  0.46510806  1.7045052 ]
 [-0.95580154 -0.89978761  0.2827585   0.1204889 ]]


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of customer feature rows.
similarity_matrix = cosine_similarity(category_matrix_scaled)

# Label both axes with the customer IDs from the category matrix so that
# similarities can be looked up by CustomerID instead of by row position.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=category_matrix.index,
    columns=category_matrix.index,
)

# Preview the labelled similarity matrix.
print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.402215  0.648350  0.043313  0.661203 -0.960708   
C0002      -0.402215  1.000000  0.175482 -0.446094  0.257825  0.235584   
C0003       0.648350  0.175482  1.000000  0.328565  0.932178 -0.734670   
C0004       0.043313 -0.446094  0.328565  1.000000  0.092857 -0.005891   
C0005       0.661203  0.257825  0.932178  0.092857  1.000000 -0.814067   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.637812 -0.268011  0.171019 -0.381244  ... -0.059019  0.830892   
C0002       0.166689  0.470266  0.588281  0.703980  ... -0.527737 -0.050379   
C0003       0.996881  0.202597  0.198752 -0.372100  ... -0.448319  0.462407   
C0004       0.347577  0.112209 -0.725347 -0.913367  ...  0.068221 -0.451726   
C0005  

In [5]:
# Recommendation: top 3 lookalikes for each of the first 20 customers.
# lookalike_map maps CustomerID -> list of (lookalike CustomerID, similarity
# score) tuples, ordered from most to least similar.
TOP_N = 3
lookalike_map = {}
for cust_id in customers['CustomerID'].head(20):  # first 20 rows of Customers.csv (C0001 - C0020 in the previewed data)
    if cust_id not in similarity_df.columns:
        # The similarity matrix is built from merged transaction data, so a
        # customer who never made it into the category matrix (e.g. no
        # transactions) has no column here. Record an empty result instead of
        # raising KeyError.
        lookalike_map[cust_id] = []
        continue

    # Remove the customer's own entry by label rather than by position:
    # another customer can tie at (or, through float rounding, exceed) the
    # self-similarity score, in which case slicing off position 0 would drop
    # a real lookalike and keep the customer itself.
    candidate_scores = similarity_df[cust_id].drop(labels=cust_id)
    similar_customers = candidate_scores.nlargest(TOP_N)
    lookalike_map[cust_id] = list(zip(similar_customers.index, similar_customers.values))

# Check the lookalike map for the first customer
print(lookalike_map['C0001'])


[('C0091', 0.9888478853919915), ('C0069', 0.984343969157011), ('C0184', 0.9785619388073008)]


In [6]:
# Flatten the lookalike map into one (customer, lookalike, score) row per
# recommendation.
lookalike_rows = [
    (source_id, match_id, match_score)
    for source_id, matches in lookalike_map.items()
    for match_id, match_score in matches
]
lookalike_df = pd.DataFrame(
    lookalike_rows,
    columns=['CustomerID', 'Lookalike_CustomerID', 'Similarity_Score'],
)

# Write the result to the working directory without the row index.
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model completed and saved as Lookalike.csv")



Lookalike model completed and saved as Lookalike.csv
