In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Load the three input tables. The data folder is named once so the notebook
# can be pointed at a different location by editing a single line.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks'
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')
print(customers)

    CustomerID        CustomerName         Region  SignupDate
0        C0001    Lawrence Carroll  South America  2022-07-10
1        C0002      Elizabeth Lutz           Asia  2022-02-13
2        C0003      Michael Rivera  South America  2024-03-07
3        C0004  Kathleen Rodriguez  South America  2022-10-09
4        C0005         Laura Weber           Asia  2022-08-15
..         ...                 ...            ...         ...
195      C0196         Laura Watts         Europe  2022-06-07
196      C0197    Christina Harvey         Europe  2023-03-21
197      C0198         Rebecca Ray         Europe  2022-02-27
198      C0199      Andrea Jenkins         Europe  2022-12-03
199      C0200         Kelly Cross           Asia  2023-06-11

[200 rows x 4 columns]


In [11]:
# Merge DataFrames
# Join customer columns (on CustomerID) and then product columns (on ProductID)
# onto the transactions, using merge's default join type.
merged_df = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

In [10]:
# Feature Engineering - Create Customer Profiles based on Transaction Data
# One row per (CustomerID, ProductID) pair holding the summed Quantity and
# TotalValue; the grouping keys stay as ordinary columns.
customer_profiles = (
    merged_df
    .groupby(['CustomerID', 'ProductID'], as_index=False)[['Quantity', 'TotalValue']]
    .sum()
)
print(customer_profiles)

    CustomerID ProductID  Quantity  TotalValue
0        C0001      P022         3      412.62
1        C0001      P029         3     1300.92
2        C0001      P054         2      114.60
3        C0001      P083         2      911.44
4        C0001      P096         2      614.94
..         ...       ...       ...         ...
969      C0200      P034         4      868.20
970      C0200      P048         4     1665.60
971      C0200      P057         1      239.70
972      C0200      P061         4      627.84
973      C0200      P064         3     1357.26

[974 rows x 4 columns]


In [12]:
# Pivot to Create Customer-Product Matrix
# Rows are customers, columns are products, cells hold the summed TotalValue;
# pairs that never occur are filled with 0.
customer_product_matrix = (
    customer_profiles
    .pivot(index='CustomerID', columns='ProductID', values='TotalValue')
    .fillna(0)
)

In [14]:
# Compute Cosine Similarity
# Pairwise cosine similarity between the rows of the customer-product matrix,
# wrapped in a DataFrame labelled with the customer IDs on both axes.
similarity_matrix = cosine_similarity(customer_product_matrix)
customer_axis = customer_product_matrix.index
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_axis,
    columns=customer_axis,
)
print(similarity_df)

CustomerID     C0001  C0002     C0003     C0004     C0005     C0006     C0007  \
CustomerID                                                                      
C0001       1.000000    0.0  0.000000  0.000000  0.000000  0.000000  0.203038   
C0002       0.000000    1.0  0.000000  0.000000  0.000000  0.000000  0.000000   
C0003       0.000000    0.0  1.000000  0.139782  0.347737  0.000000  0.000000   
C0004       0.000000    0.0  0.139782  1.000000  0.186362  0.000000  0.000000   
C0005       0.000000    0.0  0.347737  0.186362  1.000000  0.000000  0.000000   
...              ...    ...       ...       ...       ...       ...       ...   
C0196       0.000000    0.0  0.000000  0.000000  0.000000  0.369701  0.000000   
C0197       0.000000    0.0  0.000000  0.000000  0.000000  0.368321  0.000000   
C0198       0.000000    0.0  0.000000  0.000000  0.000000  0.000000  0.000000   
C0199       0.119181    0.0  0.000000  0.038244  0.000000  0.000000  0.000000   
C0200       0.000000    0.0 

In [15]:
# Function to get top N similar customers
def get_top_n_similar_customers(customer_id, n=3, similarity=None):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to look up; must be a column of the
        similarity table.
    n : int, default 3
        Maximum number of lookalikes to return.
    similarity : pandas.DataFrame, optional
        Customer-by-customer similarity table labelled with customer IDs on
        both axes. When omitted, the notebook-level ``similarity_df`` is used.

    Returns
    -------
    tuple of (list, list of float)
        Lookalike IDs and their similarity scores, highest score first. The
        customer itself is excluded. Values are built-in Python objects
        rather than NumPy scalars, so they serialise as plain numbers.

    Raises
    ------
    KeyError
        If ``customer_id`` is not present in the similarity table.
    """
    table = similarity_df if similarity is None else similarity
    if customer_id not in table.columns:
        raise KeyError(f"customer {customer_id!r} is not in the similarity matrix")

    # Rank everyone by similarity, then remove the customer's own entry
    # (self-similarity) before taking the top n.
    ranked = table[customer_id].sort_values(ascending=False).drop(customer_id)
    top = ranked.head(n)

    # .tolist() converts NumPy scalars to native Python values.
    return top.index.tolist(), top.tolist()

In [16]:
# Generate Lookalike Recommendations for the first 20 customers
# Maps each customer ID to its list of (lookalike_id, similarity_score) pairs.
lookalike_recommendations = {
    customer_id: list(zip(*get_top_n_similar_customers(customer_id)))
    for customer_id in customers['CustomerID'][:20]
}

In [18]:
# Create Lookalike.csv
# Each row holds one customer and its list of (lookalike_id, score) pairs.
# The list is written to the CSV as text, so scores are coerced to built-in
# floats first: NumPy >= 2 renders its scalars as "np.float64(...)" inside a
# list's text form, which would otherwise end up in the file.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_recommendations.keys()),
    'Lookalikes': [
        [(lookalike_id, float(score)) for lookalike_id, score in matches]
        for matches in lookalike_recommendations.values()
    ]
})
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike Recommendations for the first 20 customers saved to 'Lookalike.csv'")


Lookalike Recommendations for the first 20 customers saved to 'Lookalike.csv'


In [19]:
# Display Lookalike Recommendations
# Show the first 20 rows of the recommendations table.
print(lookalike_df.iloc[:20])

   CustomerID                                         Lookalikes
0       C0001  [(C0050, 0.5326650751975152), (C0100, 0.528643...
1       C0002  [(C0109, 0.5363798777066622), (C0079, 0.528552...
2       C0003  [(C0181, 0.6200214220870752), (C0186, 0.556468...
3       C0004  [(C0063, 0.45920773392365694), (C0070, 0.44279...
4       C0005  [(C0096, 0.6683801091411397), (C0192, 0.641821...
5       C0006  [(C0058, 0.6833174088001752), (C0040, 0.662619...
6       C0007  [(C0020, 0.5798921734396695), (C0031, 0.442596...
7       C0008  [(C0165, 0.47995825082574334), (C0169, 0.42354...
8       C0009  [(C0140, 0.7865578016626213), (C0156, 0.720016...
9       C0010  [(C0029, 0.8903528709045399), (C0092, 0.865016...
10      C0011  [(C0135, 0.6044043836963852), (C0173, 0.422773...
11      C0012  [(C0164, 0.6931189300367439), (C0173, 0.626648...
12      C0013  [(C0058, 0.5569830949313298), (C0040, 0.540111...
13      C0014  [(C0128, 0.5740534126137472), (C0159, 0.441217...
14      C0015  [(C0073, 0