<a href="https://colab.research.google.com/github/LoguPrasanth-hub/Lookalike-Model/blob/main/Lookalike_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler

In [None]:
# Importing the datasets
customer = pd.read_csv('Customers.csv')
customer.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [None]:
products=pd.read_csv('Products.csv')
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [None]:
transaction=pd.read_csv('Transactions.csv')
transaction.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [None]:
# Shape of the datasets
customer.shape, transaction.shape, products.shape

((200, 4), (1000, 7), (100, 4))

In [None]:
# Merging the three datasets into one
data=transaction.merge(customer, on='CustomerID').merge(products,on='ProductID')
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [None]:
# Creating aggerating groupby
feature=data.groupby('CustomerID').agg(region=('Region','first'),
                                       total_spend=('TotalValue','sum'),
                                       avg_spend=('TotalValue','mean'),
                                       purchase_count=('TransactionID','count'),
                                       quantity_purchase=('Quantity','sum'),
                                       most_purchase_category=('Category',lambda x:x.mode()[0]),
                                       most_purchase_product=('ProductID',lambda x:x.mode()[0])).reset_index()
feature.head()

Unnamed: 0,CustomerID,region,total_spend,avg_spend,purchase_count,quantity_purchase,most_purchase_category,most_purchase_product
0,C0001,South America,3354.52,670.904,5,12,Electronics,P022
1,C0002,Asia,1862.74,465.685,4,10,Clothing,P004
2,C0003,South America,2725.38,681.345,4,14,Home Decor,P002
3,C0004,South America,5354.88,669.36,8,23,Books,P008
4,C0005,Asia,2034.24,678.08,3,7,Electronics,P012


In [None]:
# Scaling the numerical columns
scaler=StandardScaler()
numerical=['total_spend','avg_spend','purchase_count','quantity_purchase']
feature[numerical]=scaler.fit_transform(feature[numerical])
feature.head()

Unnamed: 0,CustomerID,region,total_spend,avg_spend,purchase_count,quantity_purchase,most_purchase_category,most_purchase_product
0,C0001,South America,-0.061701,-0.070263,-0.011458,-0.122033,Electronics,P022
1,C0002,Asia,-0.877744,-0.934933,-0.467494,-0.448,Clothing,P004
2,C0003,South America,-0.405857,-0.026271,-0.467494,0.203934,Home Decor,P002
3,C0004,South America,1.032547,-0.076769,1.35665,1.670787,Books,P008
4,C0005,Asia,-0.783929,-0.040028,-0.92353,-0.936951,Electronics,P012


In [None]:
# Encoding on categorical column
onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Onehot encode on region column
region = onehot.fit_transform(feature[['region']])
region_df = pd.DataFrame(region, columns=onehot.get_feature_names_out(['region']))

# Onehot encode on purchase column
purchase= onehot.fit_transform(feature[['most_purchase_category']])
purchase_df = pd.DataFrame(purchase, columns=onehot.get_feature_names_out(['most_purchase_category']))

feature=pd.concat([feature,region_df,purchase_df],axis=1)
feature.head()

Unnamed: 0,CustomerID,region,total_spend,avg_spend,purchase_count,quantity_purchase,most_purchase_category,most_purchase_product,region_Asia,region_Europe,region_North America,region_South America,most_purchase_category_Books,most_purchase_category_Clothing,most_purchase_category_Electronics,most_purchase_category_Home Decor
0,C0001,South America,-0.061701,-0.070263,-0.011458,-0.122033,Electronics,P022,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,C0002,Asia,-0.877744,-0.934933,-0.467494,-0.448,Clothing,P004,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,C0003,South America,-0.405857,-0.026271,-0.467494,0.203934,Home Decor,P002,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,C0004,South America,1.032547,-0.076769,1.35665,1.670787,Books,P008,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,C0005,Asia,-0.783929,-0.040028,-0.92353,-0.936951,Electronics,P012,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Droping the column
feature=feature.drop(['region','most_purchase_category'],axis=1)
feature.shape

(199, 14)

In [None]:
# Label Encoding on categorical column
encoder=LabelEncoder()
feature['most_purchase_product']=encoder.fit_transform(feature['most_purchase_product'])
feature.head()

Unnamed: 0,CustomerID,total_spend,avg_spend,purchase_count,quantity_purchase,most_purchase_product,region_Asia,region_Europe,region_North America,region_South America,most_purchase_category_Books,most_purchase_category_Clothing,most_purchase_category_Electronics,most_purchase_category_Home Decor
0,C0001,-0.061701,-0.070263,-0.011458,-0.122033,21,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,C0002,-0.877744,-0.934933,-0.467494,-0.448,3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,C0003,-0.405857,-0.026271,-0.467494,0.203934,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,C0004,1.032547,-0.076769,1.35665,1.670787,7,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,C0005,-0.783929,-0.040028,-0.92353,-0.936951,11,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Modeling cosine Similarity
similarity=cosine_similarity(feature.drop('CustomerID',axis=1))
similarity

array([[1.        , 0.8304829 , 0.56492295, ..., 0.99645615, 0.96785798,
        0.99548534],
       [0.8304829 , 1.        , 0.52439488, ..., 0.85145717, 0.87350765,
        0.8293197 ],
       [0.56492295, 0.52439488, 1.        , ..., 0.54852164, 0.55303422,
        0.53583039],
       ...,
       [0.99645615, 0.85145717, 0.54852164, ..., 1.        , 0.97223224,
        0.99540475],
       [0.96785798, 0.87350765, 0.55303422, ..., 0.97223224, 1.        ,
        0.95271577],
       [0.99548534, 0.8293197 , 0.53583039, ..., 0.99540475, 0.95271577,
        1.        ]])

In [None]:
similarity_scores = []
for idx, row in enumerate(similarity):
    similar_indices = np.argsort(row)[::-1][1:4]  # Skip self (index 0)
    similar_customers = [(feature.loc[i, 'CustomerID'], round(row[1],3)) for i in similar_indices]
    similarity_scores.append((feature.loc[idx, 'CustomerID'], similar_customers))

In [None]:
lookalike_data = {
    'cust_id': [entry[0] for entry in similarity_scores],
    'lookalikes': [entry[1] for entry in similarity_scores]
}

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df=lookalike_df.head(20)
lookalike_df

Unnamed: 0,cust_id,lookalikes
0,C0001,"[(C0190, 0.83), (C0112, 0.83), (C0184, 0.83)]"
1,C0002,"[(C0134, 1.0), (C0088, 1.0), (C0103, 1.0)]"
2,C0003,"[(C0031, 0.524), (C0178, 0.524), (C0137, 0.524)]"
3,C0004,"[(C0075, 0.69), (C0041, 0.69), (C0175, 0.69)]"
4,C0005,"[(C0140, 0.879), (C0128, 0.879), (C0110, 0.879)]"
5,C0006,"[(C0168, 0.802), (C0171, 0.802), (C0011, 0.802)]"
6,C0007,"[(C0020, 0.838), (C0115, 0.838), (C0146, 0.838)]"
7,C0008,"[(C0175, 0.512), (C0139, 0.512), (C0124, 0.512)]"
8,C0009,"[(C0198, 0.858), (C0033, 0.858), (C0060, 0.858)]"
9,C0010,"[(C0121, 0.885), (C0009, 0.885), (C0083, 0.885)]"


In [None]:
# Exporting Lookalike Dataframe
lookalike_df.to_csv('Lookalike.csv',index=False)