# Lookalike Model

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the customer, product and transaction tables from CSV files located
# in the current working directory (paths are relative).
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [4]:
# Build one wide table: each transaction row is enriched with the matching
# customer and product attributes. Left joins keep every transaction even
# when its CustomerID or ProductID has no match in the lookup table.
merged_data = (
    transactions
    .merge(customers, how="left", on="CustomerID")
    .merge(products, how="left", on="ProductID")
)

In [6]:
# Preview the first rows of the merged table to check that the joins attached
# the customer and product columns.
merged_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [7]:
# Collapse the two price columns produced by the merges into a single `Price`.
# `Price_x` is the transactions-side column and `Price_y` the products-side
# column (pandas' default suffixes for overlapping names); the
# transactions-side value is the one kept.
# The guard plus out-of-place reassignment keep this cell re-runnable: without
# it a second execution fails with KeyError because `Price_x` is already gone.
if 'Price_x' in merged_data.columns:
    merged_data = (
        merged_data
        .assign(Price=merged_data['Price_x'])   # appended last, then sources dropped
        .drop(columns=['Price_x', 'Price_y'])
    )


In [8]:
# Feature engineering: collapse the transaction-level table to one row per
# customer summarising their purchase history.
feature_aggregations = {
    # total amount spent across all of the customer's transactions
    'total_spending': pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    # number of distinct transactions made by the customer
    'num_transactions': pd.NamedAgg(column='TransactionID', aggfunc='nunique'),
    # mean value of a single transaction row
    'avg_spend_per_transaction': pd.NamedAgg(column='TotalValue', aggfunc='mean'),
}
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(**feature_aggregations)
    .reset_index()   # turn CustomerID back into a regular column
)

In [10]:
# Inspect the per-customer aggregates (still in their original units).
customer_features.head()

Unnamed: 0,CustomerID,total_spending,num_transactions,avg_spend_per_transaction
0,C0001,3354.52,5,670.904
1,C0002,1862.74,4,465.685
2,C0003,2725.38,4,681.345
3,C0004,5354.88,8,669.36
4,C0005,2034.24,3,678.08


In [11]:
# Standardize the three numeric features with StandardScaler so they are on a
# comparable scale before similarity is computed.
feature_columns = ['total_spending', 'num_transactions', 'avg_spend_per_transaction']
raw_feature_values = customer_features[feature_columns]

scaler = StandardScaler()
normalized_features = scaler.fit_transform(raw_feature_values)

In [12]:
# Swap the raw feature columns in `customer_features` for their normalized
# counterparts, keeping the remaining columns (CustomerID) untouched.
feature_names = ['total_spending', 'num_transactions', 'avg_spend_per_transaction']
normalized_customer_features = pd.DataFrame(normalized_features, columns=feature_names)

# Both frames are concatenated column-wise and aligned on their row index.
customer_features = pd.concat(
    [customer_features.drop(columns=feature_names), normalized_customer_features],
    axis=1,
)

In [13]:
# Check that the feature columns now hold the standardized values.
customer_features.head()

Unnamed: 0,CustomerID,total_spending,num_transactions,avg_spend_per_transaction
0,C0001,-0.061701,-0.011458,-0.070263
1,C0002,-0.877744,-0.467494,-0.934933
2,C0003,-0.405857,-0.467494,-0.026271
3,C0004,1.032547,1.35665,-0.076769
4,C0005,-0.783929,-0.92353,-0.040028


In [14]:
# Pairwise cosine similarity between customers. The first column
# ('CustomerID') is skipped so only the feature columns are compared.
feature_matrix = customer_features.iloc[:, 1:]
similarity_matrix = cosine_similarity(feature_matrix)

# Wrap the similarity matrix in a DataFrame labelled by CustomerID on both
# axes so scores can be looked up by customer ID.
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Display customer C0001's similarity to the first few customers
print(similarity_df.loc['C0001'].head())


CustomerID
C0001    1.000000
C0002    0.973684
C0003    0.552358
C0004   -0.459392
C0005    0.540910
Name: C0001, dtype: float64


In [18]:
# Function to get the top-N lookalikes (3 by default) for a single customer
def get_top_lookalikes(customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Reads the module-level ``similarity_df`` and ranks the entries of the
    column labelled ``customer_id``.

    Parameters
    ----------
    customer_id
        Label of a column in ``similarity_df``.
    top_n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 yield ``[]``.

    Returns
    -------
    list of tuple
        ``(other_customer_id, similarity_score)`` pairs ordered from most to
        least similar. ``customer_id`` itself is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    scores = similarity_df[customer_id]

    # Remove the customer's own entry by label. Skipping "the first row after
    # sorting" is not reliable: floating-point rounding can leave the
    # self-similarity slightly below 1.0, and another customer can tie at 1.0,
    # in which case the customer would be reported as their own lookalike.
    others = scores.drop(labels=customer_id, errors='ignore')

    # Stable sort so that tied scores keep a deterministic order.
    ranked = others.sort_values(ascending=False, kind='mergesort')
    top = ranked.iloc[:max(top_n, 0)]
    return list(zip(top.index, top.values))

# Collect the top 3 lookalikes for the first 20 customers in customer_features
first_customers = customer_features['CustomerID'][:20]
lookalike_data = {cid: get_top_lookalikes(cid) for cid in first_customers}

# Tabulate the result: one row per customer, each cell holding a
# (lookalike_customer_id, similarity_score) tuple.
lookalike_columns = ['Lookalike_1', 'Lookalike_2', 'Lookalike_3']
lookalike_df = pd.DataFrame.from_dict(lookalike_data, orient='index', columns=lookalike_columns)

print(lookalike_df)

                       Lookalike_1                  Lookalike_2  \
C0001  (C0137, 0.9993600788417096)  (C0152, 0.9956575062125335)   
C0002   (C0029, 0.999637959673112)  (C0199, 0.9988672178177445)   
C0003  (C0005, 0.9998942821541472)   (C0178, 0.999565130088116)   
C0004  (C0067, 0.9999912751252354)  (C0021, 0.9996580546333909)   
C0005  (C0003, 0.9998942821541472)  (C0073, 0.9994945201465776)   
C0006  (C0079, 0.9999831359034949)  (C0117, 0.9984084372142995)   
C0007  (C0085, 0.9998000368261726)  (C0140, 0.9978130376911358)   
C0008  (C0084, 0.9957164129456433)  (C0194, 0.9934494400280589)   
C0009  (C0077, 0.9998324515798348)  (C0032, 0.9979902268753805)   
C0010  (C0029, 0.9997701755882139)  (C0025, 0.9993912464818911)   
C0011  (C0183, 0.9998294251645359)   (C0048, 0.999827988866904)   
C0012  (C0136, 0.9997460356266585)  (C0102, 0.9997366446793621)   
C0013  (C0045, 0.9999845161035829)  (C0143, 0.9999344393626719)   
C0014   (C0128, 0.998457997306867)  (C0058, 0.9954272208198851