In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the three raw tables from CSV.
# NOTE(review): the absolute '/content/' prefix is presumably the Google Colab
# working directory — confirm before running this notebook elsewhere.
customers=pd.read_csv('/content/Customers.csv')
products=pd.read_csv('/content/Products.csv')
transactions=pd.read_csv('/content/Transactions.csv')
# Preview the first rows of the transactions table.
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


**Combining the Customers and Products datasets with the Transactions dataset**

In [None]:
# Attach customer attributes to every transaction; the inner join keeps only
# transactions whose CustomerID is present in the customers table.
transactions_with_customer = transactions.merge(
    customers,
    how='inner',
    on='CustomerID',
)

# Attach product attributes the same way. Both inputs carry a `Price` column,
# so pandas disambiguates them as `Price_x` (left) and `Price_y` (right).
df = transactions_with_customer.merge(
    products,
    how='inner',
    on='ProductID',
)
df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [None]:
# Parse both date columns and truncate them to midnight so the gap below is
# counted in whole calendar days. `.dt.normalize()` keeps the datetime64 dtype,
# whereas `.dt.date` would produce object-dtype columns of Python `date`
# values that must be parsed again before they can be subtracted.
df['TransactionDate'] = pd.to_datetime(df['TransactionDate']).dt.normalize()
df['SignupDate'] = pd.to_datetime(df['SignupDate']).dt.normalize()

# Signed number of days from signup to transaction; the value is negative
# when a transaction is dated before the customer's signup.
df['days_difference'] = (df['TransactionDate'] - df['SignupDate']).dt.days

In [None]:
# Parse each date column once; values that cannot be parsed become NaT
# because of errors='coerce'.
signup_ts = pd.to_datetime(df['SignupDate'], errors='coerce')
transaction_ts = pd.to_datetime(df['TransactionDate'], errors='coerce')

df['SignupDate'] = signup_ts
df['TransactionDate'] = transaction_ts

# Calendar year and month of the signup date.
df['SignupYear'] = signup_ts.dt.year
df['SignupMonth'] = signup_ts.dt.month

# Calendar year and month of the transaction date.
df['TransactionYear'] = transaction_ts.dt.year
df['TransactionMonth'] = transaction_ts.dt.month

In [None]:
# Discard the product-side price column and the customer name.
# NOTE(review): Price_y appears to duplicate Price_x in the preview rows —
# confirm it holds for the whole table.
df = df.drop(['Price_y', 'CustomerName'], axis=1)

In [None]:
# Keep only the magnitude of the signup-to-transaction gap (sign is dropped).
df['days_difference'] = df["days_difference"].abs()

# Split ProductName at its first space: the leading word goes to `Brand`,
# everything after it goes to `Product`.
brand_and_product = df['ProductName'].str.split(' ', n=1, expand=True)
df[['Brand', 'Product']] = brand_and_product

**Making new features based on EDA**

In [None]:
def _most_common(values):
    """Return the most frequent value of a Series (first entry of its mode)."""
    return values.mode()[0]


# One aggregation rule per column; output column names equal the input names.
aggregation_spec = {
    'TotalValue': 'mean',          # average transaction value
    'Quantity': 'sum',             # total units bought
    'TransactionDate': 'min',      # earliest transaction
    'SignupDate': 'min',           # earliest signup date seen for the customer
    'Region': 'first',
    'Category': _most_common,
    'ProductName': _most_common,
}

per_customer = df.groupby('CustomerID').agg(aggregation_spec)
customer_features = per_customer.reset_index()

# Absolute number of days between signup and the earliest transaction.
first_purchase_gap = customer_features['TransactionDate'] - customer_features['SignupDate']
customer_features['DaysToFirstTransaction'] = first_purchase_gap.dt.days.abs()

In [None]:
# Check the dimensions (rows, columns) of the per-customer feature table.
customer_features.shape

(199, 9)

In [None]:
# Preview the first rows of the per-customer feature table.
customer_features.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,TransactionDate,SignupDate,Region,Category,ProductName,DaysToFirstTransaction
0,C0001,670.904,12,2024-01-19,2022-07-10,South America,Electronics,ActiveWear Smartwatch,558
1,C0002,465.685,10,2024-02-28,2022-02-13,Asia,Clothing,BookWorld Cookware Set,745
2,C0003,681.345,14,2024-02-18,2024-03-07,South America,Home Decor,ActiveWear Cookware Set,18
3,C0004,669.36,23,2024-02-28,2022-10-09,South America,Books,ActiveWear Cookware Set,507
4,C0005,678.08,7,2024-03-15,2022-08-15,Asia,Electronics,ActiveWear Cookware Set,578


**Preprocessing**

Scaling

Encoding



In [None]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Columns that are scaled vs. columns that are one-hot encoded.
numeric_columns = ['TotalValue', 'Quantity', 'DaysToFirstTransaction']
categorical_columns = ['Region', 'Category', 'ProductName']

# The identifier is not a feature, so it is left out of the model inputs.
features = customer_features.drop(columns=['CustomerID'])

# Rescale the numeric columns with a min-max scaler.
scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(features[numeric_columns])

# One-hot encode the categorical columns and densify the encoder output.
encoder = OneHotEncoder()
categorical_features = encoder.fit_transform(features[categorical_columns]).toarray()

# Place the numeric and encoded blocks side by side: one row per customer.
final_features = np.concatenate([numerical_features, categorical_features], axis=1)


**Computing similarity**

In [None]:
# Pairwise cosine similarity between every pair of customer feature rows.
cosine_sim = cosine_similarity(final_features)

# Label both axes with the customer IDs so rows and columns can be looked up
# by ID.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)
cosine_sim_df


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,0.190472,0.390826,0.465066,0.451416,0.405421,0.729940,0.098967,0.366711,0.148781,...,0.379370,0.716373,0.166433,0.387766,0.423167,0.255054,0.422355,0.194950,0.448099,0.186686
C0002,0.190472,1.000000,0.078970,0.198196,0.453175,0.095192,0.461691,0.074434,0.365195,0.430410,...,0.066334,0.164943,0.427704,0.078745,0.125206,0.248758,0.129054,0.481885,0.188015,0.686781
C0003,0.390826,0.078970,1.000000,0.685273,0.375595,0.427814,0.112496,0.405510,0.035491,0.085472,...,0.399873,0.370522,0.115716,0.106895,0.705623,0.403397,0.097851,0.051692,0.079769,0.151162
C0004,0.465066,0.198196,0.685273,1.000000,0.442469,0.679124,0.203975,0.153211,0.070831,0.170036,...,0.671007,0.433035,0.449098,0.412054,0.451692,0.263110,0.153622,0.179344,0.182044,0.218046
C0005,0.451416,0.453175,0.375595,0.442469,1.000000,0.123557,0.730591,0.072363,0.069878,0.136932,...,0.078629,0.433985,0.434564,0.084795,0.126542,0.248829,0.416438,0.199742,0.444524,0.437736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,0.255054,0.248758,0.403397,0.263110,0.248829,0.208866,0.283601,0.368238,0.360989,0.441990,...,0.131832,0.213052,0.232602,0.138536,0.441694,1.000000,0.446507,0.489397,0.716743,0.261553
C0197,0.422355,0.129054,0.097851,0.153622,0.416438,0.127199,0.428143,0.083862,0.361743,0.401506,...,0.083118,0.405685,0.130383,0.088410,0.409345,0.446507,1.000000,0.395578,0.694825,0.152246
C0198,0.194950,0.481885,0.051692,0.179344,0.199742,0.072155,0.214910,0.035314,0.642158,0.686270,...,0.044878,0.175615,0.152586,0.053164,0.101421,0.489397,0.395578,1.000000,0.462106,0.399774
C0199,0.448099,0.188015,0.079769,0.182044,0.444524,0.098967,0.453247,0.072341,0.366517,0.423305,...,0.067317,0.432522,0.145920,0.077511,0.119471,0.716743,0.694825,0.462106,1.000000,0.155922


**Computing the top 3 most similar customers for each customer**

In [None]:
def get_top_similar(customers, similarity_df, top_n=3):
    """Return the ``top_n`` most similar customers for each requested customer.

    Parameters
    ----------
    customers : iterable
        Customer IDs to look up. Each ID must be a label on both axes of
        ``similarity_df``.
    similarity_df : pandas.DataFrame
        Similarity matrix labelled by customer ID on its index and columns.
    top_n : int, default 3
        Number of lookalikes to keep per customer.

    Returns
    -------
    dict
        Maps each requested ID to a list of ``(other_id, score)`` tuples in
        descending score order. Scores are builtin ``float`` values rather
        than NumPy scalars, so stringifying the lists (for example when they
        are written to CSV) yields plain numbers instead of
        ``np.float64(...)`` wrappers.

    Raises
    ------
    KeyError
        If a requested ID is not a label of ``similarity_df``.
    """
    lookalike_data = {}
    for cust_id in customers:
        # Take this customer's row and drop the self-similarity entry so a
        # customer is never reported as its own lookalike.
        top_similar = (
            similarity_df.loc[cust_id]
            .drop(cust_id)
            .sort_values(ascending=False)
            .head(top_n)
        )
        lookalike_data[cust_id] = [
            (other_id, float(score)) for other_id, score in top_similar.items()
        ]
    return lookalike_data

# Restrict the lookup to the first 20 customer IDs of the feature table.
customer_subset = customer_features['CustomerID'].head(20)

# Top lookalikes (default top_n) for each customer in the subset.
lookalikes = get_top_similar(customer_subset, cosine_sim_df)

**Saving to file**

In [None]:
# One record per customer: the ID plus its list of (lookalike_id, score) pairs.
lookalike_rows = [
    {'CustomerID': customer_id, 'Lookalikes': matches}
    for customer_id, matches in lookalikes.items()
]
lookalike_df = pd.DataFrame(lookalike_rows)

# Write the table without the positional index column.
lookalike_df.to_csv('Lookalike9.csv', index=False)

**Inspecting**

In [None]:
# Show the engineered feature row for customer C0001.
customer_features.loc[customer_features['CustomerID'] == 'C0001']


Unnamed: 0,CustomerID,TotalValue,Quantity,TransactionDate,SignupDate,Region,Category,ProductName,DaysToFirstTransaction
0,C0001,670.904,12,2024-01-19,2022-07-10,South America,Electronics,ActiveWear Smartwatch,558


In [None]:
# Show the engineered feature row for customer C0184.
# NOTE(review): presumably inspected as a lookalike candidate — confirm
# against the saved lookalike list.
customer_features.loc[customer_features['CustomerID'] == 'C0184']

Unnamed: 0,CustomerID,TotalValue,Quantity,TransactionDate,SignupDate,Region,Category,ProductName,DaysToFirstTransaction
182,C0184,484.74,11,2024-01-03,2022-05-13,South America,Electronics,ActiveWear Smartwatch,600


In [None]:
# Show the engineered feature row for customer C0181.
# NOTE(review): presumably inspected as a lookalike candidate — confirm
# against the saved lookalike list.
customer_features.loc[customer_features['CustomerID'] == 'C0181']

Unnamed: 0,CustomerID,TotalValue,Quantity,TransactionDate,SignupDate,Region,Category,ProductName,DaysToFirstTransaction
179,C0181,557.933333,15,2024-04-12,2023-11-16,South America,Electronics,ActiveWear Smartwatch,148


In [None]:
# Show the engineered feature row for customer C0112.
# NOTE(review): presumably inspected as a lookalike candidate — confirm
# against the saved lookalike list.
customer_features.loc[customer_features['CustomerID'] == 'C0112']