In [31]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [32]:
# Load the transaction, product and customer CSV files into DataFrames.
transactions, products, customers = map(
    pd.read_csv,
    ("Transactions.csv", "Products.csv", "Customers.csv"),
)

In [33]:
# Enrich every transaction with its customer and product attributes.
# Left joins keep all transaction rows even when a lookup has no match.
# A Price column exists on both sides of the product join, so pandas
# suffixes the two copies as Price_x / Price_y.
merged_data = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)

In [34]:
# Inspect the merged schema; Price_x / Price_y are the suffixed copies of a
# Price column that was present on both sides of a merge.
merged_data.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')

# Next, aggregate the merged columns into customer-level features that can represent each CustomerID

In [35]:
# Customer-level features: one row per CustomerID summarising how much,
# how often and how broadly (across categories) the customer has bought.
transactions_by_customer = merged_data.groupby('CustomerID')
customer_features = transactions_by_customer.agg(
    TotalSpending=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    NumTransactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    UniqueCategories=pd.NamedAgg(column='Category', aggfunc='nunique'),
    AvgTransactionValue=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
).reset_index()


In [36]:
# Confirm the engineered feature names (CustomerID plus the four aggregates).
customer_features.columns

Index(['CustomerID', 'TotalSpending', 'NumTransactions', 'UniqueCategories',
       'AvgTransactionValue'],
      dtype='object')

In [37]:
# Preview the per-customer feature table.
customer_features

Unnamed: 0,CustomerID,TotalSpending,NumTransactions,UniqueCategories,AvgTransactionValue
0,C0001,3354.52,5,3,670.904000
1,C0002,1862.74,4,2,465.685000
2,C0003,2725.38,4,3,681.345000
3,C0004,5354.88,8,3,669.360000
4,C0005,2034.24,3,2,678.080000
...,...,...,...,...,...
194,C0196,4982.88,4,3,1245.720000
195,C0197,1928.65,3,2,642.883333
196,C0198,931.83,2,2,465.915000
197,C0199,1979.28,4,2,494.820000


In [38]:
# Purchase counts per customer and product category, pivoted so that each
# category becomes a column; customer/category pairs with no rows get 0.
purchases_per_category = merged_data.groupby(['CustomerID', 'Category']).size()
category_frequency = purchases_per_category.unstack(level='Category', fill_value=0)

In [39]:
# Preview the CustomerID x Category purchase-count table.
category_frequency

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,1,0,3,1
C0002,0,2,0,2
C0003,0,1,1,2
C0004,3,0,2,3
C0005,0,0,2,1
...,...,...,...,...
C0196,1,1,0,2
C0197,0,0,2,1
C0198,0,1,1,0
C0199,0,0,2,2


In [40]:
# Join the aggregate features with the per-category counts and key the
# resulting table by CustomerID.
features = (
    customer_features
    .merge(category_frequency, how='left', on='CustomerID')
    .set_index('CustomerID')
)

In [41]:
features.corr(numeric_only=True)
# Most feature pairs are only weakly correlated, so they contribute largely
# independent information about a customer. The notable exceptions are
# TotalSpending vs NumTransactions (~0.79) and NumTransactions vs
# UniqueCategories (~0.71), which overlap to a noticeable degree.

Unnamed: 0,TotalSpending,NumTransactions,UniqueCategories,AvgTransactionValue,Books,Clothing,Electronics,Home Decor
TotalSpending,1.0,0.79472,0.593816,0.554559,0.437343,0.376038,0.457822,0.300105
NumTransactions,0.79472,1.0,0.711997,0.023322,0.553201,0.500576,0.533263,0.390397
UniqueCategories,0.593816,0.711997,1.0,0.072442,0.239388,0.410471,0.391421,0.366959
AvgTransactionValue,0.554559,0.023322,0.072442,1.0,0.016215,-0.045777,0.088475,-0.013333
Books,0.437343,0.553201,0.239388,0.016215,1.0,0.027356,0.074784,-0.010079
Clothing,0.376038,0.500576,0.410471,-0.045777,0.027356,1.0,0.086261,-0.115409
Electronics,0.457822,0.533263,0.391421,0.088475,0.074784,0.086261,1.0,-0.107737
Home Decor,0.300105,0.390397,0.366959,-0.013333,-0.010079,-0.115409,-0.107737,1.0


In [42]:
# Standardise the feature columns so that no single feature dominates the
# similarity computation purely because of its scale.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

In [48]:
# Pairwise cosine similarity between customers, labelled on both axes
# with the CustomerID index of the feature table.
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=features.index,
    columns=features.index,
)

In [49]:
# Function to get the top-N most similar customers (3 by default)
def get_similar_customers(customer_id, similarity_df, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to look up. It must be a row label and a
        column label of ``similarity_df``.
    similarity_df : pandas.DataFrame
        Square customer-by-customer similarity matrix whose rows and columns
        are labelled with customer IDs.
    top_n : int, default 3
        Number of lookalikes to return. Values below 1 yield an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        A frame with the (reset) index column followed by a
        ``SimilarityScore`` column, ordered from most to least similar.
        If ``customer_id`` is not in the index, a "not found" message string
        is returned instead.
    """
    if customer_id not in similarity_df.index:
        return f"CustomerID {customer_id} not found."
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: another customer can tie at the top score, in which case a
    # positional slice would return the customer as their own lookalike.
    scores = similarity_df[customer_id].drop(labels=customer_id, errors='ignore')
    similar_customers = scores.sort_values(ascending=False).iloc[:max(top_n, 0)]
    return similar_customers.reset_index().rename(columns={customer_id: 'SimilarityScore'})

In [50]:
# Example usage: look up the closest lookalikes (default: top 3) of one customer
customer_id = 'C0006'  # any CustomerID present in similarity_df can be used here
top_similar_customers = get_similar_customers(
    customer_id=customer_id,
    similarity_df=similarity_df,
)

In [51]:
# Show the lookalikes found for the example customer.
print(top_similar_customers)

  CustomerID  SimilarityScore
0      C0185         0.882078
1      C0196         0.850330
2      C0187         0.836320


In [None]:
# Write 'lookalike.csv': one row per customer, for the first
# MAX_LOOKALIKE_CUSTOMERS entries of customers['CustomerID'], mapping the
# customer to their top 3 lookalikes as [CustomerID, SimilarityScore] pairs.
import csv

# Number of customers (taken in file order) to export.
MAX_LOOKALIKE_CUSTOMERS = 20

with open('lookalike.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Header row
    writer.writerow(['CustomerID', 'Lookalikes'])

    for cust_id in customers['CustomerID'].head(MAX_LOOKALIKE_CUSTOMERS):
        if cust_id in similarity_df.index:
            lookalikes = get_similar_customers(cust_id, similarity_df).values.tolist()
        else:
            # A customer without any transaction has no row in similarity_df;
            # get_similar_customers would return a message string, so record
            # an empty lookalike list instead of failing on `.values`.
            lookalikes = []
        # The list is serialised with its Python string representation.
        writer.writerow([cust_id, lookalikes])

print("CSV file 'lookalike.csv' has been created.")


CSV file 'lookalike.csv' has been created.
