In [1]:
# Import the required Libraries.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the ecommerce dataset (customers.csv, products.csv, transactions.csv)


customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

In [3]:
# first 5 records all three csv files.

customers_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
products_df.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [5]:
transactions_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [6]:
# Merge customers and transactions

customer_transactions = pd.merge(customers_df, transactions_df, on='CustomerID')

In [7]:
customer_transactions.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2,114.6,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3,412.62,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 00:01:00,2,614.94,307.47
3,C0001,Lawrence Carroll,South America,2022-07-10,T00445,P083,2024-05-07 03:11:44,2,911.44,455.72
4,C0001,Lawrence Carroll,South America,2022-07-10,T00436,P029,2024-11-02 17:04:16,3,1300.92,433.64


In [8]:
# Feature Engineering

## Creating features that represent behaviour of customers.

customer_transactions['TotalSpend'] = customer_transactions.groupby('CustomerID')['TotalValue'].transform('sum')
customer_transactions['AvgTransactionValue'] =  customer_transactions.groupby('CustomerID')['TotalValue'].transform('mean')
customer_transactions['NumTransactions'] = customer_transactions.groupby('CustomerID')['TransactionID'].transform('count')


In [9]:
# Select relevant features
features = customer_transactions[['CustomerID', 'Region', 'TotalSpend', 'AvgTransactionValue', 'NumTransactions']]
features = features.drop_duplicates()

In [10]:
features.head()

Unnamed: 0,CustomerID,Region,TotalSpend,AvgTransactionValue,NumTransactions
0,C0001,South America,3354.52,670.904,5
5,C0002,Asia,1862.74,465.685,4
9,C0003,South America,2725.38,681.345,4
13,C0004,South America,5354.88,669.36,8
21,C0005,Asia,2034.24,678.08,3


In [11]:
# installing the required library.
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [12]:
# Data Preprocessing

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['TotalSpend', 'AvgTransactionValue', 'NumTransactions']),
        ('cat', OneHotEncoder(), ['Region'])
    ])

# Fit and transform the data
features_processed = preprocessor.fit_transform(features.drop(columns='CustomerID'))

# Display the shape of the processed features
print(features_processed.shape)


(199, 7)


In [13]:
# Similarity measurement

from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity
similarity_matrix = cosine_similarity(features_processed)

# Display the shape of the similarity matrix
print(similarity_matrix.shape)


(199, 199)


In [14]:
# Model Development

# Getting top 3 lookalike 
def get_top_lookalikes(customer_id, similarity_matrix, features, top_n=3):
    
    idx = features[features['CustomerID'] == customer_id].index[0]
    similar_indices = similarity_matrix[idx].argsort()[-(top_n+2):-1]
    similar_customers = features.iloc[similar_indices]
    similarity_scores = similarity_matrix[idx][similar_indices]
    return similar_customers['CustomerID'].tolist(), similarity_scores.tolist()

# Generate lookalike recommendations for the first 20 customers
lookalike_results = {}
for customer_id in customers_df['CustomerID'][:20]:
    lookalikes, scores = get_top_lookalikes(customer_id, similarity_matrix, features)
    lookalike_results[customer_id] = list(zip(lookalikes, scores))

# Display the lookalike results
print(lookalike_results)


{'C0001': [('C0191', 0.9456659972710266), ('C0107', 0.9641690971679809), ('C0152', 0.9995103764905028), ('C0137', 0.9997616475513846)], 'C0002': [('C0187', 0.9330226466430889), ('C0171', 0.9387564178572098), ('C0158', 0.9711446209691191), ('C0168', 0.978264001321209)], 'C0003': [('C0111', 0.961936676891598), ('C0121', 0.9758346484079298), ('C0009', 0.9807961382416003), ('C0199', 0.9923026397421746)], 'C0004': [('C0062', 0.9552508669407361), ('C0009', 0.9615398807230746), ('C0060', 0.9756975141606177), ('C0198', 0.9889759898914686)], 'C0005': [('C0028', 0.8904704810597641), ('C0143', 0.9374958194515685), ('C0045', 0.938694631424901), ('C0136', 0.9739289643466053)], 'C0006': [('C0083', 0.9806376167577927), ('C0032', 0.9813844059285817), ('C0031', 0.9823592789671766), ('C0077', 0.9843121639383412)], 'C0007': [('C0189', 0.9286693503994602), ('C0030', 0.940253111053496), ('C0157', 0.956079123844235), ('C0094', 0.9788713351248212)], 'C0008': [('C0151', 0.9683568355953214), ('C0025', 0.981384

In [15]:
# Flatten the lookalike_results dictionary for DataFrame creation

flattened_results = []
for customer_id, lookalikes in lookalike_results.items():
    for lookalike, score in lookalikes:
        flattened_results.append([customer_id, lookalike, score])

In [16]:
# Create a DataFrame from the flattened results
lookalike_df = pd.DataFrame(flattened_results, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])

In [17]:
# Save to CSV file 
lookalike_df.to_csv('Gopal_Kate_Lookalike.csv', index=False)

In [18]:
lookalike_df.head()

Unnamed: 0,CustomerID,LookalikeID,SimilarityScore
0,C0001,C0191,0.945666
1,C0001,C0107,0.964169
2,C0001,C0152,0.99951
3,C0001,C0137,0.999762
4,C0002,C0187,0.933023
