In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [4]:
customers = pd.read_csv("Customers.csv")

In [5]:
products = pd.read_csv("Products.csv")

In [6]:
transactions = pd.read_csv("Transactions.csv")

In [7]:

customer_data = transactions.merge(customers[['CustomerID', 'Region']], on='CustomerID', how='left')
customer_data = customer_data.merge(products[['ProductID', 'Category']], on='ProductID', how='left')

In [8]:
customer_profile = customer_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Region': 'first'
}).reset_index()

In [9]:
customer_profile['Region'] = customer_profile['Region'].astype('category').cat.codes

In [10]:
X = customer_profile.drop(columns='CustomerID')

In [11]:
cosine_sim = cosine_similarity(X)

In [12]:
def get_top_lookalikes(cust_id, cosine_sim_matrix, n=3):
    cust_index = customer_profile[customer_profile['CustomerID'] == cust_id].index[0]

    sim_scores = list(enumerate(cosine_sim_matrix[cust_index]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [score for score in sim_scores if score[0] != cust_index]

    top_similar_customers = sim_scores[:n]

    similar_customers = [(customer_profile.iloc[i[0]]['CustomerID'], i[1]) for i in top_similar_customers]

    return similar_customers

In [13]:
lookalikes = {}
for i in range(1, 21):
    cust_id = f'C{str(i).zfill(4)}'
    lookalikes[cust_id] = get_top_lookalikes(cust_id, cosine_sim)


lookalike_df = []
for cust_id, similar_customers in lookalikes.items():
    for similar_customer in similar_customers:
        lookalike_df.append([cust_id, similar_customer[0], similar_customer[1]])

lookalike_df = pd.DataFrame(lookalike_df, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])
lookalike_df.to_csv('Lookalike.csv', index=False)

print(lookalike_df.head())

  CustomerID LookalikeID  SimilarityScore
0      C0001       C0011              1.0
1      C0001       C0131              1.0
2      C0001       C0191              1.0
3      C0002       C0043              1.0
4      C0002       C0142              1.0


In [14]:

lookalikes_df = pd.DataFrame(

    [(k, v[0][0], v[0][1], v[1][0], v[1][1], v[2][0], v[2][1]) for k, v in lookalikes.items()],

    columns=['CustomerID', 'Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3']

)



lookalikes_df.to_csv('Lookalike.csv', index=False)

print("Lookalike results saved to Lookalike.csv")

Lookalike results saved to Lookalike.csv


In [16]:
pt = pd.read_csv("Lookalike.csv")

In [17]:
pt

Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0011,1.0,C0131,1.0,C0191,1.0
1,C0002,C0043,1.0,C0142,1.0,C0136,1.0
2,C0003,C0157,1.0,C0176,1.0,C0119,1.0
3,C0004,C0132,1.0,C0072,1.0,C0169,1.0
4,C0005,C0173,1.0,C0145,1.0,C0143,1.0
5,C0006,C0126,1.0,C0171,1.0,C0118,1.0
6,C0007,C0146,1.0,C0021,1.0,C0138,1.0
7,C0008,C0197,1.0,C0086,1.0,C0166,1.0
8,C0009,C0198,1.0,C0137,1.0,C0184,1.0
9,C0010,C0049,1.0,C0097,1.0,C0091,1.0
