In [22]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np

In [23]:
# Read the three input tables (customers, products, transactions) from CSV
# files located in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [24]:
# Build one profile row per customer from the transaction history:
#   Price      -> mean of the customer's transaction prices
#   Quantity   -> total quantity across the customer's transactions
#   TotalValue -> total value across the customer's transactions
per_customer = transactions.groupby('CustomerID')
customer_profiles = per_customer.agg(
    Price=('Price', 'mean'),
    Quantity=('Quantity', 'sum'),
    TotalValue=('TotalValue', 'sum'),
).reset_index()

In [25]:
# One-hot encode each customer's Region and attach the indicator columns to
# the profiles. The left join keeps every profile row, matched on CustomerID.
region_source = customers[['CustomerID', 'Region']]
region_dummies = pd.get_dummies(region_source, columns=['Region'])
customer_profiles = pd.merge(
    customer_profiles,
    region_dummies,
    on='CustomerID',
    how='left',
)


In [26]:
# Standardize the numeric profile columns with StandardScaler so that no
# single column dominates the similarity computation purely through its scale.
# The region indicator columns are left untouched.
num_features = ['Price', 'Quantity', 'TotalValue']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_profiles[num_features])
customer_profiles[num_features] = scaled_values

In [27]:
# Minimum cosine similarity a candidate must exceed to be kept as a lookalike.
SIMILARITY_THRESHOLD = 0.9

# Pairwise cosine similarity between all customer profiles. The CustomerID
# column is an identifier, not a feature, so it is dropped first. Row/column
# k of the matrix corresponds to row k of customer_profiles.
feature_matrix = customer_profiles.drop(columns='CustomerID')
similarity_matrix = cosine_similarity(feature_matrix)

In [28]:
# Recommend up to 3 lookalikes for each of the first 20 customer profiles.
#
# A candidate qualifies only if its cosine similarity is strictly greater than
# SIMILARITY_THRESHOLD, so a customer may end up with fewer than 3 lookalikes
# (possibly none). The result maps CustomerID -> list of (CustomerID, score)
# tuples ordered from most to least similar.
TOP_N_LOOKALIKES = 3
N_TARGET_CUSTOMERS = 20

lookalike_map = {}
# Clamp to the number of available profiles so a dataset with fewer than
# N_TARGET_CUSTOMERS rows does not raise IndexError.
for idx in range(min(N_TARGET_CUSTOMERS, len(customer_profiles))):
    customer_id = customer_profiles.iloc[idx]['CustomerID']
    # Exclude the customer itself by position rather than assuming its own
    # entry sorts first. That assumption breaks when the self-score is removed
    # by the threshold (e.g. an all-zero feature vector has self-similarity 0)
    # or when another customer's score ties with / rounds above it; in those
    # cases slicing off the first element would drop a real match and could
    # list the customer as its own lookalike.
    similarities = [
        (i, score)
        for i, score in enumerate(similarity_matrix[idx])
        if i != idx and score > SIMILARITY_THRESHOLD
    ]
    # Stable sort: candidates with equal scores keep their row order.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    top_3 = [
        (customer_profiles.iloc[i]['CustomerID'], score)
        for i, score in similarities[:TOP_N_LOOKALIKES]
    ]
    lookalike_map[customer_id] = top_3

In [29]:
# Saving Lookalike.csv:
# One row per target customer. The 'Lookalikes' column holds the string form
# of that customer's list of (CustomerID, score) tuples.
lookalike_data = {'CustomerID': [], 'Lookalikes': []}
for cust_id, lookalikes in lookalike_map.items():
    lookalike_data['CustomerID'].append(cust_id)
    # Convert scores to built-in floats before stringifying. NumPy >= 2
    # renders its scalars as e.g. "np.float64(0.95)", which would otherwise
    # leak into the CSV and make the column hard to parse back.
    plain_lookalikes = [(other_id, float(score)) for other_id, score in lookalikes]
    lookalike_data['Lookalikes'].append(str(plain_lookalikes))

lookalike_df = pd.DataFrame(lookalike_data)
# index=False: the row index carries no information, so it is not written.
lookalike_df.to_csv('Lookalike.csv', index=False)