In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import accuracy_score


In [22]:
customers = pd.read_csv(r"C:\Users\khira\Downloads\Customers.csv")
transactions = pd.read_csv(r"C:\Users\khira\Downloads\Transactions.csv")
products = pd.read_csv(r"C:\Users\khira\Downloads\Products.csv")

In [23]:
transactions = transactions.merge(customers, on='CustomerID', how='left')
transactions = transactions.merge(products, on='ProductID', how='left')

In [24]:
if not pd.api.types.is_numeric_dtype(transactions['TotalValue']):
    transactions['TotalValue'] = pd.to_numeric(transactions['TotalValue'], errors='coerce')

In [25]:
if 'Price' not in transactions.columns or not pd.api.types.is_numeric_dtype(transactions['Price']):
    transactions['Price'] = 0  # Default to 0 if missing or non-numeric

In [26]:
customer_features = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spending
    'Quantity': 'sum',  # Total products purchased
    'Price': 'mean',  # Average price of products purchased
    'Category': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'  # Most frequent category
}).reset_index()

In [27]:
customer_features = pd.get_dummies(customer_features, columns=['Category'], drop_first=True)

In [28]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop(columns=['CustomerID']))

In [29]:
similarity_matrix = cosine_similarity(scaled_features)

In [30]:
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

In [31]:
lookalikes = {}
for customer_id in customer_features['CustomerID']:
    similar_customers = similarity_df[customer_id].sort_values(ascending=False)[1:4]  # Exclude self (highest similarity)
    lookalikes[customer_id] = [(similar_id, round(score, 4)) for similar_id, score in similar_customers.items()]


In [32]:
lookalike_data = []
for customer_id, similar_list in lookalikes.items():
    for similar_id, score in similar_list:
        lookalike_data.append({'CustomerID': customer_id, 'SimilarCustomerID': similar_id, 'SimilarityScore': score})


In [33]:
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("FirstName_LastName_Lookalike.csv", index=False)

In [34]:
first_20_lookalikes = lookalike_df[lookalike_df['CustomerID'].isin(customers['CustomerID'][:20])]
print(first_20_lookalikes)

   CustomerID SimilarCustomerID  SimilarityScore
0       C0001             C0026           0.9986
1       C0001             C0184           0.9967
2       C0001             C0127           0.9962
3       C0002             C0029           0.9999
4       C0002             C0088           0.9980
5       C0002             C0129           0.9905
6       C0003             C0160           0.9987
7       C0003             C0086           0.9970
8       C0003             C0038           0.9970
9       C0004             C0175           0.9936
10      C0004             C0017           0.9923
11      C0004             C0075           0.9905
12      C0005             C0186           0.9983
13      C0005             C0192           0.9979
14      C0005             C0112           0.9976
15      C0006             C0117           0.9948
16      C0006             C0064           0.9823
17      C0006             C0187           0.9772
18      C0007             C0146           1.0000
19      C0007       

In [35]:
print("Lookalike Model completed. Results saved to 'FirstName_LastName_Lookalike.csv'.")

Lookalike Model completed. Results saved to 'FirstName_LastName_Lookalike.csv'.
