In [23]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from datetime import datetime

customers = pd.read_csv(r"C:\Users\xdham\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Users\xdham\Downloads\Products.csv")
transactions = pd.read_csv(r"C:\Users\xdham\Downloads\Transactions.csv")

merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

if 'Price' not in merged_data.columns:
    merged_data['Price'] = merged_data['TotalValue'] / merged_data['Quantity']

merged_data['SignupDate'] = pd.to_datetime(merged_data['SignupDate'])
merged_data['TenureDays'] = (datetime.now() - merged_data['SignupDate']).dt.days

customer_profiles = merged_data.groupby('CustomerID').agg({
    'Region': 'first',
    'TenureDays': 'first',
    'Category': lambda x: ' '.join(x),
    'Price': 'mean',
    'TotalValue': 'sum',
    'Quantity': 'sum'
}).reset_index()

customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'], drop_first=True)

scaler = StandardScaler()
numerical_cols = ['Price', 'TotalValue', 'Quantity', 'TenureDays']
customer_profiles[numerical_cols] = scaler.fit_transform(customer_profiles[numerical_cols])

vectorizer = CountVectorizer()
category_features = vectorizer.fit_transform(customer_profiles['Category'])

final_features = np.hstack([customer_profiles.drop(['CustomerID', 'Category'], axis=1).values, category_features.toarray()])

similarity_matrix = cosine_similarity(final_features)

def recommend_similar_customers(input_customer_id, n=3):
    if input_customer_id not in customer_profiles['CustomerID'].values:
        raise ValueError(f"CustomerID {input_customer_id} not found.")
    
    customer_idx = customer_profiles[customer_profiles['CustomerID'] == input_customer_id].index[0]
    similarity_scores = similarity_matrix[customer_idx]
    
    similar_customers_idx = np.argsort(similarity_scores)[::-1][1:n+1]
    similar_customers = customer_profiles.iloc[similar_customers_idx]
    
    similar_customers_map = [(row['CustomerID'], similarity_scores[idx]) for idx, row in similar_customers.iterrows()]
    
    return similar_customers_map

lookalike_dict = {}
for customer_id in [f'C{i:04d}' for i in range(1, 21)]:
    try:
        similar_customers = recommend_similar_customers(customer_id, n=3)
        lookalike_dict[customer_id] = similar_customers
    except ValueError as e:
        print(e)

lookalike_data = []
for customer_id, similar_customers in lookalike_dict.items():
    for cust_id, score in similar_customers:
        lookalike_data.append([customer_id, cust_id, score])

lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'])

lookalike_df.to_csv('Lookalike.csv', index=False)

print(lookalike_df.head())



  CustomerID SimilarCustomerID  SimilarityScore
0      C0001             C0118         0.878134
1      C0001             C0184         0.870396
2      C0001             C0120         0.854405
3      C0002             C0159         0.954488
4      C0002             C0134         0.945575
