# In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import csv

# Read the three input tables from disk
customers = pd.read_csv(r'/home/Customers.csv')
products = pd.read_csv(r'/home/Products.csv')
transactions = pd.read_csv(r'/home/Transactions.csv')

# Attach product and customer columns to every transaction
# (pandas' default inner joins on the shared key columns)
merged_customers = (
    transactions
    .merge(products, on='ProductID')
    .merge(customers, on='CustomerID')
)

# Build one text "document" per customer: the names of every product they
# bought, joined with single spaces
customer_profiles = (
    merged_customers
    .groupby('CustomerID')['ProductName']
    .apply(' '.join)
    .reset_index()
)

# Turn each customer's document into a TF-IDF weighted term vector
tfidf = TfidfVectorizer()
customer_tfidf = tfidf.fit_transform(customer_profiles['ProductName'])

# Pairwise cosine similarity between all customer vectors; row/column i
# corresponds to row i of customer_profiles
similarity_matrix = cosine_similarity(customer_tfidf)

# Create the Lookalike file
# Build {CustomerID: [(lookalike CustomerID, similarity score), ...]} with the
# top 3 lookalikes for CustomerIDs C0001 - C0020.
#
# Targets are picked by CustomerID, not by row position: customer_profiles only
# contains customers that appear in the merged transactions, so the first 20
# rows are not guaranteed to be C0001 - C0020 (and there may be fewer than 20).
# Target customers without a profile are skipped.
profile_ids = customer_profiles['CustomerID'].tolist()
target_ids = {'C{:04d}'.format(number) for number in range(1, 21)}

lookalike_dict = {}
for idx, customer_id in enumerate(profile_ids):
    if customer_id not in target_ids:
        continue
    # Exclude the customer itself by index instead of assuming it sorts first:
    # another customer with an identical profile ties on similarity and, with a
    # stable sort, could otherwise push "self" into the top-3 slice.
    sim_scores = [
        (other, score)
        for other, score in enumerate(similarity_matrix[idx])
        if other != idx
    ]
    sim_scores = sorted(sim_scores, key=lambda pair: -pair[1])[:3]  # Top 3 lookalikes
    # float() turns NumPy scalars into built-in floats so the values print as
    # plain numbers (e.g. 0.87) rather than a NumPy scalar repr.
    lookalikes = [
        (profile_ids[other], round(float(score), 2)) for other, score in sim_scores
    ]
    lookalike_dict[customer_id] = lookalikes

# Write the results: a header row, then one row per customer holding the
# CustomerID and the string form of its lookalike list
with open('Lookalike.csv', 'w', newline='') as out_file:
    lookalike_writer = csv.writer(out_file)
    lookalike_writer.writerow(['CustomerID', 'Lookalikes'])
    lookalike_writer.writerows(
        [customer, matches] for customer, matches in lookalike_dict.items()
    )
