In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Input tables: customer profiles, transaction log and product catalogue.
customers_file = 'Customers.csv'
transactions_file = 'Transactions.csv'
products_file = 'Products.csv'

# Read the three CSV files, in the same order as the paths above.
customers_df, transactions_df, products_df = (
    pd.read_csv(source)
    for source in (customers_file, transactions_file, products_file)
)

# Parse the date columns in place with pd.to_datetime.
for frame, date_column in (
    (customers_df, 'SignupDate'),
    (transactions_df, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

def _join_categories(categories):
    """Return the given category labels joined into one space-separated string."""
    return ' '.join(categories)


# Attach product details (including Category) to every transaction row.
transactions_merged = pd.merge(transactions_df, products_df, on='ProductID')

# One row per customer: total spend, total quantity, and the categories of
# all purchases concatenated into a single text document for TF-IDF.
per_customer = transactions_merged.groupby('CustomerID')
customer_transactions = per_customer.agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Category': _join_categories,
}).reset_index()

# Left join keeps every customer profile, including customers that have no
# rows in customer_transactions.
customers_combined = pd.merge(
    customers_df, customer_transactions, on='CustomerID', how='left'
)

# Customers without transactions come out of the left join with NaN in the
# aggregated columns; give them zero totals and an empty category string.
customers_combined = customers_combined.fillna(
    {'TotalValue': 0, 'Quantity': 0, 'Category': ''}
)

# TF-IDF encoding for product categories: each customer's space-joined
# category string is turned into one row of term weights.
tfidf = TfidfVectorizer()
category_tfidf = tfidf.fit_transform(customers_combined['Category'])

# Build the feature table, one row per customer (same index as
# customers_combined). The TF-IDF columns get string labels derived from the
# vectorizer's vocabulary so that all column labels share one type; the
# 'tfidf_' prefix keeps them from colliding with the numeric columns below.
features = pd.DataFrame(
    category_tfidf.toarray(),
    index=customers_combined.index,
    columns=[f'tfidf_{term}' for term in tfidf.get_feature_names_out()],
)

# Min-max scale the numeric columns to [0, 1]. When a column is constant
# (max == min) the plain formula would be 0/0 = NaN, which cosine_similarity
# rejects; such a column carries no information, so it is set to 0 instead.
for column in ('TotalValue', 'Quantity'):
    values = customers_combined[column].astype(float)
    lowest = values.min()
    spread = values.max() - lowest
    features[column] = (values - lowest) / spread if spread else 0.0

# Compute pairwise cosine similarity between all customers' feature rows.
# Row/column k of the matrix corresponds to the k-th row of `features`.
similarity_matrix = cosine_similarity(features)

# Customer IDs by row position, so matrix positions map straight to IDs
# without going through index labels.
customer_ids = customers_combined['CustomerID'].to_numpy()

# Generate the top-3 lookalikes for the first 20 customers.
lookalikes = {}
for position, customer_id in enumerate(customer_ids[:20]):
    # Drop the customer itself by position instead of assuming it sorts
    # first: ties, floating-point round-off, or an all-zero feature row
    # (whose cosine similarity with itself is 0) can move it out of the top
    # slot, which would otherwise list the customer as its own lookalike.
    candidates = [
        (other, score)
        for other, score in enumerate(similarity_matrix[position])
        if other != position
    ]
    # sorted() is stable, so equally similar customers keep row order.
    similar_customers = sorted(
        candidates, key=lambda pair: pair[1], reverse=True
    )[:3]
    # float() turns the NumPy scalar into a plain Python float so the list
    # written to CSV contains `0.982` rather than `np.float64(0.982)`.
    lookalikes[customer_id] = [
        (customer_ids[other], round(float(score), 3))
        for other, score in similar_customers
    ]

# Turn the {customer: [(lookalike, score), ...]} mapping into a two-column
# table, one row per customer.
column_names = ('CustomerID', 'Lookalikes')
records = [dict(zip(column_names, entry)) for entry in lookalikes.items()]
lookalikes_df = pd.DataFrame(records)

# Persist the table without the row index.
lookalikes_df.to_csv('Lookalike.csv', index=False)

# Output: preview the first rows.
print(lookalikes_df.head())


  CustomerID                                        Lookalikes
0      C0001  [(C0035, 0.982), (C0146, 0.977), (C0069, 0.963)]
1      C0002   [(C0144, 0.997), (C0133, 0.996), (C0134, 0.97)]
2      C0003  [(C0166, 0.997), (C0031, 0.993), (C0158, 0.987)]
3      C0004  [(C0113, 0.981), (C0017, 0.972), (C0041, 0.969)]
4      C0005   [(C0007, 0.999), (C0197, 0.998), (C0069, 0.96)]
