In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load the three input tables from CSV files in the current working directory.
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)


In [7]:
# Convert the date columns to pandas datetimes (overwrites the columns in place).
for frame, date_col in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_col] = pd.to_datetime(frame[date_col])


In [8]:
# Attach customer attributes, then product attributes, to every transaction.
# Both merges use pandas' default join type, so transactions without a matching
# CustomerID / ProductID are dropped.
transactions_with_customers = transactions.merge(customers, on="CustomerID")
merged_data = transactions_with_customers.merge(products, on="ProductID")

In [9]:
# One row per customer:
#   Category   - every purchased product category, space-separated, as one string
#   TotalValue - sum of TotalValue over the customer's transactions
#   Region     - the first Region value seen for the customer
customer_profiles = (
    merged_data
    .groupby('CustomerID')
    .agg(
        Category=('Category', ' '.join),
        TotalValue=('TotalValue', 'sum'),
        Region=('Region', 'first'),
    )
    .reset_index()
)

In [10]:
# Treat each customer's concatenated category string as a document and
# vectorise it with TF-IDF (default TfidfVectorizer settings).
category_docs = customer_profiles['Category']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(category_docs)

In [11]:
# Add spending and region information to the vector.
#
# These features are later stacked next to TF-IDF weights (which lie in [0, 1])
# and compared with cosine similarity, so they must be on a comparable scale:
#   * Raw TotalValue would dominate every vector and push all similarities
#     towards 1.0, so it is min-max scaled to [0, 1].
#   * Region is nominal; an integer category code would impose an arbitrary
#     order/distance between regions, so it is one-hot encoded instead.

# Integer region code kept as a column for reference / backward compatibility;
# it is intentionally NOT part of `additional_features`.
customer_profiles['RegionEncoded'] = customer_profiles['Region'].astype('category').cat.codes

# Min-max scale total spending; fall back to zeros when there is no spread
# (empty frame or all customers spent the same) to avoid dividing by zero.
spend = customer_profiles['TotalValue'].to_numpy(dtype=float)
if spend.size and spend.max() > spend.min():
    spend_scaled = (spend - spend.min()) / (spend.max() - spend.min())
else:
    spend_scaled = np.zeros_like(spend)

# One indicator column per distinct region.
region_onehot = pd.get_dummies(customer_profiles['Region'], dtype=float).to_numpy()

# 2-D array: first column is scaled spending, remaining columns are region indicators.
additional_features = np.column_stack([spend_scaled, region_onehot])

In [12]:
# Build one dense feature row per customer (TF-IDF columns followed by the
# extra spending/region columns) and compute all pairwise cosine similarities.
# NOTE(review): cosine similarity is sensitive to the relative scale of the
# columns; `additional_features` is used exactly as provided.
category_vectors = tfidf_matrix.toarray()
combined_features = np.hstack([category_vectors, additional_features])
similarity_matrix = cosine_similarity(combined_features)

In [13]:
# For each of the first 20 customers (in `customer_profiles` row order), collect
# the 3 most similar *other* customers as (CustomerID, similarity score) pairs.
#
# Result: lookalikes[customer_id] -> list of up to 3 (other_customer_id, score),
# ordered from most to least similar.
lookalikes = {}

# Look the IDs up once instead of materialising a full row per access.
customer_ids = customer_profiles['CustomerID'].tolist()

# Clamp so the loop does not run past the end when there are fewer than 20 customers.
for idx in range(min(20, len(customer_ids))):
    customer_id = customer_ids[idx]

    # Drop the customer's own entry by position. Relying on "self is always
    # ranked first" breaks on ties or floating-point rounding, which would
    # discard a real neighbour and keep the customer as their own lookalike.
    similarity_scores = [
        (i, score) for i, score in enumerate(similarity_matrix[idx]) if i != idx
    ]
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[:3]

    lookalikes[customer_id] = [(customer_ids[i], score) for i, score in sorted_scores]