**Task 2: Lookalike Model**


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [4]:
# Load the three source tables from CSV files in the working directory.
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

In [6]:
# Parse the date columns with pd.to_datetime so later steps can treat them as datetimes.
transactions['TransactionDate'] = transactions['TransactionDate'].pipe(pd.to_datetime)
customers['SignupDate'] = customers['SignupDate'].pipe(pd.to_datetime)

In [7]:
# Join each transaction to its customer row and its product row. Both joins are
# inner joins, so a transaction without a matching customer or product is dropped.
merged_data = (
    transactions
    .merge(customers, how="inner", on="CustomerID")
    .merge(products, how="inner", on="ProductID")
)


In [8]:
# Build one text "profile" string per transaction row from customer and product attributes.
# Series.str.cat with na_rep="" is used instead of chained `+` so that a missing value
# in any single column yields a shorter string rather than NaN for the whole row
# (a NaN here would make the later ' '.join over each customer's rows raise TypeError).
# NOTE(review): CustomerName is part of the text, so name tokens also influence the
# TF-IDF similarity computed from this column — confirm that is intended.
merged_data['ProfileDescription'] = merged_data['CustomerName'].str.cat(
    merged_data[['Region', 'Category', 'ProductName']],
    sep=" ",
    na_rep="",
)

In [10]:
# Collapse each customer's per-transaction descriptions into one space-separated document.
customer_profiles = (
    merged_data
    .groupby('CustomerID')['ProfileDescription']
    .apply(' '.join)
    .reset_index()
)

# Vectorize the per-customer documents using TF-IDF (one matrix row per customer).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(customer_profiles['ProfileDescription'])

# Pairwise cosine similarity between all customer vectors; row/column i
# corresponds to row i of customer_profiles.
similarity_matrix = cosine_similarity(tfidf_matrix)

In [11]:
# Customer IDs in the same row order as customer_profiles, plus an empty mapping
# that the following loop fills with lookalikes for the first 20 customers.
customer_ids = customer_profiles.loc[:, 'CustomerID']
lookalike_results = dict()

In [12]:
# For each of the first 20 customers, record the 3 most similar *other* customers.
for i, customer_id in enumerate(customer_ids[:20]):  # Limit to first 20 customers
    row_scores = similarity_matrix[i]
    # Exclude the customer's own row by position rather than by slicing off the
    # first sorted entry: when another customer ties with the self-score (e.g. an
    # identical profile, or an all-zero vector), self is not guaranteed to sort
    # first, and slicing would drop the wrong row and could list the customer as
    # their own lookalike.
    candidate_positions = (idx for idx in range(len(row_scores)) if idx != i)
    # sorted() is stable, so equally-scored candidates keep ascending row order.
    top_positions = sorted(
        candidate_positions,
        key=lambda idx: row_scores[idx],
        reverse=True,
    )[:3]
    # Map customer to (lookalike customer id, similarity score) pairs.
    # .iloc makes the lookup positional, matching the matrix row/column positions.
    lookalike_results[customer_id] = [
        (customer_ids.iloc[idx], row_scores[idx]) for idx in top_positions
    ]

In [13]:
# Create a DataFrame with one row per source customer: the customer id and a
# string rendering of its list of (lookalike id, score) pairs.
# Scores are converted to built-in floats before stringifying: str() of a list
# of tuples uses repr() for the elements, and under NumPy >= 2 the repr of a
# NumPy scalar looks like "np.float64(0.42)", which would otherwise end up in
# the exported text instead of a plain number.
lookalike_df = pd.DataFrame([
    {
        "cust_id": cust_id,
        "lookalikes": str([(match_id, float(score)) for match_id, score in lookalikes]),
    }
    for cust_id, lookalikes in lookalike_results.items()
])

In [14]:
# Write the lookalike table to a CSV file; the DataFrame index is not written.
lookalike_csv_path = 'Lookalike.csv'
lookalike_df.to_csv(path_or_buf=lookalike_csv_path, index=False)