In [1]:
## Step 1: Import Necessary Libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
## Step 2: Load Datasets
# Read the three input tables from CSV files in the working directory,
# in this order: customers, products, transactions.
customers, products, transactions = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)

In [3]:
## Step 3: Preprocess and Clean Data
# Convert the date columns to pandas datetimes, overwriting each column in place.
for frame, date_column in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [4]:
# Attach product attributes to each transaction row with a left join on ProductID,
# so every transaction row is kept even when its product is absent from `products`.
transactions = pd.merge(transactions, products, how='left', on='ProductID')

In [5]:
# Aggregate transaction data by customer
def _most_frequent(values):
    """Return the most frequent non-null value in ``values``, or NaN if there is none.

    ``Series.mode()`` ignores nulls, so a group whose values are all null
    (e.g. a customer whose products were not found in the product table)
    yields an empty result; indexing that with ``[0]`` would raise. In that
    case NaN is returned instead. When several values tie, the first mode
    returned by pandas is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else float('nan')


# Named aggregation ties each output column to its source column and function,
# instead of relying on the positional order of a flattened MultiIndex.
customer_profiles = (
    transactions.groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),            # Total spending
        AverageSpending=('TotalValue', 'mean'),         # Average spending per transaction row
        TransactionCount=('ProductID', 'count'),        # Number of rows with a non-null ProductID
        FavoriteCategory=('Category', _most_frequent),  # Most purchased category
    )
    .reset_index()
)

In [6]:
# Start from the full customer table and left-join the per-customer aggregates,
# so customers without a matching aggregate row are kept with missing (NaN) values.
customer_profiles = pd.merge(customers, customer_profiles, how='left', on='CustomerID')

In [7]:
## Step 4: Feature Engineering
# Turn the categorical columns (Region, FavoriteCategory) into a dense array
# of one-hot indicator columns.
categorical_columns = ['Region', 'FavoriteCategory']
encoder = OneHotEncoder(sparse_output=False)
categorical_data = encoder.fit_transform(customer_profiles.loc[:, categorical_columns])

In [8]:
# Combine numerical and categorical features
# Customers with no aggregate row have NaN here after the left merge; treat them as zero activity.
numerical_data = customer_profiles[['TotalSpending', 'AverageSpending', 'TransactionCount']].fillna(0)

# Rescale each numeric column to [0, 1]. Cosine similarity is sensitive to the
# relative magnitude of the features: left unscaled, the spending columns are far
# larger than the 0/1 one-hot indicators, so Region and FavoriteCategory would have
# almost no influence on the score.
column_min = numerical_data.min()
# A constant column has range 0; dividing by 1 instead maps it to all zeros
# rather than producing NaN.
column_range = (numerical_data.max() - column_min).replace(0, 1)
numerical_scaled = (numerical_data - column_min) / column_range

# Give the one-hot columns readable string labels and the same index as the
# numeric block, so the column-wise concat lines rows up explicitly.
categorical_frame = pd.DataFrame(
    categorical_data,
    index=customer_profiles.index,
    columns=encoder.get_feature_names_out(),
)
features = pd.concat([numerical_scaled, categorical_frame], axis=1)

In [9]:
## Step 5: Compute Similarities
# Score every feature row against every other feature row: with Y left as None,
# the rows of `features` are compared with themselves, giving a square matrix
# whose entry [i, j] is the cosine similarity between rows i and j.
similarity_matrix = cosine_similarity(X=features, Y=None, dense_output=True)

In [10]:
## Step 6: Generate Lookalike Recommendations
# CustomerID column of the profile table; the k-th entry corresponds to
# row/column k of the similarity matrix.
customer_ids = customer_profiles.loc[:, 'CustomerID']
# Empty mapping that will hold the lookalike results, keyed by CustomerID.
lookalike_map = dict()

In [11]:
# Number of lookalikes to keep per customer.
TOP_N_LOOKALIKES = 3

# Plain list so that lookups below are strictly positional (matching the
# similarity matrix), independent of the Series' index labels.
customer_id_list = customer_ids.tolist()

for i, customer_id in enumerate(customer_id_list):
    # Pair every *other* customer's position with its similarity to customer i.
    # The customer itself is excluded by position rather than by assuming it
    # sorts first: another customer can tie at the top score (identical
    # features) or floating-point rounding can put the self-score slightly
    # below 1.0, and dropping "the first element" would then discard a real
    # match and keep the customer as their own lookalike.
    candidates = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    # Highest similarity first; the sort is stable, so ties keep matrix order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_map[customer_id] = [
        (customer_id_list[j], score) for j, score in candidates[:TOP_N_LOOKALIKES]
    ]

In [12]:
# Generate output for the first 20 customers in `customer_ids`.
# NOTE(review): these are expected to be C0001 to C0020, which relies on the row
# order of Customers.csv — confirm, since the selection is positional, not by ID.
columns = ['CustomerID', 
           'SimilarCustomerID1', 'Score1', 
           'SimilarCustomerID2', 'Score2', 
           'SimilarCustomerID3', 'Score3']

output = []
for customer_id in customer_ids.iloc[:20]:
    # One flat row per customer: the ID followed by (lookalike ID, score) pairs.
    row = [customer_id]
    for lookalike, score in lookalike_map[customer_id]:
        row.extend([lookalike, score])
    # If fewer lookalikes than expected are available (very small customer set),
    # pad the row so it still matches the column layout instead of failing
    # when the DataFrame is built.
    row.extend([None] * (len(columns) - len(row)))
    output.append(row)

## Step 7: Save Results

# Save lookalike results to CSV
lookalike_df = pd.DataFrame(output, columns=columns)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv generated successfully.")

Lookalike.csv generated successfully.
