In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
# The directory is defined once so that relocating the data only requires
# editing DATA_DIR; the three resulting paths are the same as before.
DATA_DIR = 'C:\\Users\\Administrator\\Data Science'

customers = pd.read_csv(f'{DATA_DIR}\\Customers.csv')
products = pd.read_csv(f'{DATA_DIR}\\Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}\\Transactions.csv')

In [2]:
# Convert the date columns of both frames to pandas datetime values.
for frame, date_column in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_column] = pd.to_datetime(frame[date_column])


In [3]:
# Attach customer and product attributes to each transaction row.
# Both joins are inner joins (the pandas default), so a transaction whose
# CustomerID or ProductID has no match is dropped.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='inner')
    .merge(products, on='ProductID', how='inner')
)


In [4]:
def _most_frequent(values):
    """Return the most frequent non-null value in ``values``, or None if there is none.

    ``Series.mode()`` yields an empty Series when every value is missing, so
    indexing it with ``[0]`` would raise. Ties resolve to the first mode
    (pandas returns the modes in sorted order).
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# Per-customer aggregates. Named aggregation binds each output column name to
# its source column and function, so the names cannot drift out of sync with
# the aggregation spec the way a positional rename can.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),         # total amount spent
        AverageSpending=('TotalValue', 'mean'),      # average value per transaction
        TransactionCount=('TotalValue', 'count'),    # number of transactions
        TotalQuantity=('Quantity', 'sum'),           # total quantity purchased
        MostPurchasedCategory=('Category', _most_frequent),  # most purchased category
    )
    .reset_index()
)

In [5]:
# Combine customer attributes with the aggregated purchase features.
# Inner join: customers without a row in customer_features are not kept.
customer_profiles = pd.merge(customers, customer_features, on='CustomerID', how='inner')

In [6]:
# Step 2: One-hot encode the categorical variables (Region, MostPurchasedCategory).
# drop_first=True omits the first level of each encoded column.
customer_profiles = pd.get_dummies(
    data=customer_profiles,
    columns=['Region', 'MostPurchasedCategory'],
    drop_first=True,
)

In [7]:
# Step 3: Standardize the numerical features with StandardScaler.
numerical_features = [
    'TotalSpending',
    'AverageSpending',
    'TransactionCount',
    'TotalQuantity',
]
scaler = StandardScaler()
# Replace the raw columns with their scaled values in place.
customer_profiles[numerical_features] = scaler.fit_transform(
    customer_profiles[numerical_features]
)

In [8]:
# Step 4: Compute pairwise similarity using Cosine Similarity
# The feature matrix combines the scaled numeric columns with the one-hot
# columns produced in Step 2 (prefixed 'Region_' / 'MostPurchasedCategory_').
# Previously only the numeric columns were used, so the encoded categorical
# variables had no influence on the similarity scores.
encoded_features = [
    column for column in customer_profiles.columns
    if column.startswith(('Region_', 'MostPurchasedCategory_'))
]
similarity_features = numerical_features + encoded_features

# get_dummies may emit boolean columns; cast so the matrix is uniformly numeric.
feature_matrix = customer_profiles[similarity_features].astype(float)

customer_similarity = cosine_similarity(feature_matrix)
# Square frame labelled by CustomerID on both axes for lookup by customer.
customer_similarity_df = pd.DataFrame(
    customer_similarity,
    index=customer_profiles['CustomerID'],
    columns=customer_profiles['CustomerID'],
)

In [9]:
# Step 5: Find top 3 similar customers for each target customer (C0001 - C0020)
# Targets are selected by ID rather than by row position: the first 20 rows of
# customer_profiles are only C0001-C0020 if the customer file is sorted and
# none of those customers was dropped by the earlier inner merges.
# Target IDs that are absent from customer_profiles are skipped.
target_ids = [f'C{number:04d}' for number in range(1, 21)]
target_customers = customer_profiles.loc[
    customer_profiles['CustomerID'].isin(target_ids), 'CustomerID'
]
lookalike_map = {}

for customer_id in target_customers:
    # Similarity of every customer to the target, with the target itself removed.
    scores = customer_similarity_df[customer_id].drop(labels=customer_id)
    top_matches = scores.sort_values(ascending=False).head(3)
    # Stored as [lookalike_id, score] pairs, highest score first.
    lookalike_map[customer_id] = [
        [lookalike_id, float(score)] for lookalike_id, score in top_matches.items()
    ]

In [11]:
# Step 6: Flatten the lookalike map into rows and save them to a CSV file.
# Each row is [target customer id, lookalike id, similarity score].
lookalike_data = [
    [target_id, match[0], match[1]]
    for target_id, matches in lookalike_map.items()
    for match in matches
]

lookalike_df = pd.DataFrame(
    data=lookalike_data,
    columns=['CustomerID', 'LookalikeID', 'SimilarityScore'],
)
lookalike_df.to_csv(
    'C:\\Users\\Administrator\\Data Science\\Lookalike.csv',
    index=False,
)


In [12]:
# Report completion; Lookalike.csv is the file written by the to_csv call in the previous cell.
print("Lookalike model output saved to Lookalike.csv")

Lookalike model output saved to Lookalike.csv
