In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load the three input datasets from the 'downloads' folder (relative to the
# notebook's working directory).
# Forward slashes are used as the separator because they are accepted on
# Windows, macOS and Linux, whereas a backslash is a path separator only on
# Windows and would be treated as part of the file name elsewhere.
customers = pd.read_csv('downloads/Customers.csv')
products = pd.read_csv('downloads/Products.csv')
transactions = pd.read_csv('downloads/Transactions.csv')


In [3]:
# Attach each transaction's customer attributes by joining on 'CustomerID'.
# The default join is an inner join, so only transactions whose CustomerID
# also appears in `customers` are kept.
customer_transactions = transactions.merge(customers, on='CustomerID')


In [5]:
# Enrich the customer/transaction rows with product details by joining on
# 'ProductID' (default inner join: rows without a matching product are dropped).
full_data = customer_transactions.merge(products, on='ProductID')


In [6]:
# Replace the 'Region' column with numeric indicator columns.
# drop_first=True omits the indicator for the first region level, which
# then acts as the implicit baseline.
full_data = pd.get_dummies(data=full_data, columns=['Region'], drop_first=True)


In [7]:
# Collapse the transaction-level rows into one summary row per customer.
customer_features = (
    full_data
    .groupby('CustomerID')
    .agg(
        # Number of transactions recorded for the customer
        total_transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        # Sum of 'TotalValue' over all of the customer's transactions
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        # Most frequent product category; mode() can return several tied
        # values, and [0] keeps the first of them
        most_purchased_category=pd.NamedAgg(
            column='Category',
            aggfunc=lambda categories: categories.mode()[0],
        ),
    )
    .reset_index()
)


In [8]:
# Count how many purchases each customer made in every product category:
# one row per customer, one column per category, 0 where nothing was bought.
category_freq = (
    full_data
    .groupby(['CustomerID', 'Category'])
    .size()
    .unstack(fill_value=0)
    .reset_index()  # turn CustomerID back into a regular column
)


In [9]:
# Average amount spent per transaction for each customer
# (total spend divided by the number of transactions).
customer_features['avg_spent_per_transaction'] = customer_features['total_spent'].div(
    customer_features['total_transactions']
)


In [11]:
# Numeric per-customer columns that feed the similarity calculation
features_for_similarity = customer_features.loc[
    :, ['total_transactions', 'total_spent', 'avg_spent_per_transaction']
]

# Standardize every feature (zero mean, unit variance) so that columns
# measured on larger scales do not dominate the similarity scores.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_for_similarity)


In [12]:
# Pairwise cosine similarity between all customers' scaled feature vectors
cosine_similarities = cosine_similarity(features_scaled)

# Label rows and columns with CustomerID so scores can be looked up by customer
cosine_similarity_df = pd.DataFrame(
    data=cosine_similarities,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)


In [13]:
# Function to get the most similar customers for each customer
def get_top_3_similar(customers, similarity_matrix, top_n=3):
    """Return the highest-scoring lookalikes for each requested customer.

    Parameters
    ----------
    customers : iterable
        Customer identifiers to look up. Each one must be a column label
        (and a row label) of ``similarity_matrix``.
    similarity_matrix : pandas.DataFrame
        Square matrix of pairwise similarity scores whose index and columns
        are customer identifiers.
    top_n : int, default 3
        How many lookalikes to return per customer. The default keeps the
        original "top 3" behaviour.

    Returns
    -------
    dict
        Maps each customer to a list of ``(other_customer, score)`` tuples,
        ordered from most to least similar. The list is shorter than
        ``top_n`` when fewer other customers are available.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    KeyError
        If a requested customer is not a label of ``similarity_matrix``.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    lookalike_map = {}
    for customer in customers:
        # Scores of this customer against everyone else; the customer's own
        # entry (self-similarity) is removed so it cannot be its own lookalike.
        similarity_scores = similarity_matrix[customer].drop(customer)

        # Keep the top_n highest scores, already sorted in descending order
        top_matches = similarity_scores.nlargest(top_n)

        # Store the similar customers together with their similarity scores
        lookalike_map[customer] = list(zip(top_matches.index, top_matches.values))

    return lookalike_map

# Build the lookalike map (top 3 matches each) for the first 20 customers only
lookalike_map = get_top_3_similar(
    customers=customer_features['CustomerID'].iloc[:20],
    similarity_matrix=cosine_similarity_df,
)


In [14]:
# Flatten the lookalike map into rows (one per customer/lookalike pair)
# and write the result to CSV.
lookalike_data = []

for customer, similar_customers in lookalike_map.items():
    # One [customer, lookalike, score] row for every match of this customer
    lookalike_data.extend(
        [customer, similar_customer, score]
        for similar_customer, score in similar_customers
    )

lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)
lookalike_df.to_csv('NagaDeekshitha_Gollamandula_Lookalike.csv', index=False)
