In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the datasets
customers = pd.read_csv("C:/Users/deepa/Downloads/Customers.csv")
products = pd.read_csv("C:/Users/deepa/Downloads/Products.csv")
transactions = pd.read_csv("C:/Users/deepa/Downloads/Transactions.csv")

In [20]:
# Preprocessing data
def preprocess_data():
    # Convert date columns to datetime
    customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
    transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

    # Ensure 'Price' is calculated
    if 'Price' not in transactions.columns:
        print("Calculating 'Price' column...")
        # Replace zero or missing Quantity with a small constant to avoid division errors
        transactions['Quantity'] = transactions['Quantity'].replace(0, np.nan)
        transactions['Price'] = transactions['TotalValue'] / transactions['Quantity']
        transactions['Price'] = transactions['Price'].fillna(0)  # Handle any NaN values

    # Debugging: Check if 'Price' column exists in transactions
    print("Transactions columns after calculating 'Price':", transactions.columns)

    # Merge datasets
    merged_data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

    # Rename Price_x for clarity
    merged_data = merged_data.rename(columns={'Price_x': 'TransactionPrice', 'Price_y': 'ProductPrice'})

    # Debugging: Check the columns in merged_data
    print("Columns in merged_data:", merged_data.columns)

    # Aggregate customer-level data
    customer_features = merged_data.groupby('CustomerID').agg({
        'TotalValue': 'sum',            # Total spending by the customer
        'Quantity': 'sum',              # Total quantity purchased
        'TransactionPrice': 'mean',     # Average transaction price
        'TransactionID': 'count'        # Number of transactions
    }).rename(columns={
        'TotalValue': 'TotalSpent',
        'Quantity': 'TotalQuantity',
        'TransactionPrice': 'AvgPrice',
        'TransactionID': 'TransactionCount'
    })

    # Debugging: Check the first few rows of customer_features
    print("Customer features after aggregation:", customer_features.head())

    # Add profile features
    customer_features = customer_features.merge(customers.set_index('CustomerID'), on='CustomerID')

    # Encode categorical data (Region)
    customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)
    
    return customer_features
customer_features = preprocess_data()

Transactions columns after calculating 'Price': Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')
Columns in merged_data: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'TransactionPrice', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'ProductPrice'],
      dtype='object')
Customer features after aggregation:             TotalSpent  TotalQuantity    AvgPrice  TransactionCount
CustomerID                                                         
C0001          3354.52             12  278.334000                 5
C0002          1862.74             10  208.920000                 4
C0003          2725.38             14  195.707500                 4
C0004          5354.88             23  240.636250                 8
C0005          2034.24              7  291.603333                 3


In [4]:
# Calculate similarity matrix
def calculate_similarity(features):
    feature_matrix = features.drop(['CustomerName', 'SignupDate'], axis=1)
    similarity_matrix = cosine_similarity(feature_matrix)
    similarity_df = pd.DataFrame(similarity_matrix, index=features.index, columns=features.index)
    return similarity_df

similarity_df = calculate_similarity(customer_features)

In [5]:
# Generate recommendations for the first 20 customers
def generate_recommendations(similarity_df, top_n=3):
    recommendations = {}
    for customer_id in similarity_df.index[:20]:
        similar_customers = similarity_df.loc[customer_id].sort_values(ascending=False)[1:top_n+1]
        recommendations[customer_id] = list(zip(similar_customers.index, similar_customers.values))
    return recommendations

recommendations = generate_recommendations(similarity_df)

In [6]:
# Save recommendations to CSV
recommendations_list = []
for cust_id, recs in recommendations.items():
    for rec in recs:
        recommendations_list.append({
            'CustomerID': cust_id,
            'SimilarCustomerID': rec[0],
            'SimilarityScore': rec[1]
        })

recommendations_df = pd.DataFrame(recommendations_list)
recommendations_df.to_csv("C:/Users/deepa/Downloads/Data Science Assignment - eCommerce Transactions Dataset/Deepanshu_Lookalike", index=False)

In [13]:
# Insights from Task 2
print("Lookalike recommendations for the first 20 customers.\n")
for customer_id, recs in recommendations.items():
    print(f"CustomerID: {customer_id}")
    for rec in recs:
        print(f"    Lookalike: {rec[0]}, Similarity Score: {rec[1]:.4f}")

Lookalike recommendations for the first 20 customers.

CustomerID: C0001
    Lookalike: C0024, Similarity Score: 1.0000
    Lookalike: C0189, Similarity Score: 1.0000
    Lookalike: C0107, Similarity Score: 1.0000
CustomerID: C0002
    Lookalike: C0129, Similarity Score: 1.0000
    Lookalike: C0019, Similarity Score: 1.0000
    Lookalike: C0076, Similarity Score: 1.0000
CustomerID: C0003
    Lookalike: C0179, Similarity Score: 1.0000
    Lookalike: C0190, Similarity Score: 1.0000
    Lookalike: C0064, Similarity Score: 1.0000
CustomerID: C0004
    Lookalike: C0045, Similarity Score: 1.0000
    Lookalike: C0143, Similarity Score: 1.0000
    Lookalike: C0113, Similarity Score: 1.0000
CustomerID: C0005
    Lookalike: C0132, Similarity Score: 1.0000
    Lookalike: C0089, Similarity Score: 1.0000
    Lookalike: C0192, Similarity Score: 1.0000
CustomerID: C0006
    Lookalike: C0152, Similarity Score: 1.0000
    Lookalike: C0011, Similarity Score: 1.0000
    Lookalike: C0168, Similarity Score

In [15]:
# Summary Insights
summary_insights = [
    "1. The similarity model effectively identifies customers with overlapping purchase behavior and profiles.",
    "2. Customers in similar regions or with comparable purchasing trends often appear as close matches.",
    "3. Re-engagement campaigns can be tailored for similar customers to boost sales."
]

In [17]:
for i, insight in enumerate(summary_insights, 1):
    print(f"Insight {i}: {insight}")

Insight 1: 1. The similarity model effectively identifies customers with overlapping purchase behavior and profiles.
Insight 2: 2. Customers in similar regions or with comparable purchasing trends often appear as close matches.
Insight 3: 3. Re-engagement campaigns can be tailored for similar customers to boost sales.
