In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Display the first few rows of each dataset
print("Customers Dataset")
print(customers.head())

print("\nProducts Dataset")
print(products.head())

print("\nTransactions Dataset")
print(transactions.head())


Customers Dataset
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Dataset
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Dataset
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      

In [3]:
# Merge transactions with customers and products
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

# Create aggregated features for customers
customer_profiles = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total revenue generated by the customer
    'Quantity': 'sum',    # Total quantity of products bought
    'Category': lambda x: x.mode()[0],  # Most common product category purchased
    'Region': 'first',    # Customer's region
}).reset_index()

print("Customer Profiles:")
print(customer_profiles.head())

Customer Profiles:
  CustomerID  TotalValue  Quantity     Category         Region
0      C0001     3354.52        12  Electronics  South America
1      C0002     1862.74        10     Clothing           Asia
2      C0003     2725.38        14   Home Decor  South America
3      C0004     5354.88        23        Books  South America
4      C0005     2034.24         7  Electronics           Asia


In [4]:
# One-hot encode 'Region' and 'Category'
customer_profiles_encoded = pd.get_dummies(customer_profiles, columns=['Region', 'Category'], drop_first=True)

print("Encoded Customer Profiles:")
print(customer_profiles_encoded.head())


Encoded Customer Profiles:
  CustomerID  TotalValue  Quantity  Region_Europe  Region_North America  \
0      C0001     3354.52        12              0                     0   
1      C0002     1862.74        10              0                     0   
2      C0003     2725.38        14              0                     0   
3      C0004     5354.88        23              0                     0   
4      C0005     2034.24         7              0                     0   

   Region_South America  Category_Clothing  Category_Electronics  \
0                     1                  0                     1   
1                     0                  1                     0   
2                     1                  0                     0   
3                     1                  0                     0   
4                     0                  0                     1   

   Category_Home Decor  
0                    0  
1                    0  
2                    1  
3            

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(customer_profiles_encoded.drop(columns=['CustomerID']))

# Find top 3 similar customers for each customer
similar_customers = {}
for idx, customer_id in enumerate(customer_profiles['CustomerID']):
    # Get similarity scores for the current customer
    similar_scores = list(enumerate(similarity_matrix[idx]))
    # Sort by score (descending) and exclude the customer itself
    similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)[1:4]
    # Store the top 3 similar customers with their scores
    similar_customers[customer_id] = [(customer_profiles['CustomerID'][i], score) for i, score in similar_scores]

# Convert to DataFrame for Lookalike.csv
lookalike_df = pd.DataFrame([
    {'cust_id': cust_id, 'similar_customers': str(similarities)}
    for cust_id, similarities in similar_customers.items()
])

# Save as CSV
lookalike_df.to_csv("MadhuSudhanReddy_Takkoli_Lookalike.csv", index=False)
print("Lookalike CSV Created!")


Lookalike CSV Created!


In [6]:
lookalike_df_filtered = lookalike_df[lookalike_df['cust_id'].isin(customer_profiles['CustomerID'][:20])]

# Expand display options to show full output
pd.set_option('display.max_colwidth', None)  # No truncation for column width
pd.set_option('display.max_rows', None)     # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

print("Filtered Lookalike Recommendations for First 20 Customers:")
print(lookalike_df_filtered)

Filtered Lookalike Recommendations for First 20 Customers:
   cust_id  \
0    C0001   
1    C0002   
2    C0003   
3    C0004   
4    C0005   
5    C0006   
6    C0007   
7    C0008   
8    C0009   
9    C0010   
10   C0011   
11   C0012   
12   C0013   
13   C0014   
14   C0015   
15   C0016   
16   C0017   
17   C0018   
18   C0019   
19   C0020   

                                                                                similar_customers  
0   [('C0120', 0.9999999863980915), ('C0102', 0.9999999700909558), ('C0153', 0.9999999517366813)]  
1    [('C0034', 0.9999999113804374), ('C0176', 0.999999877736369), ('C0030', 0.9999998762472673)]  
2    [('C0031', 0.9999999713493515), ('C0025', 0.999999925816807), ('C0076', 0.9999999071299613)]  
3   [('C0169', 0.9999999926262992), ('C0165', 0.9999999836163164), ('C0174', 0.9999999767394253)]  
4   [('C0146', 0.9999999405460502), ('C0028', 0.9999999374492478), ('C0007', 0.9999999367645143)]  
5   [('C0126', 0.9999999998071625), ('C0171', 