In [23]:
# Load the Data

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

# Load the datasets
customers_df = pd.read_csv("Customers.csv")
transactions_df = pd.read_csv("Transactions.csv")
products_df = pd.read_csv("Products.csv")


In [13]:
# Display the first few rows of each dataset to understand its structure
print("Customers Data:")
print(customers_df.head())
print("\nTransactions Data:")
print(transactions_df.head())
print("\nProducts Data:")
print(products_df.head())


Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Transactions Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  

Products Data:
  ProductID      

In [24]:
# Data Preprocessing: Summarize customer transactions
customer_summary = transactions_df.groupby('CustomerID').agg(
    total_transactions=('TransactionID', 'count'),
    total_spent=('TotalValue', 'sum'),
    avg_spent=('TotalValue', 'mean')
).reset_index()

print("Customer Summary DataFrame:")
print(customer_summary.head())



Customer Summary DataFrame:
  CustomerID  total_transactions  total_spent  avg_spent
0      C0001                   5      3354.52    670.904
1      C0002                   4      1862.74    465.685
2      C0003                   4      2725.38    681.345
3      C0004                   8      5354.88    669.360
4      C0005                   3      2034.24    678.080


In [25]:
# Merge customer summary with customer profile (including Region)
merged_data = pd.merge(customer_summary, customers_df[['CustomerID', 'Region']], on='CustomerID', how='left')

print("Merged Data (Customer Profile + Transaction Summary):")
print(merged_data.head())



Merged Data (Customer Profile + Transaction Summary):
  CustomerID  total_transactions  total_spent  avg_spent         Region
0      C0001                   5      3354.52    670.904  South America
1      C0002                   4      1862.74    465.685           Asia
2      C0003                   4      2725.38    681.345  South America
3      C0004                   8      5354.88    669.360  South America
4      C0005                   3      2034.24    678.080           Asia


In [26]:
# Encoding 'Region' using LabelEncoder
le = LabelEncoder()
merged_data['Region_encoded'] = le.fit_transform(merged_data['Region'])

print("Encoded Region Data:")
print(merged_data[['CustomerID', 'Region', 'Region_encoded']].head())

# Create a customer profile vector: [Region_encoded, total_spent, avg_spent]
merged_data['profile_vector'] = merged_data[['Region_encoded', 'total_spent', 'avg_spent']].apply(list, axis=1)


Encoded Region Data:
  CustomerID         Region  Region_encoded
0      C0001  South America               3
1      C0002           Asia               0
2      C0003  South America               3
3      C0004  South America               3
4      C0005           Asia               0


In [27]:
# Calculate cosine similarity between customer profiles
customer_profiles = merged_data['profile_vector'].apply(pd.Series)

print("Customer Profiles DataFrame:")
print(customer_profiles.head())

similarity_matrix = cosine_similarity(customer_profiles)

print("Cosine Similarity Matrix:")
print(similarity_matrix[:5, :5])  # Print the first 5x5 block of the matrix



Customer Profiles DataFrame:
     0        1        2
0  3.0  3354.52  670.904
1  0.0  1862.74  465.685
2  3.0  2725.38  681.345
3  3.0  5354.88  669.360
4  0.0  2034.24  678.080
Cosine Similarity Matrix:
[[1.         0.99886775 0.99886812 0.99733367 0.9922775 ]
 [0.99886775 1.         0.99999943 0.99273363 0.99705449]
 [0.99886812 0.99999943 1.         0.99273366 0.99705392]
 [0.99733367 0.99273363 0.99273366 1.         0.98058052]
 [0.9922775  0.99705449 0.99705392 0.98058052 1.        ]]


In [30]:
# Create a list of recommended customers and similarity scores
recommendations_list = []
for customer, recommendations in lookalike_recommendations.items():
    customer_ids = [rec[0] for rec in recommendations]
    similarity_scores = [rec[1] for rec in recommendations]
    recommendations_list.append([customer_ids, similarity_scores])

# Create a DataFrame with columns for the recommended customers and their similarity scores
lookalike_df = pd.DataFrame(recommendations_list, index=lookalike_recommendations.keys(), columns=['Recommended_Customers', 'Similarity_Scores'])

# Save to CSV
lookalike_df.to_csv('Nancy_Gautam_Lookalike.csv', index_label='CustomerID')

# Displaying a preview of the saved file
print("\nLookalike recommendations have been saved to 'Nancy_Gautam_Lookalike.csv'. Preview:")
print(lookalike_df.head())






Lookalike recommendations have been saved to 'Nancy_Gautam_Lookalike.csv'. Preview:
       Recommended_Customers  \
C0001  [C0137, C0152, C0119]   
C0002  [C0142, C0002, C0178]   
C0003  [C0029, C0133, C0121]   
C0004  [C0139, C0113, C0102]   
C0005  [C0159, C0115, C0007]   

                                       Similarity_Scores  
C0001  [0.9999999999832881, 0.9999999999670558, 0.999...  
C0002  [1.0000000000000002, 1.0000000000000002, 1.000...  
C0003  [0.9999999998276338, 0.9999999982683516, 0.999...  
C0004  [0.9999999997985746, 0.9999999980341167, 0.999...  
C0005                                    [1.0, 1.0, 1.0]  


In [31]:
import numpy as np

# Calculate the average similarity score of the top N recommendations
similarity_scores = []
for customer_id, recommendations in lookalike_recommendations.items():
    for rec in recommendations:
        similarity_scores.append(rec[1])  # similarity score of the recommendation

# Average similarity score across all recommendations
average_similarity = np.mean(similarity_scores)
print(f"Average similarity score: {average_similarity:.4f}")


Average similarity score: 1.0000


In [32]:
# Recommend top 3 similar customers for the first 20 customers
lookalike_recommendations = {}

for i in range(20):  # For customers C0001 to C0020
    # Get similarities for the i-th customer
    similarities = similarity_matrix[i]
    
    # Get the indices of the 3 most similar customers
    similar_indices = similarities.argsort()[-4:-1][::-1]  # Excluding self (first index)
    
    # Create a dictionary of similar customers and their similarity scores
    similar_customers = [(merged_data['CustomerID'].iloc[j], similarities[j]) for j in similar_indices]
    lookalike_recommendations[merged_data['CustomerID'].iloc[i]] = similar_customers

# Print a few recommendations for the first 5 customers
for customer, recommendations in list(lookalike_recommendations.items())[:5]:
    print(f"Recommendations for Customer {customer}:")
    for rec in recommendations:
        print(f"\tCustomerID: {rec[0]}, Similarity Score: {rec[1]:.4f}")

Recommendations for Customer C0001:
	CustomerID: C0137, Similarity Score: 1.0000
	CustomerID: C0152, Similarity Score: 1.0000
	CustomerID: C0119, Similarity Score: 1.0000
Recommendations for Customer C0002:
	CustomerID: C0142, Similarity Score: 1.0000
	CustomerID: C0002, Similarity Score: 1.0000
	CustomerID: C0178, Similarity Score: 1.0000
Recommendations for Customer C0003:
	CustomerID: C0029, Similarity Score: 1.0000
	CustomerID: C0133, Similarity Score: 1.0000
	CustomerID: C0121, Similarity Score: 1.0000
Recommendations for Customer C0004:
	CustomerID: C0139, Similarity Score: 1.0000
	CustomerID: C0113, Similarity Score: 1.0000
	CustomerID: C0102, Similarity Score: 1.0000
Recommendations for Customer C0005:
	CustomerID: C0159, Similarity Score: 1.0000
	CustomerID: C0115, Similarity Score: 1.0000
	CustomerID: C0007, Similarity Score: 1.0000
