#### Load datasets

In [3]:
import pandas as pd

# Load datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')


#### Inspect data

In [6]:
customers.head(), products.head(), transactions.head()

(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067   2024-04-25 7:38:55    

##### Merge Datasets

In [10]:
# Merge customers and transactions
customer_transactions = pd.merge(transactions, customers, on='CustomerID')

# Merge with products to get product details
customer_transactions = pd.merge(customer_transactions, products, on='ProductID')


In [12]:
print(customers.columns)
print(transactions.columns)
print(products.columns)


Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')
Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')
Index(['ProductID', 'ProductName', 'Category', 'Price'], dtype='object')


#### Check for Missing Data

In [15]:
print(customers['CustomerID'].isnull().sum())
print(transactions['CustomerID'].isnull().sum())
print(products['ProductID'].isnull().sum())


0
0
0


#### Check Merge Results

In [18]:
customer_transactions.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 7:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [36]:
# Merge transactions with customers
merged_transactions_customers = pd.merge(transactions, customers, on='CustomerID', how='inner')
print(merged_transactions_customers.head())

# Merge with products
merged_all = pd.merge(merged_transactions_customers, products, on='ProductID', how='inner')
print(merged_all.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067   2024-04-25 7:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price     CustomerName         Region  SignupDate  \
0      300.68  300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68  300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68  300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36  300.68  Travis Campbell  South America  2024-04-11   
4      902.04  300.68    Timothy Perez         Europe  2022-03-15   

   RegionEncoded  
0              1  
1              0  
2              1  
3              3  
4              1  
  TransactionID CustomerID ProductID      Tr

##### Step 1: Feature Engineering

In [27]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

from sklearn.preprocessing import LabelEncoder

# Encode the 'Region' column
label_encoder = LabelEncoder()
customer_transactions['RegionEncoded'] = label_encoder.fit_transform(customer_transactions['Region'])

# Now you can aggregate and calculate the features
customer_features = customer_transactions.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    purchase_frequency=('TransactionID', 'count'),
    region=('RegionEncoded', 'first')
).reset_index()

# Check the resulting features
customer_features.head()

Unnamed: 0,CustomerID,total_spend,purchase_frequency,region
0,C0001,3354.52,5,3
1,C0002,1862.74,4,0
2,C0003,2725.38,4,3
3,C0004,5354.88,8,3
4,C0005,2034.24,3,0


##### Compute Similarity

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Prepare the features (total_spend, purchase_frequency, and region)
customer_feature_matrix = customer_features[['total_spend', 'purchase_frequency', 'region']].values

# Compute the cosine similarity
similarity_matrix = cosine_similarity(customer_feature_matrix)

# Convert similarity matrix to DataFrame for easier interpretation
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

# Display the similarity matrix
similarity_df.head()


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.999999,1.0,1.0,1.0,1.0,1.0,1.0,0.999998,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
C0002,0.999999,1.0,0.999999,1.0,1.0,0.999999,1.0,1.0,0.999999,1.0,...,0.999999,0.999999,1.0,1.0,1.0,0.999999,1.0,0.999999,1.0,0.999999
C0003,1.0,0.999999,1.0,1.0,0.999999,1.0,0.999999,0.999999,0.999998,0.999999,...,1.0,1.0,0.999999,1.0,1.0,0.999999,1.0,1.0,1.0,0.999999
C0004,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999998,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
C0005,1.0,1.0,0.999999,1.0,1.0,1.0,1.0,1.0,0.999998,0.999999,...,0.999999,0.999999,1.0,1.0,1.0,1.0,1.0,0.999999,1.0,1.0


##### Generate Recommendations

In [43]:
def generate_recommendations(customer_id, top_n=5):
    # Get similarity scores for the given customer
    similar_scores = similarity_df[customer_id]
    
    # Sort the customers by similarity (descending order)
    similar_customers = similar_scores.sort_values(ascending=False).iloc[1:top_n+1]
    
    return similar_customers

# Example: Generate recommendations for customer C0001
customer_id = 'C0001'
recommendations = generate_recommendations(customer_id)

# Display the recommendations
print(f"Top 5 customers similar to {customer_id}:")
print(recommendations)


Top 5 customers similar to C0001:
CustomerID
C0137    1.0
C0152    1.0
C0107    1.0
C0035    1.0
C0131    1.0
Name: C0001, dtype: float64


- Merged the customers, transactions, and products datasets to create a comprehensive customer_transactions dataframe, ensuring all relevant information is available.
- Check for Missing Data:

- You confirmed that there is no missing data in the CustomerID and ProductID columns, making the dataset clean for analysis.
Feature Engineering:

- Encoded the Region column and aggregated the data based on CustomerID to create meaningful features such as total_spend, purchase_frequency, and region.
- Compute Similarity:

- You computed the cosine similarity between customers based on their spending behavior, frequency of purchases, and region.
The resulting similarity matrix is displayed, with high similarity scores indicating customers with similar behavior.
Generate Recommendations:

- You implemented a function generate_recommendations that takes a customer ID and returns the top N most similar customers based on the computed similarity matrix.
The example output showed that for customer C0001, the most similar customers were those with high similarity scores.

In [53]:
import pandas as pd

# Function to save both similarity matrix and recommendations
def save_to_csv(df, file_name):
    try:
        df.to_csv(file_name, index=False if 'SimilarityScore' in df.columns else True)
        print(f"Saved to {file_name}")
    except Exception as e:
        print(f"Error: {e}")

# Save similarity matrix
save_to_csv(similarity_df, 'lookalike_model_customers.csv')

# Save recommendations
recommendations_df = recommendations.reset_index().rename(columns={0: 'SimilarityScore'})
recommendations_df['TargetCustomerID'] = 'C0001'
save_to_csv(recommendations_df, 'customer_recommendations.csv')


Saved to lookalike_model_customers.csv
Saved to customer_recommendations.csv


In [None]:
Saved to lookalike_model_customers.csv
Saved to customer_recommendations.csv