TASK 2: LOOKALIKE MODEL

1. DATA LOADING AND PREPROCESSING

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three raw tables from the working directory
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Parse the date columns into real datetimes instead of leaving them as strings
customers = customers.assign(SignupDate=pd.to_datetime(customers['SignupDate']))
transactions = transactions.assign(
    TransactionDate=pd.to_datetime(transactions['TransactionDate'])
)

# One row per transaction, enriched with the buyer's profile and the product's details.
# Left joins keep every transaction even if a customer/product lookup finds no match.
# Transactions and Products both carry a 'Price' column, so pandas disambiguates them
# as Price_x (transaction side) and Price_y (product side) in the result.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

# Preview the merged frame
print(merged_data.head())


  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00272      C0087      P067 2024-03-26 22:55:37         2   
4        T00363      C0070      P067 2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe 2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia 2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe 2024-04-04   
3      601.36   300.68  Travis Campbell  South America 2024-04-11   
4      902.04   300.68    Timothy Perez         Europe 2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker  Electronic

2. FEATURE ENGINEERING

In [2]:
# Build one feature row per customer from their transaction history.
# Customers without any transaction never appear in merged_data (it is built from
# Transactions with left joins), so they get no row here.
customer_features = merged_data.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),              # sum of TotalValue over all purchases
    num_transactions=('TransactionID', 'count'),    # number of transactions
    avg_transaction_value=('TotalValue', 'mean'),   # mean TotalValue per transaction
    # Distinct categories in first-seen order. dropna() guards against transactions whose
    # ProductID found no match in the left merge: a NaN Category would otherwise make
    # str.join raise TypeError.
    categories_purchased=('Category', lambda x: ', '.join(x.dropna().unique()))
).reset_index()

# Show customer features
print(customer_features.head())

  CustomerID  total_spent  num_transactions  avg_transaction_value  \
0      C0001      3354.52                 5                670.904   
1      C0002      1862.74                 4                465.685   
2      C0003      2725.38                 4                681.345   
3      C0004      5354.88                 8                669.360   
4      C0005      2034.24                 3                678.080   

                categories_purchased  
0     Books, Home Decor, Electronics  
1               Home Decor, Clothing  
2  Home Decor, Clothing, Electronics  
3     Books, Home Decor, Electronics  
4            Home Decor, Electronics  


3. CALCULATE SIMILARITY

In [3]:
# Numeric columns that describe each customer's purchasing profile
feature_columns = ['total_spent', 'num_transactions', 'avg_transaction_value']

# Standardise every feature so that columns on larger numeric scales
# (e.g. total_spent vs. num_transactions) do not dominate the similarity
scaler = StandardScaler()
scaler.fit(customer_features.loc[:, feature_columns])
scaled_features = scaler.transform(customer_features.loc[:, feature_columns])

# Pairwise cosine similarity: entry [i, j] compares customer row i with customer row j
similarity_matrix = cosine_similarity(scaled_features)

# Peek at the top-left 5x5 corner of the matrix
print(similarity_matrix[:5, :5])

[[ 1.          0.97368428  0.55235809 -0.45939188  0.54090993]
 [ 0.97368428  1.          0.70863336 -0.63052062  0.69951105]
 [ 0.55235809  0.70863336  1.         -0.99411512  0.99989428]
 [-0.45939188 -0.63052062 -0.99411512  1.         -0.9955419 ]
 [ 0.54090993  0.69951105  0.99989428 -0.9955419   1.        ]]


4. GENERATE RECOMMENDATIONS

In [4]:
def get_top_n_similar(customers, similarity_matrix, top_n=3, max_customers=20):
    """Return the most similar customers for the leading rows of ``customers``.

    Args:
        customers: DataFrame with a 'CustomerID' column whose row order matches
            the row/column order of ``similarity_matrix``.
        similarity_matrix: Square array-like; entry [i, j] is the similarity
            between customer i and customer j. It is never modified.
        top_n: Number of lookalikes to return per customer. Capped at the
            number of *other* customers; values <= 0 yield empty lists.
        max_customers: How many customers (taken from the top of ``customers``)
            to produce recommendations for. Defaults to 20; ``None`` means all.

    Returns:
        dict mapping each CustomerID to a list of ``(other_customer_id, score)``
        tuples ordered from most to least similar.
    """
    customer_ids = customers['CustomerID'].to_numpy()
    matrix = np.asarray(similarity_matrix)

    # Never return more neighbours than there are other customers, and never a
    # negative count (a slice like [-0:] would silently return everything).
    k = max(0, min(top_n, len(customer_ids) - 1))

    recommendations = {}
    for idx, customer_id in enumerate(customer_ids[:max_customers]):
        # astype() copies the row: indexing a NumPy matrix yields a view, so masking
        # the self-score in place would overwrite the caller's diagonal.
        similarity_scores = matrix[idx].astype(float)

        # Exclude the customer themself. -inf (rather than -1) is used because -1 is
        # a legitimate cosine similarity and could tie with a real neighbour.
        similarity_scores[idx] = -np.inf

        # Indices of the k highest scores, best first
        similar_indices = np.argsort(similarity_scores)[::-1][:k]
        recommendations[customer_id] = [
            (customer_ids[i], similarity_scores[i]) for i in similar_indices
        ]

    return recommendations

# Build the lookalike lists (3 nearest neighbours each) for the first 20 customers
top_similar_customers = get_top_n_similar(
    customers=customer_features,
    similarity_matrix=similarity_matrix,
    top_n=3,
)

# Sanity check: inspect the lookalikes found for customer C0001
print(top_similar_customers['C0001'])


[('C0137', np.float64(0.9993600788417096)), ('C0152', np.float64(0.9956575062125335)), ('C0121', np.float64(0.9930123335059389))]


5. CREATE Lookalike.csv file

In [6]:
# Map each customer to their list of (lookalike id, similarity score) pairs
lookalike_map = {
    source_id: [(neighbour_id, sim_score) for neighbour_id, sim_score in matches]
    for source_id, matches in top_similar_customers.items()
}

# Flatten the map into one row per (customer, lookalike) pair
lookalike_data = [
    [source_id, neighbour_id, sim_score]
    for source_id, matches in lookalike_map.items()
    for neighbour_id, sim_score in matches
]

lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)

# Persist the long-format table without the positional index
lookalike_df.to_csv('Ahalya_Innamuri_Lookalike.csv', index=False)

# Preview what was written
print(lookalike_df.head())


  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0137         0.999360
1      C0001               C0152         0.995658
2      C0001               C0121         0.993012
3      C0002               C0029         0.999638
4      C0002               C0199         0.998867
