In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

In [6]:
def prepare_customer_features():
    transaction_products = pd.merge(transactions_df, products_df[['ProductID', 'Category']], on='ProductID')
    
    customer_features = pd.DataFrame()
    
    customer_features['CustomerID'] = customers_df['CustomerID']
    customer_features['Region'] = customers_df['Region']
    
    customer_features['DaysSinceSignup'] = (pd.Timestamp('2024-12-31') - 
                                          pd.to_datetime(customers_df['SignupDate'])).dt.days
    
 
    transaction_stats = transaction_products.groupby('CustomerID').agg({
        'TransactionID': 'count', 
        'TotalValue': ['sum', 'mean'], 
        'Quantity': 'sum' 
    }).fillna(0)
    
    transaction_stats.columns = ['TransactionCount', 'TotalSpend', 'AvgPurchaseValue', 'TotalQuantity']
    customer_features = customer_features.merge(transaction_stats, left_on='CustomerID', 
                                              right_index=True, how='left').fillna(0)
    
    category_pivot = pd.crosstab(transaction_products['CustomerID'], 
                                transaction_products['Category'])
    category_pivot = category_pivot.div(category_pivot.sum(axis=1), axis=0).fillna(0)
    customer_features = customer_features.merge(category_pivot, left_on='CustomerID', 
                                              right_index=True, how='left').fillna(0)
    
    return customer_features

In [7]:
def preprocess_features(customer_features):
    region_dummies = pd.get_dummies(customer_features['Region'], prefix='Region')
    
    numerical_features = customer_features.drop(['CustomerID', 'Region'], axis=1)
    processed_features = pd.concat([numerical_features, region_dummies], axis=1)
    

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(processed_features)
    
    return scaled_features, processed_features.columns

In [8]:
def calculate_similarity_scores(customer_features, scaled_features):
    similarity_matrix = cosine_similarity(scaled_features)
    
    lookalike_dict = {}
    
    # Get top 3 similar customers for first 20 customers
    for idx in range(20):
        customer_id = customer_features['CustomerID'].iloc[idx]
        similarity_scores = similarity_matrix[idx]
        
        similar_indices = np.argsort(similarity_scores)[::-1][1:4]
        
        similar_customers = []
        for similar_idx in similar_indices:
            similar_customer_id = customer_features['CustomerID'].iloc[similar_idx]
            similarity_score = similarity_scores[similar_idx]
            similar_customers.append((similar_customer_id, round(similarity_score, 4)))
        
        lookalike_dict[customer_id] = similar_customers
    
    return lookalike_dict

In [9]:
def create_lookalike_recommendations():
    customer_features = prepare_customer_features()
    scaled_features, feature_names = preprocess_features(customer_features)
    
    lookalike_dict = calculate_similarity_scores(customer_features, scaled_features)
    
    # Create DataFrame for output
    recommendations = []
    for customer_id, similar_customers in lookalike_dict.items():
        recommendations.append({
            'CustomerID': customer_id,
            'Lookalikes': str(similar_customers)  # Convert list to string for CSV storage
        })
    
    recommendations_df = pd.DataFrame(recommendations)
    recommendations_df.to_csv('Rahul_Yadav_Lookalike.csv', index=False)
    
    return recommendations_df


In [10]:
recommendations = create_lookalike_recommendations()
display(recommendations)


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0112', np.float64(0.8784)), ('C0120', np.f..."
1,C0002,"[('C0134', np.float64(0.9356)), ('C0106', np.f..."
2,C0003,"[('C0031', np.float64(0.9256)), ('C0129', np.f..."
3,C0004,"[('C0113', np.float64(0.956)), ('C0104', np.fl..."
4,C0005,"[('C0007', np.float64(0.9749)), ('C0140', np.f..."
5,C0006,"[('C0187', np.float64(0.9048)), ('C0171', np.f..."
6,C0007,"[('C0005', np.float64(0.9749)), ('C0140', np.f..."
7,C0008,"[('C0098', np.float64(0.841)), ('C0194', np.fl..."
8,C0009,"[('C0198', np.float64(0.8793)), ('C0062', np.f..."
9,C0010,"[('C0061', np.float64(0.8964)), ('C0062', np.f..."


In [11]:
customer_features = prepare_customer_features()
scaled_features, feature_names = preprocess_features(customer_features)

print("\nFeature Names:")
for i, feature in enumerate(feature_names):
    print(f"{feature}")


Feature Names:
DaysSinceSignup
TransactionCount
TotalSpend
AvgPurchaseValue
TotalQuantity
Books
Clothing
Electronics
Home Decor
Region_Asia
Region_Europe
Region_North America
Region_South America
