In [26]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Read the three input tables in one pass (customers, products, transactions).
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Parse the signup date once and derive the year/month columns from it.
signup_dates = pd.to_datetime(customers_df['SignupDate'])
customers_df['SignupDate'] = signup_dates
customers_df['SignupYear'] = signup_dates.dt.year
customers_df['SignupMonth'] = signup_dates.dt.month

# Attach customer and then product details to every transaction row.
# Left joins keep each transaction even when no matching customer/product exists.
merged_df = (
    transactions_df
    .merge(customers_df, on='CustomerID', how='left')
    .merge(products_df, on='ProductID', how='left')
)

# Feature Engineering: Aggregate transaction data for each customer
def _most_common_category(categories):
    """Return the most frequent non-null category, or None if there is none.

    ``Series.mode()`` ignores missing values and returns an empty Series when
    every value is missing (e.g. none of a customer's products matched a row
    in the products table during the left merge), so taking its first element
    unconditionally would raise. When several categories tie, the first of
    the modes returned by pandas is used, as before.
    """
    modes = categories.mode()
    return None if modes.empty else modes.iloc[0]


# One row per customer: total spend, total units, number of transactions and
# the category the customer bought from most often.
customer_features = merged_df.groupby('CustomerID').agg(
    total_value=('TotalValue', 'sum'),
    total_quantity=('Quantity', 'sum'),
    num_transactions=('TransactionID', 'count'),
    most_common_category=('Category', _most_common_category),
).reset_index()

# Numeric columns that drive the similarity computation.
feature_columns = ['total_value', 'total_quantity', 'num_transactions']

# Standardize those columns and write the scaled values back in place.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[feature_columns])
customer_features[feature_columns] = scaled_values

# Only the standardized numeric features are compared between customers.
customer_features_final = customer_features[feature_columns]

# Pairwise cosine similarity; row/column i corresponds to row i of customer_features.
cos_sim = cosine_similarity(customer_features_final)

# Function to get the top 3 similar customers for a given customer
def get_top_3_lookalikes(customer_id, cos_sim_matrix, customers_list):
    """Return the three customers most similar to ``customer_id``.

    Args:
        customer_id: ID of the target customer; must appear in
            ``customers_list``.
        cos_sim_matrix: Similarity matrix indexable as ``matrix[i]``, where
            row ``i`` holds the scores of ``customers_list[i]`` against every
            customer, in ``customers_list`` order.
        customers_list: List of customer IDs in the same order as the rows
            of ``cos_sim_matrix``.

    Returns:
        A list of at most three ``(customer_id, similarity_score)`` tuples in
        descending order of similarity. The target customer is never included.
        Ties keep their original ``customers_list`` order.

    Raises:
        ValueError: If ``customer_id`` is not in ``customers_list``.
    """
    # Find the index of the target customer
    customer_idx = customers_list.index(customer_id)

    # Exclude the target by position instead of assuming it sorts first:
    # another customer can tie (or, through float rounding, exceed) the
    # self-similarity, which would otherwise leave the target in the result.
    candidates = [
        (idx, score)
        for idx, score in enumerate(cos_sim_matrix[customer_idx])
        if idx != customer_idx
    ]

    # Stable sort, highest similarity first
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [(customers_list[idx], score) for idx, score in candidates[:3]]

# Customer IDs in the same order as the rows of the similarity matrix.
customers_list = customer_features['CustomerID'].tolist()

# Build one record per (customer, lookalike) pair for the first 20 customers
# in customers_list. NOTE(review): this is C0001 - C0020 only if the IDs are
# sorted and each of those customers has at least one transaction -- confirm.
lookalike_recommendations = [
    {
        'CustomerID': customer_id,
        'LookalikeCustomerID': match_id,
        'SimilarityScore': match_score,
    }
    for customer_id in customers_list[:20]
    for match_id, match_score in get_top_3_lookalikes(customer_id, cos_sim, customers_list)
]

# Persist the recommendations as CSV and show a preview.
lookalike_df = pd.DataFrame(lookalike_recommendations)
lookalike_df.to_csv('Kavitha_L_Lookalike.csv', index=False)

print(lookalike_df.head())


  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0164         0.997598
1      C0001               C0103         0.995394
2      C0001               C0069         0.986073
3      C0002               C0029         0.999754
4      C0002               C0031         0.998986
