In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the customer, product and transaction tables from the local Downloads folder
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:/Users/Administrator/Downloads/Customers.csv",
        r"C:/Users/Administrator/Downloads/Products.csv",
        r"C:/Users/Administrator/Downloads/Transactions.csv",
    )
)

# Attach each customer's profile columns to their transaction rows
merged_df = pd.merge(transactions_df, customers_df, on='CustomerID', how='left')

# Summarise transactions per customer: total spend, transaction count, mean value
agg_data = (
    merged_df
    .groupby('CustomerID')
    .agg(
        total_spending=('TotalValue', 'sum'),
        num_transactions=('TransactionID', 'count'),
        avg_transaction_value=('TotalValue', 'mean'),
    )
    .reset_index()
)

# Bring Region back in; the default inner join keeps only customers found in both tables
customer_features = customers_df.loc[:, ['CustomerID', 'Region']]
final_customer_data = agg_data.merge(customer_features, on='CustomerID')

# One-hot encode Region (drop_first=True omits the indicator column of the first category)
final_customer_data = pd.get_dummies(
    final_customer_data,
    columns=['Region'],
    drop_first=True,
)

# Standardise the numeric columns so no single one dominates the similarity score
numeric_columns = ['total_spending', 'num_transactions', 'avg_transaction_value']
scaler = StandardScaler()
numerical_features = final_customer_data[numeric_columns]
final_customer_data[numeric_columns] = scaler.fit_transform(numerical_features)

# Feature matrix for similarity: every column except the identifier
features = final_customer_data.drop(columns='CustomerID')

# Pairwise cosine similarity across the rows of the customer feature matrix
cosine_sim = cosine_similarity(features)

# Label both axes with CustomerID so scores can be looked up by customer
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=final_customer_data['CustomerID'],
    columns=final_customer_data['CustomerID'],
)

# Sanity check: show the first five rows of the similarity matrix
print(cosine_sim_df.head(5))

# Output layout: three (lookalike ID, similarity score) pairs per customer
lookalike_columns = ['Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3']
top_n = len(lookalike_columns) // 2

# Maps each target CustomerID to a flat list [id1, score1, id2, score2, id3, score3]
lookalike_recommendations = {}

# Process the first 20 customers in final_customer_data (C0001 to C0020 when
# the rows are in CustomerID order with no gaps -- NOTE(review): confirm)
for customer_id in final_customer_data['CustomerID'].iloc[:20]:
    # Remove the customer's own entry by label rather than by position:
    # if another customer ties at the top score, the customer is not
    # guaranteed to be first after sorting, so skipping "the first entry"
    # could keep the customer as their own lookalike.
    candidate_scores = cosine_sim_df[customer_id].drop(labels=customer_id)

    # Highest-scoring remaining customers (at most top_n of them)
    top_3_similar = candidate_scores.nlargest(top_n)

    # Flatten to [id, score, id, score, ...] to match lookalike_columns
    recommendations = []
    for lookalike_id, score in top_3_similar.items():
        recommendations.extend((lookalike_id, score))

    # Pad short rows so every row has one value per output column
    recommendations.extend([None] * (len(lookalike_columns) - len(recommendations)))

    lookalike_recommendations[customer_id] = recommendations

# One row per target customer; passing the column names here also works
# when there are no rows at all
lookalike_df = pd.DataFrame.from_dict(
    lookalike_recommendations,
    orient='index',
    columns=lookalike_columns,
)

# Save the results to CSV. The DataFrame index holds the target customer IDs;
# index_label gives that column a header instead of leaving it blank.
lookalike_df.to_csv(
    r"C:/Users/Administrator/Downloads/Lookalike.csv",
    index_label='CustomerID',
)





CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.091325  0.873578  0.466153  0.050734  0.443770   
C0002       0.091325  1.000000  0.373253 -0.544009  0.699511 -0.600803   
C0003       0.873578  0.373253  1.000000 -0.022035  0.526667  0.439193   
C0004       0.466153 -0.544009 -0.022035  1.000000 -0.858947  0.174953   
C0005       0.050734  0.699511  0.526667 -0.858947  1.000000  0.018354   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001      -0.008215  0.008683  0.082873  0.076983  ...  0.945666  0.722249   
C0002       0.103107 -0.112276  0.917882  0.834864  ...  0.385544  0.749795   
C0003       0.407695 -0.374568  0.348233  0.296602  ...  0.867288  0.865173   
C0004      -0.719804  0.665620 -0.510923 -0.427138  ...  0.344623 -0.122123   
C0005  