In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from google.colab import files

# Step 1: File Upload
# Prompt for the CSV files through the Colab upload helper.
uploaded = files.upload()

# Read each uploaded CSV into its own DataFrame.
transactions_df = pd.read_csv('/content/Transactions.csv')
customers_df = pd.read_csv('/content/Customers.csv')
products_df = pd.read_csv('/content/Products.csv')

# Step 2: Display Dataset Previews
# Show the leading rows of every table as a quick sanity check.
print("Transactions Data:")
print(transactions_df.head(5), "\n")

print("Customers Data:")
print(customers_df.head(5), "\n")

print("Products Data:")
print(products_df.head(5))

# Step 3: Data Merging
# Attach customer attributes to each transaction first, then product
# attributes; both joins use pandas' default join type.
merged_df = pd.merge(
    pd.merge(transactions_df, customers_df, on='CustomerID'),
    products_df,
    on='ProductID',
)
print("Merged DataFrame:")
print(merged_df.head(5))

# Step 4: Feature Engineering
# Collapse the merged transactions into one row per customer. Named
# aggregation yields the final column names directly, so no separate
# rename step is needed.
customer_features = (
    merged_df.groupby('CustomerID')
    .agg(
        TotalSpend=('TotalValue', 'sum'),            # Total spend
        TotalQuantity=('Quantity', 'sum'),           # Total quantity purchased
        UniqueProducts=('ProductID', 'nunique'),     # Distinct products purchased
        TransactionCount=('TransactionID', 'count'), # Total transactions made
    )
    .reset_index()
)

# Rescale every numeric feature with Min-Max scaling; CustomerID is an
# identifier, not a feature, so it is excluded from the scaler input.
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(
    customer_features.drop(columns='CustomerID')
)
normalized_df = (
    pd.DataFrame(normalized_features, columns=customer_features.columns[1:])
    .assign(CustomerID=customer_features['CustomerID'])
    .set_index('CustomerID')
)

# Step 5: Cosine Similarity Calculation
# Pairwise customer-to-customer similarity, labelled by CustomerID on
# both axes so scores can be looked up by ID.
cosine_sim_matrix = cosine_similarity(normalized_df)
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=normalized_df.index,
    columns=normalized_df.index,
)

# Step 6: Generate Lookalike Recommendations
def get_top_cosine_lookalikes(customer_id, top_n=3, sim_df=None):
    """
    Get top N most similar customers using cosine similarity.

    Parameters
    ----------
    customer_id : hashable
        Label of the customer to find lookalikes for.
    top_n : int, default 3
        Maximum number of lookalikes to return.
    sim_df : pandas.DataFrame, optional
        Similarity matrix labelled by customer ID on both axes. When
        omitted, the module-level ``cosine_sim_df`` is used.

    Returns
    -------
    list of tuple
        ``(other_customer_id, score)`` pairs ordered from most to least
        similar, with each score rounded to 10 decimal places. The list is
        empty when ``customer_id`` is not present in the matrix index.
    """
    if sim_df is None:
        sim_df = cosine_sim_df

    if customer_id not in sim_df.index:
        return []

    # Remove the customer's own entry by label instead of skipping the first
    # sorted position: when another customer ties with the self-score, the
    # self entry is not guaranteed to sort first, and a positional skip
    # could drop a real lookalike and return the customer itself.
    scores = sim_df[customer_id].drop(labels=customer_id)
    closest = scores.nlargest(top_n)
    return [(cust_id, round(score, 10)) for cust_id, score in closest.items()]

# Build the lookalike list for each of the first 20 customers in the
# normalized feature table, keyed by customer ID.
lookalike_results = {
    customer_id: get_top_cosine_lookalikes(customer_id)
    for customer_id in normalized_df.index[:20]
}

# Step 7: Save Results to CSV
# One row per customer; each Lookalike column holds one
# (customer_id, score) pair from the list above.
lookalike_df = pd.DataFrame.from_dict(
    lookalike_results,
    orient='index',
    columns=['Lookalike1', 'Lookalike2', 'Lookalike3'],
)

lookalike_df.to_csv('Himanshu_Raj_Lookalike.csv', index_label='CustomerID')
print("Lookalike model results saved as 'Himanshu_Raj_Lookalike.csv'")

# Step 8: Output Example
# Echo the leading rows of the saved table.
print("\nSample Lookalike Recommendations:")
print(lookalike_df.head())


Saving Transactions.csv to Transactions.csv
Saving Products.csv to Products.csv
Saving Customers.csv to Customers.csv
Transactions Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68   

Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  