In [11]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Step 1: Read the three input tables from CSV files in the working directory
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Step 2: Build one wide table by left-joining lookups onto the transactions
# Attach product columns to each transaction; the left join keeps every
# transaction row even when its ProductID has no match in `products`
transactions = pd.merge(transactions, products, how='left', on='ProductID')

# Attach the purchasing customer's profile columns in the same way
customer_data = pd.merge(transactions, customers, how='left', on='CustomerID')

# Step 3: Feature Engineering
def _most_frequent_category(categories):
    """Return the most common non-null value in *categories*.

    Ties resolve to the first value returned by ``Series.mode()``. If the
    group holds no non-null category at all (e.g. every ProductID of that
    customer failed to match a product in the left join), ``'Unknown'`` is
    returned instead of raising on an empty mode result.
    """
    modes = categories.mode()
    return modes.iloc[0] if not modes.empty else 'Unknown'


# Collapse the per-transaction rows into one feature row per customer.
# Named aggregation sets the output column names directly, so they cannot
# drift out of sync with the aggregations the way a positional rename can.
customer_features = (
    customer_data
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),                 # Total spending by customer
        TransactionCount=('ProductID', 'count'),             # Rows with a non-null ProductID
        TopCategory=('Category', _most_frequent_category),   # Most frequently purchased category
    )
    .reset_index()
)

# One-hot encode each customer's top category into a dense indicator matrix
top_category_column = customer_features[['TopCategory']]
encoder = OneHotEncoder(sparse_output=False).fit(top_category_column)
encoded_category = encoder.transform(top_category_column)

# Place the numeric aggregates and the one-hot columns side by side, aligned
# on customer_features' row index
numeric_part = customer_features[['TotalSpending', 'TransactionCount']]
category_part = pd.DataFrame(encoded_category, index=customer_features.index)
features = pd.concat([numeric_part, category_part], axis=1)

# The one-hot columns carry integer labels; convert every column name to a
# string to avoid sklearn validation errors
features.columns = features.columns.map(str)

# Step 4: Standardise every feature column with a fitted StandardScaler
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# Step 5: Pairwise cosine similarity between all customers' scaled features,
# wrapped in a DataFrame labelled by CustomerID on both axes
customer_ids = customer_features['CustomerID']
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Step 6: Generate Recommendations
# For each of the first 20 customers in customer_features, keep the 3 most
# similar other customers as (CustomerID, similarity) pairs.
# NOTE(review): customer_features only contains customers that appear in the
# transactions table, so "first 20" equals C0001..C0020 only if all of those
# customers have at least one transaction - confirm against the data.
lookalikes = {}
for customer_id in customer_features['CustomerID'][:20]:
    # Remove the customer's own entry by label rather than by position: when
    # another customer ties at similarity 1.0 (or floating-point rounding puts
    # them ahead), the self entry is not guaranteed to sort first.
    similar_customers = similarity_df[customer_id].drop(customer_id).nlargest(3)
    lookalikes[customer_id] = list(similar_customers.items())

# Step 7: Prepare Output CSV
# One row per customer: its id plus a comma-separated "id:score" list with
# scores rounded to two decimals
output_rows = [
    {
        'cust_id': cust_id,
        'list': ','.join(f"{cust}:{score:.2f}" for cust, score in matches),
    }
    for cust_id, matches in lookalikes.items()
]
lookalike_csv = pd.DataFrame(output_rows, columns=['cust_id', 'list'])

# Write the table without the row index and report completion
lookalike_csv.to_csv('Immaraju_Srilekha_Lookalike.csv', index=False)

print("Lookalike Model completed and output saved to 'Immaraju_Srilekha_Lookalike.csv'")

Lookalike Model completed and output saved to 'Immaraju_Srilekha_Lookalike.csv'
