In [7]:
#import
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Read the customer table and the transaction log (paths are relative to the working directory)
customers, transactions = (
    pd.read_csv(csv_path) for csv_path in ('Customers.csv', 'Transactions.csv')
)

In [9]:
# Attach customer attributes to every transaction (inner join on CustomerID,
# which is the pandas default join type)
data = transactions.merge(customers, on='CustomerID')

In [10]:
# Feature engineering: collapse the transaction history to one row per customer.
# Output columns keep their source names, so 'TransactionDate' ends up holding
# a transaction count rather than a date.
purchase_aggregations = {
    'ProductID': list,           # every product the customer bought
    'TotalValue': 'sum',         # total spending
    'Quantity': 'sum',           # total quantity purchased
    'TransactionDate': 'count',  # number of transactions
}
per_customer = data.groupby('CustomerID').agg(purchase_aggregations)
agg_data = per_customer.reset_index()  # turn CustomerID back into a regular column

In [11]:
# Join the per-customer aggregates onto the customer attributes
# (inner join: only customers present in both frames are kept)
final_data = customers.merge(agg_data, on='CustomerID')

In [12]:
# One-hot encode the categorical customer attributes
categorical_features = ['Region']  # extend this list to encode more columns
encoder = OneHotEncoder(sparse_output=False)  # sparse_output=False asks for a dense array
categorical_frame = final_data[categorical_features]
encoded_data = encoder.fit_transform(categorical_frame)

In [13]:
# Pull the numeric per-customer aggregates out as a plain array
numerical_features = ['TotalValue', 'Quantity', 'TransactionDate']
numerical_data = final_data.loc[:, numerical_features].to_numpy()

In [14]:
# Concatenate the encoded categorical columns and the numeric columns side by side.
# Both frames are built from bare arrays, so each carries default integer column
# labels starting at 0; ignore_index=True relabels the combined columns 0..n-1
# instead of leaving duplicate labels in the result.
encoded_frame = pd.DataFrame(encoded_data)
numerical_frame = pd.DataFrame(numerical_data)
feature_matrix = pd.concat([encoded_frame, numerical_frame], axis=1, ignore_index=True)

In [15]:
# Min-max scale every feature column; the fitted scaler is kept in `scaler`
# and the scaled values replace `feature_matrix`
scaler = MinMaxScaler().fit(feature_matrix)
feature_matrix = scaler.transform(feature_matrix)

In [16]:
# Pairwise cosine similarity between all customer feature rows:
# entry [i, j] is the similarity between row i and row j of feature_matrix
similarity_matrix = cosine_similarity(X=feature_matrix)

In [17]:
# Create recommendations: map each customer to their most similar other
# customers, as (CustomerID, similarity score) pairs ordered best-first.
TOP_N = 3  # number of lookalikes kept per customer

# Row i of similarity_matrix corresponds to the i-th row of final_data, so a
# positional list of IDs is enough to translate row positions back to customers.
customer_ids = final_data['CustomerID'].tolist()

lookalike_map = {}
for idx, cust_id in enumerate(customer_ids):
    # Pair every other customer's row position with its similarity score,
    # leaving out the customer's own entry
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]

    # Highest similarity first; keep only the best TOP_N
    top_similar = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:TOP_N]

    # float() converts the NumPy scalar to a plain Python float. Without it the
    # stored values are np.float64, which NumPy >= 2 renders as
    # "np.float64(0.9876)" when the list is later written out as text.
    lookalike_map[cust_id] = [
        (customer_ids[other_idx], round(float(score), 4))
        for other_idx, score in top_similar
    ]

In [18]:
# Flatten the lookalike map into one record per customer and write it to Lookalike.csv
lookalike_rows = [
    {'cust_id': customer, 'similar_customers': matches}
    for customer, matches in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv('Lookalike.csv', index=False)  # index=False: no row-number column

print("Lookalike.csv created successfully!")

Lookalike.csv created successfully!
