In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the three raw input tables (customers, products, transactions) from
# CSV files, read in that order and resolved relative to the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Build one wide table: attach the matching customer row to every transaction
# (joined on CustomerID), then attach the matching product row (joined on
# ProductID). Both joins use pandas' default join type.
merged_data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)

In [4]:
# Feature engineering: collapse the merged transaction rows into one profile
# row per customer.
#   Region     - first Region value found for the customer
#   Category   - every purchased product category joined into one
#                space-separated string (repeats are kept)
#   TotalValue - sum of TotalValue over the customer's transactions
#   Quantity   - sum of Quantity over the customer's transactions
per_customer = merged_data.groupby('CustomerID')
customer_profiles = per_customer.agg(
    Region=('Region', 'first'),
    Category=('Category', lambda categories: ' '.join(categories)),
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
).reset_index()

In [5]:
# Rescale the two numeric profile columns (TotalValue and Quantity) with
# min-max scaling. The scaled values overwrite the originals in
# customer_profiles; the fitted scaler stays available as `scaler`.
spend_and_volume = customer_profiles[['TotalValue', 'Quantity']]
scaler = MinMaxScaler().fit(spend_and_volume)
customer_profiles[['TotalValue', 'Quantity']] = scaler.transform(spend_and_volume)

In [6]:
# Build a single text "document" per customer: the Region followed by the
# space-separated category string, with one space between them. This column
# is the input to the TF-IDF vectorizer.
customer_profiles['CombinedText'] = customer_profiles['Region'].str.cat(
    customer_profiles['Category'], sep=' '
)

In [7]:
# Vectorize each customer's combined text with TF-IDF. The fitted vectorizer
# is kept as `tfidf`; `text_features` holds one row per customer profile.
# (Similarity itself is computed in a later cell.)
profile_documents = customer_profiles['CombinedText']
tfidf = TfidfVectorizer()
text_features = tfidf.fit_transform(profile_documents)

In [8]:
# Combine text and numerical features into one feature table, one row per
# customer, in the same row order as customer_profiles.
#
# The TF-IDF columns are given string labels ("tfidf_0", "tfidf_1", ...) so
# that every column name in the combined frame is a string. Without the
# prefix the frame would mix integer labels (0, 1, ...) with the string
# labels 'TotalValue' and 'Quantity'; mixed-type column names are fragile and
# are presumably rejected by scikit-learn estimators - confirm against the
# installed version. The numeric content is unchanged by the relabelling.
tfidf_frame = pd.DataFrame(text_features.toarray()).add_prefix('tfidf_')

# reset_index(drop=True) lines the numeric rows up positionally with the
# freshly built TF-IDF frame, whose index is 0..n-1.
numeric_frame = customer_profiles[['TotalValue', 'Quantity']].reset_index(drop=True)

combined_features = pd.concat([tfidf_frame, numeric_frame], axis=1)

In [9]:
# Pairwise cosine similarity between every pair of customer feature rows.
# Entry [i, j] compares row i with row j of combined_features.
similarity_matrix = cosine_similarity(X=combined_features)

In [10]:
# Generate lookalike recommendations for the first 20 customers.
NUM_TARGET_CUSTOMERS = 20  # how many leading rows of customer_profiles to score
TOP_N_LOOKALIKES = 3       # how many most-similar customers to keep for each

# Positional list of IDs: row i of similarity_matrix corresponds to
# customer_ids[i]. Looking IDs up by position (instead of by Series label)
# keeps the mapping correct even if customer_profiles does not carry a
# default 0..n-1 index.
customer_ids = customer_profiles['CustomerID'].tolist()

# Maps each target CustomerID to a list of (lookalike CustomerID, score)
# tuples, highest score first.
lookalike_results = {}
for i, customer_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Pair every other customer's row position with its similarity to
    # customer i; the customer itself is skipped so it never matches itself.
    candidates = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[i])
        if idx != i
    ]
    # Highest similarity first. sorted() is stable, so customers with equal
    # scores keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_results[customer_id] = [
        (customer_ids[idx], score)
        for idx, score in candidates[:TOP_N_LOOKALIKES]
    ]

In [11]:
# Flatten the lookalike results into one row per customer and save them to
# Mamta_Kumari_Lookalike.csv.
#
# Output columns, in order:
#   CustomerID, Lookalike1, Score1, Lookalike2, Score2, Lookalike3, Score3
# If a customer has fewer than three lookalikes (e.g. a very small dataset),
# the missing slots are filled with None instead of raising IndexError.
lookalike_rows = []
for cust_id, lookalikes in lookalike_results.items():
    row = {'CustomerID': cust_id}
    for rank in range(1, 4):
        if rank <= len(lookalikes):
            match_id, match_score = lookalikes[rank - 1]
        else:
            match_id, match_score = None, None
        row[f'Lookalike{rank}'] = match_id
        row[f'Score{rank}'] = match_score
    lookalike_rows.append(row)

lookalike_df = pd.DataFrame(lookalike_rows)

# index=False keeps the DataFrame's row index out of the CSV.
lookalike_df.to_csv('Mamta_Kumari_Lookalike.csv', index=False)