In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the three raw tables in one pass: customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Customers.csv',
        'data/Products.csv',
        'data/Transactions.csv',
    )
)

In [3]:
# Enrich each transaction with its customer's columns, then its product's
# columns. Left joins keep every transaction row even when a CustomerID or
# ProductID has no match in the lookup table (those columns become NaN).
# NOTE: this rebinds `transactions`; re-running the cell without reloading
# the CSVs merges a second time and suffixes the overlapping columns.
transactions = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)


In [4]:
def create_customer_profiles(transactions: pd.DataFrame) -> pd.DataFrame:
    """Build one text "profile" per customer from their purchased products.

    For every transaction row the product name and category are joined with
    a space; all such strings belonging to the same customer are then joined
    with spaces into a single document.

    Parameters
    ----------
    transactions : pd.DataFrame
        Must contain the columns ``CustomerID``, ``ProductName`` and
        ``Category``. The frame is NOT modified.

    Returns
    -------
    pd.DataFrame
        Two columns, ``CustomerID`` and ``ProductInfo``, one row per distinct
        ``CustomerID`` (in sorted key order, as produced by ``groupby``).
    """
    # Missing names/categories are treated as empty text; otherwise the
    # concatenation yields NaN and ' '.join raises TypeError on it.
    product_info = (
        transactions['ProductName'].fillna('').astype(str)
        + ' '
        + transactions['Category'].fillna('').astype(str)
    )
    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # ProductInfo column as a side effect.
    enriched = transactions.assign(ProductInfo=product_info)
    customer_profiles = (
        enriched.groupby('CustomerID')['ProductInfo']
        .apply(' '.join)
        .reset_index()
    )
    return customer_profiles

In [5]:
# One row per customer: CustomerID plus the concatenated product-name/category
# text of all their transactions.
customer_profiles = create_customer_profiles(transactions)

In [6]:
# Fit a TF-IDF vectorizer (default settings) on the per-customer profile text.
# Row i of customer_vectors corresponds to row i of customer_profiles.
vectorizer = TfidfVectorizer()
customer_vectors = vectorizer.fit_transform(customer_profiles['ProductInfo'])

In [7]:
# Pairwise cosine similarity between all customer vectors: entry [i, j]
# compares rows i and j of customer_vectors.
similarity_matrix = cosine_similarity(customer_vectors)


In [8]:
# CustomerID -> positional row index in customer_profiles (the same position
# is used for that customer's row in similarity_matrix).
customer_index_map = dict(
    zip(customer_profiles['CustomerID'], range(len(customer_profiles)))
)


In [9]:
# How many customers (taken from the top of the customers table) to score,
# and how many lookalikes to keep for each of them.
NUM_TARGET_CUSTOMERS = 20
TOP_K = 3

lookalike_results = {}
for customer_id in customers['CustomerID'][:NUM_TARGET_CUSTOMERS]:
    # Profiles are built from transactions, so a customer who never bought
    # anything has no vector and is skipped.
    if customer_id not in customer_index_map:
        continue
    customer_idx = customer_index_map[customer_id]
    similarity_scores = list(enumerate(similarity_matrix[customer_idx]))
    # Drop the customer's own row by position, then rank the rest from most
    # to least similar. sorted() is stable, so ties keep row order.
    top_similar_customers = sorted(
        (entry for entry in similarity_scores if entry[0] != customer_idx),
        key=lambda entry: entry[1],
        reverse=True,
    )[:TOP_K]
    lookalike_results[customer_id] = [
        {
            'CustomerID': customer_profiles.iloc[other_idx]['CustomerID'],
            # Store a plain Python float: NumPy >= 2 renders its scalars as
            # `np.float64(...)`, which would end up verbatim in the CSV when
            # these dicts are written out as text.
            'Score': float(score),
        }
        for other_idx, score in top_similar_customers
    ]


In [10]:
# One output row per scored customer; the Lookalikes cell holds that
# customer's list of {'CustomerID', 'Score'} dicts.
lookalike_rows = [
    {'CustomerID': source_id, 'Lookalikes': matches}
    for source_id, matches in lookalike_results.items()
]
lookalike_df = pd.DataFrame(lookalike_rows)

# Persist the results without the DataFrame index column.
lookalike_df.to_csv('Lookalike.csv', index=False)
