In [30]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [31]:
# Load the three input tables from CSV files in the working directory.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


In [32]:
# Enrich each transaction with its product's columns, then with the buying
# customer's columns. Both joins use merge's default join type.
transactions_merged = pd.merge(
    pd.merge(transactions_df, products_df, on='ProductID'),
    customers_df,
    on='CustomerID',
)


In [33]:
# Customer-by-product matrix: one row per CustomerID, one column per ProductID,
# each cell holding the summed Quantity (0 where the pair never occurs).
customer_product_summary = pd.pivot_table(
    transactions_merged,
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    aggfunc='sum',
    fill_value=0,
)


In [34]:
# Fit a TF-IDF re-weighting (default settings) on the customer-by-product
# quantity matrix, then apply it to that same matrix.
tfidf_transformer = TfidfTransformer().fit(customer_product_summary)
tfidf_matrix = tfidf_transformer.transform(customer_product_summary)


In [35]:
# Pairwise cosine similarity between the rows of tfidf_matrix. Because
# tfidf_matrix was built from customer_product_summary, row/column i of the
# result corresponds to customer_product_summary.index[i] (the pivot table's
# order), not to the row order of customers_df.
cos_sim_matrix = cosine_similarity(tfidf_matrix)


In [36]:
def get_similar_customers(customer_id, sim_matrix, customer_ids, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Identifier of the customer to find lookalikes for.
    sim_matrix
        Square, row-indexable similarity matrix; ``sim_matrix[i][j]`` is the
        similarity between ``customer_ids[i]`` and ``customer_ids[j]``.
    customer_ids
        Sequence of identifiers in the same order as the rows/columns of
        ``sim_matrix``.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (customer_id, score) tuples
        Sorted by descending score; ties keep their original matrix order.
        Contains fewer than ``top_n`` entries when fewer other customers exist.

    Raises
    ------
    ValueError
        If ``customer_id`` does not appear in ``customer_ids``.
    """
    customer_ids = list(customer_ids)
    customer_idx = customer_ids.index(customer_id)

    # Drop the customer's own entry by position instead of assuming it sorts
    # first: another customer with an equally high score (e.g. an identical
    # purchase vector) could otherwise take the first slot, leaving the
    # customer listed as their own lookalike.
    sim_scores = [
        (idx, score)
        for idx, score in enumerate(sim_matrix[customer_idx])
        if idx != customer_idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    return [(customer_ids[idx], score) for idx, score in sim_scores[:top_n]]



In [37]:
customer_ids = list(customers_df['CustomerID'])

# Rows of cos_sim_matrix follow the pivot table's index (only customers that
# appear in the merged transactions, in the pivot's order), which need not
# match the row order or contents of Customers.csv. Map matrix positions
# through that index so each score is attributed to the right customer.
matrix_customer_ids = list(customer_product_summary.index)
customers_in_matrix = set(matrix_customer_ids)

lookalike_dict = {}

for customer_id in customer_ids[:20]:
    if customer_id not in customers_in_matrix:
        # No purchase history means no similarity row; record no lookalikes
        # rather than reading some other customer's row.
        lookalike_dict[customer_id] = []
        continue
    lookalike_dict[customer_id] = get_similar_customers(
        customer_id, cos_sim_matrix, matrix_customer_ids
    )


In [39]:
# One row per customer: the customer's ID and a list of
# {"CustomerID", "Score"} dicts, with each score rendered to two decimals.
lookalike_rows = [
    (
        source_id,
        [{"CustomerID": match_id, "Score": "%.2f" % match_score}
         for match_id, match_score in matches],
    )
    for source_id, matches in lookalike_dict.items()
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)
