# In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the three input tables. The CSV files are read in the order
# customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        'D://Projects//Zeotap Assessment//Customers.csv',
        'D://Projects//Zeotap Assessment//Products.csv',
        'D://Projects//Zeotap Assessment//Transactions.csv',
    )
)

# Attach customer and product attributes to every transaction.
# Both merges use pandas' default inner join, so a transaction whose
# CustomerID or ProductID has no match is dropped, and a customer with no
# transactions never gets a profile row below.
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

# One text snippet per transaction: region, product name and category.
# Missing values are replaced with '' first; otherwise string + NaN makes the
# whole snippet NaN and the ' '.join below raises TypeError.
merged_data['CustomerFeatures'] = (
    merged_data['Region'].fillna('') + ' ' +
    merged_data['ProductName'].fillna('') + ' ' +
    merged_data['Category'].fillna('')
)

# Concatenate each customer's snippets into a single profile document:
# one row per CustomerID, in ascending CustomerID order (groupby sorts keys
# by default).
customer_profiles = (
    merged_data.groupby('CustomerID')['CustomerFeatures']
    .apply(' '.join)
    .reset_index()
)

# TF-IDF vectorization converts each customer's profile text into a numeric
# term-weight vector; cosine similarity between those vectors then scores how
# alike two customers' profiles are.
profile_texts = customer_profiles['CustomerFeatures']
vectorizer = TfidfVectorizer()
customer_vectors = vectorizer.fit_transform(profile_texts)

# Pairwise customer-to-customer similarity; row i and column i both refer to
# row i of customer_profiles.
similarity_matrix = cosine_similarity(customer_vectors)

# For each of the first NUM_TARGET_CUSTOMERS customer profiles, keep the TOP_K
# most similar *other* customers together with their similarity scores.
# NOTE(review): these are meant to be C0001 to C0020; that only holds if each
# of those IDs has a row in customer_profiles - confirm against the data.
NUM_TARGET_CUSTOMERS = 20
TOP_K = 3

lookalike_map = {}
customer_ids = customer_profiles['CustomerID'].tolist()

for idx, customer_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Rank all customers by similarity to this one, highest first. sorted()
    # is stable, so tied scores keep their customer_ids order.
    similarity_scores = sorted(
        enumerate(similarity_matrix[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Skip the customer itself, then keep the best TOP_K matches. Scores are
    # stored as built-in floats rather than NumPy scalars: under NumPy >= 2 a
    # NumPy scalar's repr is e.g. "np.float64(0.5)", which would leak into any
    # str()/repr() of these tuples.
    top_lookalikes = [
        (customer_ids[i], float(score))
        for i, score in similarity_scores
        if customer_ids[i] != customer_id
    ][:TOP_K]
    lookalike_map[customer_id] = top_lookalikes

# Flatten the lookalike map into one row per customer for the CSV export.
# Each 'Lookalikes' cell is the string form of a list of (CustomerID, score)
# tuples. Scores are cast to built-in floats before str() so the cell reads
# "[('C0002', 0.5), ...]" even under NumPy >= 2, where a NumPy scalar inside
# a list is rendered as "np.float64(0.5)".
lookalike_list = [
    {
        'CustomerID': cust_id,
        'Lookalikes': str([(other_id, float(score)) for other_id, score in lookalikes]),
    }
    for cust_id, lookalikes in lookalike_map.items()
]

# Naming the columns explicitly keeps the header row even if the map is empty.
lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('D://Projects//Zeotap Assessment//Mithunkumar_Ramanathan_Lookalike.csv', index=False)