# Importing Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets

In [3]:
# Directory holding the three input CSVs. Kept in a single constant so the notebook
# can be pointed at another location by editing one line.
# NOTE(review): this looks like a Google Drive mount inside Colab - presumably the
# drive has to be mounted before this cell runs; confirm.
DATA_DIR = '/content/drive/MyDrive/Zeotap_Assignment'

# Each file is read with pandas' default CSV settings.
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

# Convert date columns to datetime

In [4]:
# Parse the date column of each frame into pandas datetimes, overwriting the
# original column in place.
for frame, date_column in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

# Merge datasets

In [5]:
# Attach customer attributes to every transaction, then product attributes.
# Both joins use pandas' default inner join, so a transaction without a matching
# CustomerID or ProductID is dropped from `data`.
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
data = pd.merge(transactions_with_customers, products, on='ProductID')

# **Feature Engineering**

# Create customer profiles with aggregated features

In [7]:
# One row per CustomerID:
#   Region     - value taken from the customer's first row in `data`
#   Category   - every purchased product category, space-separated (repeats kept)
#   TotalValue - summed TotalValue across the customer's transactions
#   Quantity   - summed Quantity across the customer's transactions
customer_profiles = (
    data.groupby('CustomerID')
    .agg(
        Region=('Region', 'first'),
        Category=('Category', lambda categories: ' '.join(categories)),
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
    )
    .reset_index()
)


# Combine text-based features

In [8]:
# Build the text that gets vectorized: "<Region> <Category list>" for each customer.
customer_profiles['ProfileText'] = customer_profiles['Region'].str.cat(
    customer_profiles['Category'], sep=' '
)

# Vectorize text-based features using TF-IDF

In [9]:
# Fit a default-configured TF-IDF model on the profile strings and keep the
# resulting document-term matrix (one row per entry of ProfileText).
# NOTE(review): the default tokenizer lowercases and splits on word boundaries, so
# multi-word region or category values (if any) become separate tokens - confirm
# this is intended.
tfidf = TfidfVectorizer()
profile_texts = customer_profiles['ProfileText']
profile_vectors = tfidf.fit_transform(profile_texts)

# Compute similarity matrix

In [11]:
# Pairwise cosine similarity between every pair of profile vectors.
# dense_output=True is the library default, spelled out because the cells below
# index and iterate over rows of the result.
similarity_matrix = cosine_similarity(profile_vectors, dense_output=True)

# Find top 3 lookalike customers for each of the first 20 customers

In [12]:
# Dictionary to store lookalike results for each customer
lookalike_map = {}

# How many leading profile rows to score, and how many matches to keep for each.
NUM_TARGET_CUSTOMERS = 20
TOP_K_LOOKALIKES = 3

# Extract the ID column once. Looking it up with `.iloc[j]['CustomerID']` for every
# (target, candidate) pair builds a full row Series each time.
profile_ids = customer_profiles['CustomerID'].tolist()

# The target count is clamped so that having fewer than NUM_TARGET_CUSTOMERS
# profiles no longer raises IndexError.
# NOTE(review): targets are picked by row position in customer_profiles. They are
# C0001..C0020 only if each of those customers has a profile row - confirm.
for i in range(min(NUM_TARGET_CUSTOMERS, len(profile_ids))):
    customer_id = profile_ids[i]

    # Pair every other customer's ID with its similarity to the current customer.
    similarity_scores = [
        (profile_ids[j], score)
        for j, score in enumerate(similarity_matrix[i])
        if j != i  # a customer is never its own lookalike
    ]

    # Highest similarity first; the sort is stable, so ties keep profile-row order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep only the best matches as (CustomerID, score) tuples.
    lookalike_map[customer_id] = similarity_scores[:TOP_K_LOOKALIKES]


# Save lookalike results to CSV

In [13]:
# Build one CSV row per target customer: its ID plus a list of
# (lookalike CustomerID, similarity score rounded to 4 decimals) tuples.
lookalike_data = []
for cust_id, lookalikes in lookalike_map.items():
    lookalike_data.append({
        'CustomerID': cust_id,

        # Scores are converted to built-in floats before rounding. The list is
        # written to the CSV through its string form, and NumPy >= 2.0 renders
        # NumPy scalars as `np.float64(...)`, which would otherwise end up in the
        # file.
        'Lookalikes': [(l_id, round(float(score), 4)) for l_id, score in lookalikes]
    })

# Convert the collected rows to a DataFrame
lookalike_df = pd.DataFrame(lookalike_data)

# Write the result to 'Lookalike.csv' in the current working directory, without
# the index column
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike Model Completed. Results saved to Lookalike.csv.")

Lookalike Model Completed. Results saved to Lookalike.csv.
