<a href="https://colab.research.google.com/github/KanikaRana29/Data-Science-Assignment/blob/main/Kanika_Rana_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing the libraries**

In [1]:
import pandas as pd
import numpy as np

**Loading the datasets**

In [26]:
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions=pd.read_csv('/content/Transactions.csv')

**Merge datasets for easy analysis as we don't have large datasets so we can easily perform operations on one merged dataset only**

In [27]:
merged_data = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(merged_data, products, on='ProductID')

**Feature Engineering**

In [28]:
customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'],  # Spending patterns
    'Category': lambda x: x.value_counts().idxmax(),  # Most purchased category
    'Region': 'first',  # Demographics
}).reset_index()
customer_features.columns = ['CustomerID', 'TotalSpend', 'AvgSpend', 'TopCategory', 'Region']
print(customer_features.head())

  CustomerID  TotalSpend  AvgSpend  TopCategory         Region
0      C0001     3354.52   670.904  Electronics  South America
1      C0002     1862.74   465.685   Home Decor           Asia
2      C0003     2725.38   681.345   Home Decor  South America
3      C0004     5354.88   669.360        Books  South America
4      C0005     2034.24   678.080  Electronics           Asia


**Encoding Categorical Values**

In [29]:
customer_features = pd.get_dummies(customer_features, columns=['Region', 'TopCategory'], drop_first=True)

**Customer feature matrix (matrix representation of customer features)**

In [30]:
customer_feature_matrix = customer_features.drop('CustomerID', axis=1).values

**Calculating Cosine Similarity**

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(customer_feature_matrix)

In [32]:
# Generate top 3 lookalikes for the first 20 customers
lookalike_map = {}
for idx, cust_id in enumerate(customer_features['CustomerID'][:20]):
    # similarity scores for the current customer
    similarity_scores = similarity_matrix[idx]

    # The top 3 similar customers (excluding the current customer itself)
    similar_customers = [(customer_features['CustomerID'][i], similarity_scores[i])
                         for i in range(len(similarity_scores)) if customer_features['CustomerID'][i] != cust_id]

    # Sort the customers based on similarity score (descending) and get the top 3
    top_3_similar = sorted(similar_customers, key=lambda x: x[1], reverse=True)[:3]

    # Storing the results in the lookalike map
    lookalike_map[cust_id] = top_3_similar

In [33]:
# Saving to Lookalike.csv
lookalike_df = pd.DataFrame([
    {'cust_id': k, 'lookalikes': v} for k, v in lookalike_map.items()
])
lookalike_df.to_csv('Lookalike.csv', index=False)

# results
print("Lookalike Map for First 20 Customers:\n", lookalike_map)

Lookalike Map for First 20 Customers:
 {'C0001': [('C0107', 0.9999999995277805), ('C0190', 0.9999999986747098), ('C0048', 0.9999999985800574)], 'C0002': [('C0178', 0.9999999867685928), ('C0158', 0.9999999321876165), ('C0117', 0.9999999318048751)], 'C0003': [('C0133', 0.9999999996151886), ('C0052', 0.999999998846642), ('C0158', 0.9999999909881527)], 'C0004': [('C0188', 0.9999999892176178), ('C0099', 0.9999999883793411), ('C0104', 0.9999999880317908)], 'C0005': [('C0186', 0.999999997396094), ('C0007', 0.9999999951365319), ('C0115', 0.999999986558959)], 'C0006': [('C0168', 0.9999999808715238), ('C0027', 0.9999999736694801), ('C0142', 0.9999999736694801)], 'C0007': [('C0115', 0.9999999978658433), ('C0005', 0.9999999951365319), ('C0186', 0.9999999854153196)], 'C0008': [('C0065', 0.9999999893695143), ('C0141', 0.9999999370477491), ('C0175', 0.9999999329031855)], 'C0009': [('C0062', 0.9999997700035874), ('C0061', 0.999999622345061), ('C0167', 0.9999995829277026)], 'C0010': [('C0092', 0.999999

In [34]:
from sklearn.metrics import average_precision_score

# Assuming `lookalike_map` contains recommendations for the first 20 customers
# We will need to assume some labels for this comparison
# For simplicity, we assume customers within the same region as "true" lookalikes.
# You can expand this for a more complex label generation.

# Generate a "true" lookalike label based on the region (or any other feature)
true_labels = {}

for cust_id in customer_features['CustomerID'][:20]:
    # Get the original region before one-hot encoding
    # by finding the one-hot encoded column with value 1
    region_cols = [col for col in customer_features.columns if col.startswith('Region_')]

    # Get the region value for the customer
    region = customer_features.loc[customer_features['CustomerID'] == cust_id, region_cols].idxmax(axis=1).values[0].replace('Region_', '')


    # Find customers within the same region based on the original region name
    true_labels[cust_id] = customer_features.loc[
        customer_features[region_cols].apply(lambda row: row.idxmax() == f'Region_{region}', axis=1),
        'CustomerID'
    ].tolist()

# Now, evaluate the top 3 recommended lookalikes for each customer
y_true = []
y_pred = []

for cust_id, recommendations in lookalike_map.items():
    # True lookalikes based on region
    true_similar_customers = true_labels[cust_id]

    # Get recommended lookalikes
    recommended_customers = [rec[0] for rec in recommendations]  # Extract customer IDs

    # For each recommendation, check if it's in the true lookalikes list
    for rec in recommended_customers:
        y_true.append(1 if rec in true_similar_customers else 0)
        y_pred.append(1)  # We assume all predictions are relevant to check precision/recall

# Calculate Precision, Recall, and F1-Score
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Precision: 0.7833
Recall: 1.0000
F1-Score: 0.8785
