## Task 2: Lookalike Model

#### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

#### 2. Load datasets


In [2]:
customers = pd.read_csv(r'dataset/Customers.csv')
products = pd.read_csv(r'dataset/Products.csv')
transactions = pd.read_csv(r'dataset/Transactions.csv')

#### Changing data type

In [3]:
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])

#### 3. Feature Engineering:

#### 3.1 Grouping the customers based on CustomerID

In [4]:
customer_transactions = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Price': 'mean',
    'TransactionDate': 'max'
}).reset_index()

In [5]:
customer_transactions.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,Price,TransactionDate
0,C0001,3354.52,12,278.334,2024-11-02 17:04:16
1,C0002,1862.74,10,208.92,2024-12-03 01:41:41
2,C0003,2725.38,14,195.7075,2024-08-24 18:54:04
3,C0004,5354.88,23,240.63625,2024-12-23 14:13:52
4,C0005,2034.24,7,291.603333,2024-11-04 00:30:22


In [6]:
customer_data = pd.merge(customers, customer_transactions, on='CustomerID')

In [7]:
customer_data['Recency'] = (pd.to_datetime('today') - customer_data['TransactionDate']).dt.days


#### 3.2 Normalization

In [8]:
scaler = StandardScaler()
numerical_features = ['TotalValue', 'Quantity', 'Price', 'Recency']
customer_data[numerical_features] = scaler.fit_transform(customer_data[numerical_features])


#### 3.3 Calculate Cosine similarity Similarity

In [9]:
# Cosine similarity calculation
features = customer_data[numerical_features].values
cosine_similarities = cosine_similarity(features)

In [10]:
# Create lookalike recommendations
lookalike_results = {}
for idx, row in customer_data.iterrows():
    target_customer_id = row['CustomerID']
    similarities = cosine_similarities[idx]
    similarities[idx] = 0
    similar_indices = similarities.argsort()[-3:][::-1]
    similar_customers = customer_data.iloc[similar_indices]
    
    lookalike_results[target_customer_id] = [(similar_customers.iloc[i]['CustomerID'], similarities[similar_indices[i]]) for i in range(3)]


In [11]:
lookalike_df = []
for cust_id, lookalikes in lookalike_results.items():
    if int(cust_id[-2:])>20:
        break
    for lookalike, score in lookalikes:
        lookalike_df.append({'CustomerID': cust_id, 'LookalikeCustomerID': lookalike, 'SimilarityScore': score})


#### 4. Save Results

In [12]:
lookalike_df = pd.DataFrame(lookalike_df)
lookalike_df.to_csv(r'dataset/Lookalike.csv', index=False)

In [20]:
from sklearn.cluster import KMeans
import pandas as pd

customer_features = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'Quantity': 'sum', 
}).reset_index()


customer_features.rename(columns={'TotalValue': 'TotalSpending',
                                  'TransactionID': 'TransactionFrequency',
                                  'Quantity': 'TotalQuantity'}, inplace=True)


kmeans = KMeans(n_clusters=5, random_state=42)
customer_features['Segment'] = kmeans.fit_predict(customer_features[['TotalSpending', 'TransactionFrequency', 'TotalQuantity']])

customers_with_segments = customers.merge(customer_features[['CustomerID', 'Segment']], on='CustomerID', how='left')
print(customers_with_segments.head())


  CustomerID        CustomerName         Region SignupDate  Segment
0      C0001    Lawrence Carroll  South America 2022-07-10      1.0
1      C0002      Elizabeth Lutz           Asia 2022-02-13      1.0
2      C0003      Michael Rivera  South America 2024-03-07      1.0
3      C0004  Kathleen Rodriguez  South America 2022-10-09      0.0
4      C0005         Laura Weber           Asia 2022-08-15      1.0


In [21]:
def evaluate_alignment(lookalike_results, customer_segments):
    correct_recommendations = 0
    total_recommendations = 0

    for cust_id, lookalikes in lookalike_results.items():
        cust_segment = customer_segments[customer_segments['CustomerID'] == cust_id]['Segment'].values[0]
        for lookalike, _ in lookalikes:
            lookalike_segment = customer_segments[customer_segments['CustomerID'] == lookalike]['Segment'].values[0]
            if cust_segment == lookalike_segment:
                correct_recommendations += 1
            total_recommendations += 1

    accuracy = correct_recommendations / total_recommendations
    print(f"Segment Alignment Accuracy: {accuracy:.2f}")

evaluate_alignment(lookalike_results, customers_with_segments)


Segment Alignment Accuracy: 0.66
