In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
customers_url = 'https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE'
transactions_url = 'https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF'

customers = pd.read_csv(customers_url)
transactions = pd.read_csv(transactions_url)

In [10]:
def build_lookalike_model():
    # Aggregating transaction data
    customer_profiles = transactions.groupby('CustomerID').agg({'TotalValue': 'sum', 'Quantity': 'sum'}).reset_index()
    customer_profiles = customer_profiles.merge(customers, on='CustomerID')

    # Encoding categorical features
    customer_profiles['RegionEncoded'] = customer_profiles['Region'].astype('category').cat.codes

    # Preparing data for similarity calculation
    features = ['TotalValue', 'Quantity', 'RegionEncoded']
    scaler = StandardScaler()
    feature_matrix = scaler.fit_transform(customer_profiles[features])

    # Calculating cosine similarity matrix
    similarity_matrix = cosine_similarity(feature_matrix)

    # Extract lookalikes for each customer
    lookalikes = {}
    for idx, row in customer_profiles.iterrows():
        similar_indices = np.argsort(-similarity_matrix[idx])[:4]  # Top 3 lookalikes + self
        similar_customers = [
            (customer_profiles.iloc[i]['CustomerID'], similarity_matrix[idx][i])
            for i in similar_indices if i != idx
        ]
        lookalikes[row['CustomerID']] = similar_customers[:3]

    # Filter for customers C0001 to C0020
    lookalike_df = pd.DataFrame({
        'CustomerID': list(lookalikes.keys()),
        'Lookalikes': [str(value) for value in lookalikes.values()]
    })
    lookalike_df = lookalike_df[lookalike_df['CustomerID'].str.startswith('C000') &
                                (lookalike_df['CustomerID'] <= 'C0020')]

    # Save to CSV
    lookalike_df.to_csv("Lookalike.csv", index=False)
    print("Lookalike.csv generated with top 3 lookalikes for customers C0001-C0020!")
    
    return lookalike_df

In [11]:
lookalike_df = build_lookalike_model()
print(lookalike_df)


Lookalike.csv generated with top 3 lookalikes for customers C0001-C0020!
  CustomerID                                         Lookalikes
0      C0001  [('C0107', np.float64(0.9930023406314495)), ('...
1      C0002  [('C0088', np.float64(0.996991768235917)), ('C...
2      C0003  [('C0147', np.float64(0.9959158383692094)), ('...
3      C0004  [('C0113', np.float64(0.9939371956296096)), ('...
4      C0005  [('C0186', np.float64(0.997560932214978)), ('C...
5      C0006  [('C0048', np.float64(0.9879227167799908)), ('...
6      C0007  [('C0146', np.float64(0.999996089620408)), ('C...
7      C0008  [('C0109', np.float64(0.9860003282398785)), ('...
8      C0009  [('C0198', np.float64(0.999976844081536)), ('C...


In [12]:
print("\n### Accessing Lookalike.csv ###")
from IPython.display import FileLink
file_link = FileLink("Lookalike.csv")
file_link


### Accessing Lookalike.csv ###
