In [1]:
import pandas as pd
import gdown
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Google Drive direct-download links for the three input datasets,
# in the order: customers, products, transactions.
customers_url, products_url, transactions_url = (
    "https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE",
    "https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0",
    "https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF",
)

In [3]:
# Fetch each dataset from Google Drive into the working directory.
# The return value is checked explicitly: gdown.download returns the output
# path on success, and (in at least some gdown releases -- confirm for the
# pinned version) signals failure by returning None rather than raising.
# Without this check a failed download would go unnoticed and the next cell
# would read a missing or stale CSV.
for url, filename in (
    (customers_url, "Customers.csv"),
    (products_url, "Products.csv"),
    (transactions_url, "Transactions.csv"),
):
    if gdown.download(url, filename, quiet=False) is None:
        raise RuntimeError(f"Failed to download {filename} from {url}")

Downloading...
From: https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE
To: /content/Customers.csv
100%|██████████| 8.54k/8.54k [00:00<00:00, 17.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0
To: /content/Products.csv
100%|██████████| 4.25k/4.25k [00:00<00:00, 9.54MB/s]
Downloading...
From: https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF
To: /content/Transactions.csv
100%|██████████| 54.7k/54.7k [00:00<00:00, 49.8MB/s]


'Transactions.csv'

In [4]:
# Load the three downloaded CSV files into DataFrames (same order as the names).
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [5]:
# Attach customer and product attributes to every transaction (inner joins).
# Each join is expected to be many-to-one (many transactions per customer /
# per product). validate='many_to_one' makes pandas raise MergeError if an ID
# is duplicated in a lookup table, instead of silently multiplying
# transaction rows.
# Note: both transactions and products carry a Price column, so the merged
# frame contains Price_x (from transactions) and Price_y (from products).
data = (
    transactions
    .merge(customers, on='CustomerID', validate='many_to_one')
    .merge(products, on='ProductID', validate='many_to_one')
)

In [6]:
# Preview the first rows of the merged frame to sanity-check the joins.
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [7]:
# Create a user profile based on aggregated transaction data
def create_customer_profiles(data):
    """Build one text "profile" document per customer.

    Every row of ``data`` contributes the string
    ``"<ProductName> <Category> <Region>"``; all row strings belonging to the
    same ``CustomerID`` are then joined with single spaces, in row order.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``CustomerID``, ``ProductName``,
        ``Category`` and ``Region``. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``CustomerID`` and ``CustomerProfile``, with one row per
        customer (ordered by ``CustomerID``, the groupby default).
    """
    # Work on a copy of the text columns so the caller's frame is left
    # untouched; missing values become empty strings so that a single NaN
    # does not break the string join below.
    text_columns = data[['ProductName', 'Category', 'Region']].fillna('')
    row_text = (
        text_columns['ProductName'] + " " +
        text_columns['Category'] + " " +
        text_columns['Region']
    ).rename('CustomerProfile')
    # Group the per-row strings by the (index-aligned) CustomerID column.
    customer_profiles = (
        row_text.groupby(data['CustomerID'])
        .agg(' '.join)
        .reset_index()
    )
    return customer_profiles

# One text document per customer, built from their transaction history
customer_profiles = create_customer_profiles(data)

# Encode each profile document as a TF-IDF weighted term vector
vectorizer = TfidfVectorizer()
profile_texts = customer_profiles['CustomerProfile']
customer_vectors = vectorizer.fit_transform(profile_texts)

# Pairwise cosine similarity: entry [i, j] compares customer i with customer j
# (rows/columns follow the row order of customer_profiles)
similarity_matrix = cosine_similarity(customer_vectors)

In [8]:
# Number of lookalikes to keep per customer.
TOP_N = 3

# Positional list of customer IDs; position i corresponds to row/column i of
# similarity_matrix (both follow the row order of customer_profiles).
customer_ids = customer_profiles['CustomerID'].tolist()

# Map each customer ID to its TOP_N most similar *other* customers.
lookalike_map = {}
for idx, customer_id in enumerate(customer_ids):
    # Exclude the customer by index rather than assuming they sort first:
    # when another customer has an identical profile the scores tie, and the
    # stable sort could otherwise rank that twin ahead of the customer,
    # making the customer appear in their own lookalike list.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_map[customer_id] = [
        {"LookalikeCustomerID": customer_ids[other_idx], "SimilarityScore": round(score, 4)}
        for other_idx, score in candidates[:TOP_N]
    ]


# Flatten the per-customer lookalike lists into one wide row per customer:
# CustomerID, then an (ID, score) column pair for each ranked lookalike.
lookalike_rows = []
for cust_id, lookalikes in lookalike_map.items():
    row = {"CustomerID": cust_id}
    for i, lookalike in enumerate(lookalikes, start=1):
        row.update({
            f"Lookalike{i}_customerID": lookalike["LookalikeCustomerID"],
            f"Lookalike{i}_similarity_score": lookalike["SimilarityScore"],
        })
    lookalike_rows.append(row)

# Persist the full table, without the DataFrame index.
lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

# Show only the rows for the first 20 customers listed in the customers table.
first_twenty_ids = customers['CustomerID'][:20]
subset = lookalike_df[lookalike_df['CustomerID'].isin(first_twenty_ids)]
print(subset)

   CustomerID Lookalike1_customerID  Lookalike1_similarity_score  \
0       C0001                 C0039                       0.8965   
1       C0002                 C0173                       0.9130   
2       C0003                 C0181                       0.9056   
3       C0004                 C0118                       0.9414   
4       C0005                 C0162                       0.8739   
5       C0006                 C0187                       0.9232   
6       C0007                 C0045                       0.8856   
7       C0008                 C0154                       0.9278   
8       C0009                 C0062                       0.8473   
9       C0010                 C0141                       0.8331   
10      C0011                 C0087                       0.8971   
11      C0012                 C0076                       0.9341   
12      C0013                 C0102                       0.9381   
13      C0014                 C0128             