In [45]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [3]:
# Load the customer table from Customers.csv (path is relative to the
# notebook's working directory) and show it as the cell output.
customers = pd.read_csv(filepath_or_buffer='Customers.csv')
customers

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15
...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07
196,C0197,Christina Harvey,Europe,2023-03-21
197,C0198,Rebecca Ray,Europe,2022-02-27
198,C0199,Andrea Jenkins,Europe,2022-12-03


In [5]:
# Load the product table from Products.csv (path is relative to the
# notebook's working directory) and show it as the cell output.
products = pd.read_csv(filepath_or_buffer='Products.csv')
products

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.30
1,P002,ActiveWear Smartwatch,Electronics,346.30
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31
...,...,...,...,...
95,P096,SoundWave Headphones,Electronics,307.47
96,P097,BookWorld Cookbook,Books,319.34
97,P098,SoundWave Laptop,Electronics,299.93
98,P099,SoundWave Mystery Book,Books,354.29


In [7]:
# Load the transaction log from Transactions.csv (path is relative to the
# notebook's working directory) and show it as the cell output.
transactions = pd.read_csv(filepath_or_buffer='Transactions.csv')
transactions

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68
...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86


In [11]:
# Attach customer and product attributes to every transaction.
#
# Both joins are lookups: many transactions point at one customer / one
# product.  validate="many_to_one" makes pandas raise a MergeError if a key is
# duplicated in the lookup table, instead of silently duplicating transaction
# rows (which would inflate the Quantity sums in the pivot table built from
# `merged`).
#
# Note: `transactions` and `products` both carry a "Price" column, so the
# result contains Price_x (from transactions) and Price_y (from products).
merged = (
    transactions
    .merge(customers, on="CustomerID", validate="many_to_one")
    .merge(products, on="ProductID", validate="many_to_one")
)
merged

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22,SoundWave Smartwatch,Electronics,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07,SoundWave Smartwatch,Electronics,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21,SoundWave Smartwatch,Electronics,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11,SoundWave Smartwatch,Electronics,459.86


In [13]:
# Build the customer-by-product purchase matrix: one row per CustomerID, one
# column per ProductID, each cell holding the summed Quantity for that pair
# (0 where the pair never occurs in `merged`).
customer_product_matrix = pd.pivot_table(
    merged,
    values="Quantity",
    index="CustomerID",
    columns="ProductID",
    aggfunc="sum",
    fill_value=0,
)
customer_product_matrix

ProductID,P001,P002,P003,P004,P005,P006,P007,P008,P009,P010,...,P091,P092,P093,P094,P095,P096,P097,P098,P099,P100
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
C0002,0,0,0,4,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
C0003,0,4,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0004,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,3,0,0,0
C0005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0199,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Standardise the purchase matrix before computing similarities.
# The scaler is fitted on the full matrix and then applied to that same
# matrix; the fitted `scaler` object is kept for later inspection.
scaler = StandardScaler().fit(customer_product_matrix)
customer_product_scaled = scaler.transform(customer_product_matrix)
customer_product_scaled

array([[-0.19155441, -0.19831279, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396],
       [-0.19155441, -0.19831279, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396],
       [-0.19155441,  6.11596653, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396],
       ...,
       [-0.19155441, -0.19831279, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396],
       [-0.19155441, -0.19831279, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396],
       [-0.19155441, -0.19831279, -0.23403059, ..., -0.17354437,
        -0.12925978, -0.13742396]])

In [21]:
# Pairwise cosine similarity between every pair of rows (customers) of the
# scaled matrix; entry [i, j] compares row i with row j.
customer_similarity = cosine_similarity(X=customer_product_scaled)
customer_similarity

array([[ 1.        , -0.04882928, -0.06147586, ..., -0.03838458,
         0.34902818, -0.06724444],
       [-0.04882928,  1.        , -0.03569919, ..., -0.00165323,
        -0.03453342, -0.04030028],
       [-0.06147586, -0.03569919,  1.        , ..., -0.03180765,
        -0.04766667, -0.04790728],
       ...,
       [-0.03838458, -0.00165323, -0.03180765, ...,  1.        ,
        -0.00988126,  0.365046  ],
       [ 0.34902818, -0.03453342, -0.04766667, ..., -0.00988126,
         1.        , -0.05334491],
       [-0.06724444, -0.04030028, -0.04790728, ...,  0.365046  ,
        -0.05334491,  1.        ]])

In [23]:
# Row order of the purchase matrix as a plain list, plus the reverse lookup
# CustomerID -> row position used to index into the similarity matrix.
customer_ids = customer_product_matrix.index.to_list()
customer_id_to_index = dict(zip(customer_ids, range(len(customer_ids))))

In [25]:
# NOTE: this cell repeats the preceding cell verbatim in effect; it rebuilds
# the same CustomerID list and CustomerID -> row position lookup, so running
# it again does not change any state.
customer_ids = customer_product_matrix.index.tolist()
customer_id_to_index = dict(
    (cid, position) for position, cid in enumerate(customer_ids)
)


In [27]:
# Container for the results: CustomerID -> list of (lookalike CustomerID, score).
lookalike_data = dict()

In [31]:
# For each of the first 20 customers (in matrix row order), rank every other
# customer by cosine similarity and keep the three highest-scoring ones.
for customer_id in customer_ids[:20]:
    row_pos = customer_id_to_index[customer_id]

    # Pair every other customer's ID with its similarity to this customer,
    # skipping the customer's own row.
    ranked = [
        (customer_ids[pos], score)
        for pos, score in enumerate(customer_similarity[row_pos])
        if pos != row_pos
    ]

    # Stable in-place sort, highest score first; ties keep matrix row order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    lookalike_data[customer_id] = ranked[:3]

In [33]:
def _lookalike_row(customer_id, matches):
    """Flatten one customer's top-three (lookalike ID, score) pairs into a flat record."""
    first, second, third = matches[0], matches[1], matches[2]
    return {
        "CustomerID": customer_id,
        "Lookalike1": first[0],
        "Score1": first[1],
        "Lookalike2": second[0],
        "Score2": second[1],
        "Lookalike3": third[0],
        "Score3": third[1],
    }


# One flat record per customer, in the insertion order of `lookalike_data`.
lookalike_rows = [
    _lookalike_row(cid, matches) for cid, matches in lookalike_data.items()
]

In [35]:
# Turn the list of per-customer records into a table (one row per customer)
# and show it as the cell output.
lookalike_df = pd.DataFrame(data=lookalike_rows)
lookalike_df

Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0194,0.404928,C0104,0.374002,C0020,0.366609
1,C0002,C0030,0.404617,C0091,0.383778,C0071,0.320158
2,C0003,C0181,0.477572,C0134,0.471016,C0144,0.4238
3,C0004,C0070,0.351901,C0175,0.316098,C0132,0.279599
4,C0005,C0096,0.487456,C0023,0.470252,C0055,0.3821
5,C0006,C0040,0.486909,C0178,0.397811,C0058,0.314163
6,C0007,C0079,0.617442,C0118,0.478277,C0020,0.456615
7,C0008,C0144,0.326751,C0169,0.278885,C0091,0.26056
8,C0009,C0140,0.533441,C0083,0.530842,C0162,0.498893
9,C0010,C0094,0.515064,C0092,0.405517,C0143,0.366012


In [37]:
# Plain-text preview of the first five lookalike rows.
print(lookalike_df.head(n=5))

  CustomerID Lookalike1    Score1 Lookalike2    Score2 Lookalike3    Score3
0      C0001      C0194  0.404928      C0104  0.374002      C0020  0.366609
1      C0002      C0030  0.404617      C0091  0.383778      C0071  0.320158
2      C0003      C0181  0.477572      C0134  0.471016      C0144  0.423800
3      C0004      C0070  0.351901      C0175  0.316098      C0132  0.279599
4      C0005      C0096  0.487456      C0023  0.470252      C0055  0.382100


In [39]:
# Write the lookalike table to Lookalike.csv in the working directory,
# leaving out the DataFrame's row index.
lookalike_df.to_csv(path_or_buf="Lookalike.csv", index=False)

In [41]:
# Accumulator for the verification step: one record per
# (customer, lookalike) pair will be appended to this list.
recommendation_quality = list()


In [47]:
# Recompute every reported similarity score by hand and record both values
# side by side for comparison.
#
# The result list is rebuilt from scratch in this cell so that re-running the
# cell does not append a second copy of every row (which would duplicate the
# rows of any DataFrame built from it).
recommendation_quality = []

for customer_id, recommendations in lookalike_data.items():
    original_index = customer_id_to_index[customer_id]
    original_profile = customer_product_scaled[original_index]
    # The customer's own norm does not change across its lookalikes.
    original_norm = np.linalg.norm(original_profile)

    for similar_customer, score in recommendations:
        similar_index = customer_id_to_index[similar_customer]
        similar_profile = customer_product_scaled[similar_index]

        # Cosine similarity = dot product / product of the vector norms.
        # A zero-norm profile is scored 0.0 rather than NaN, because a NaN
        # would never exceed a numeric threshold in a later discrepancy check.
        denominator = original_norm * np.linalg.norm(similar_profile)
        if denominator:
            manual_score = np.dot(original_profile, similar_profile) / denominator
        else:
            manual_score = 0.0

        recommendation_quality.append({
            "CustomerID": customer_id,
            "Lookalike": similar_customer,
            "ReportedScore": score,
            "CalculatedScore": manual_score
        })

In [49]:
# Tabulate the verification records: one row per (customer, lookalike) pair.
quality_df = pd.DataFrame(data=recommendation_quality)

In [57]:
# Keep only the rows whose reported and recalculated scores differ by more
# than 1e-6 in absolute value; an empty result means the scores agree.
score_gap = (quality_df["ReportedScore"] - quality_df["CalculatedScore"]).abs()
discrepancy_check = quality_df.loc[score_gap > 1e-6]
discrepancy_check

Unnamed: 0,CustomerID,Lookalike,ReportedScore,CalculatedScore


In [59]:
# Print a heading followed by the first five verification rows.
print("Sample Quality Verification:", quality_df.head(), sep="\n")

Sample Quality Verification:
  CustomerID Lookalike  ReportedScore  CalculatedScore
0      C0001     C0194       0.404928         0.404928
1      C0001     C0104       0.374002         0.374002
2      C0001     C0020       0.366609         0.366609
3      C0002     C0030       0.404617         0.404617
4      C0002     C0091       0.383778         0.383778


In [61]:
# Report the outcome of the check: no flagged rows means every score matched.
verdict = (
    "All similarity scores are accurate."
    if discrepancy_check.empty
    else "Discrepancies found in similarity scores."
)
print(verdict)

All similarity scores are accurate.
