### Importing neccesary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

### Loading the datasets

In [2]:
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

### Merging the product dataset with transaction dataset


In [3]:

transactions = transactions.merge(products, on="ProductID", how="left")

### Feature Engineering


In [5]:
# Feature Engineering
def prepare_customer_features(customers, transactions):
    transaction_summary = transactions.groupby("CustomerID").agg(
        total_spent=("TotalValue", "sum"),
        avg_spent=("TotalValue", "mean"),
        transaction_count=("TransactionID", "count"),
        unique_categories=("Category", lambda x: x.nunique())
    ).reset_index()

    # Merge with customer information
    customer_data = customers.merge(transaction_summary, on="CustomerID", how="left")
    customer_data["Region"] = customer_data["Region"].fillna("Unknown")
    
    # Handle missing values (replace NaNs with zeros)
    customer_data.fillna(0, inplace=True)
    return customer_data

In [6]:
customer_features = prepare_customer_features(customers, transactions)

In [7]:
customer_features

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,total_spent,avg_spent,transaction_count,unique_categories
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,670.904000,5.0,3.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,465.685000,4.0,2.0
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,681.345000,4.0,3.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,669.360000,8.0,3.0
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,678.080000,3.0,2.0
...,...,...,...,...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07,4982.88,1245.720000,4.0,3.0
196,C0197,Christina Harvey,Europe,2023-03-21,1928.65,642.883333,3.0,2.0
197,C0198,Rebecca Ray,Europe,2022-02-27,931.83,465.915000,2.0,2.0
198,C0199,Andrea Jenkins,Europe,2022-12-03,1979.28,494.820000,4.0,2.0


### Convert Data for Similarity Computation

In [16]:
def vectorize_features(customer_data):
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    region_encoded = encoder.fit_transform(customer_data[["Region"]])    
    # Scale numerical data
    scaler = StandardScaler()
    numerical_data = scaler.fit_transform(
        customer_data[["total_spent", "avg_spent", "transaction_count", "unique_categories"]]
    )  
    # Combine numerical and encoded categorical data
    feature_matrix = np.hstack([region_encoded, numerical_data])
    return feature_matrix

In [17]:
feature_matrix = vectorize_features(customer_features)

In [18]:
feature_matrix

array([[ 0.        ,  0.        ,  0.        , ..., -0.05478053,
         0.        ,  0.17223279],
       [ 1.        ,  0.        ,  0.        , ..., -0.9039848 ,
        -0.45129368, -0.87160229],
       [ 0.        ,  0.        ,  0.        , ..., -0.01157526,
        -0.45129368,  0.17223279],
       ...,
       [ 0.        ,  1.        ,  0.        , ..., -0.90303305,
        -1.35388105, -0.87160229],
       [ 0.        ,  1.        ,  0.        , ..., -0.78342303,
        -0.45129368, -0.87160229],
       [ 1.        ,  0.        ,  0.        , ...,  1.1072471 ,
         0.        ,  1.21606787]], shape=(200, 8))

In [19]:
similarity_matrix = cosine_similarity(feature_matrix)

In [20]:
similarity_matrix

array([[ 1.        , -0.02922829,  0.87631918, ..., -0.01162104,
        -0.03601935,  0.05358653],
       [-0.02922829,  1.        ,  0.18251406, ...,  0.71572797,
         0.70128911, -0.4346459 ],
       [ 0.87631918,  0.18251406,  1.        , ...,  0.34211311,
         0.17860772, -0.0345325 ],
       ...,
       [-0.01162104,  0.71572797,  0.34211311, ...,  1.        ,
         0.9282682 , -0.589807  ],
       [-0.03601935,  0.70128911,  0.17860772, ...,  0.9282682 ,
         1.        , -0.6782523 ],
       [ 0.05358653, -0.4346459 , -0.0345325 , ..., -0.589807  ,
        -0.6782523 ,  1.        ]], shape=(200, 200))

### Function for output data


In [22]:
def get_lookalikes(similarity_matrix, customer_ids, target_ids):
    lookalikes = {}
    for target_idx, target_id in enumerate(target_ids):
        # Get similarity scores for the target customer
        scores = list(enumerate(similarity_matrix[target_idx]))
        # Sort by similarity score (exclude the customer itself)
        scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:4]
        # Map top 3 similar customers
        lookalikes[target_id] = [(customer_ids[idx], round(score, 3)) for idx, score in scores]
    return lookalikes

In [23]:
all_customer_ids = customer_features["CustomerID"].tolist()
lookalike_targets = customer_features[customer_features["CustomerID"].str.contains("C00")][:20]["CustomerID"]

In [24]:
lookalike_targets

0     C0001
1     C0002
2     C0003
3     C0004
4     C0005
5     C0006
6     C0007
7     C0008
8     C0009
9     C0010
10    C0011
11    C0012
12    C0013
13    C0014
14    C0015
15    C0016
16    C0017
17    C0018
18    C0019
19    C0020
Name: CustomerID, dtype: object

In [25]:
lookalike_map = get_lookalikes(similarity_matrix, all_customer_ids, lookalike_targets)

In [26]:
lookalike_map

{'C0001': [('C0152', np.float64(1.0)),
  ('C0107', np.float64(0.966)),
  ('C0011', np.float64(0.937))],
 'C0002': [('C0142', np.float64(0.978)),
  ('C0128', np.float64(0.945)),
  ('C0134', np.float64(0.937))],
 'C0003': [('C0129', np.float64(0.922)),
  ('C0001', np.float64(0.876)),
  ('C0152', np.float64(0.872))],
 'C0004': [('C0108', np.float64(0.983)),
  ('C0165', np.float64(0.974)),
  ('C0012', np.float64(0.967))],
 'C0005': [('C0159', np.float64(1.0)),
  ('C0123', np.float64(0.98)),
  ('C0186', np.float64(0.978))],
 'C0006': [('C0158', np.float64(0.972)),
  ('C0187', np.float64(0.934)),
  ('C0148', np.float64(0.934))],
 'C0007': [('C0140', np.float64(0.976)),
  ('C0080', np.float64(0.955)),
  ('C0078', np.float64(0.954))],
 'C0008': [('C0109', np.float64(0.973)),
  ('C0098', np.float64(0.94)),
  ('C0139', np.float64(0.921))],
 'C0009': [('C0010', np.float64(0.976)),
  ('C0060', np.float64(0.974)),
  ('C0199', np.float64(0.951))],
 'C0010': [('C0199', np.float64(0.992)),
  ('C0009',

In [27]:
output_data = {
    "cust_id": list(lookalike_map.keys()),
    "similar_customers": [lookalike_map[key] for key in lookalike_map.keys()]
}

output_df = pd.DataFrame(output_data)

In [28]:
output_df

Unnamed: 0,cust_id,similar_customers
0,C0001,"[(C0152, 1.0), (C0107, 0.966), (C0011, 0.937)]"
1,C0002,"[(C0142, 0.978), (C0128, 0.945), (C0134, 0.937)]"
2,C0003,"[(C0129, 0.922), (C0001, 0.876), (C0152, 0.872)]"
3,C0004,"[(C0108, 0.983), (C0165, 0.974), (C0012, 0.967)]"
4,C0005,"[(C0159, 1.0), (C0123, 0.98), (C0186, 0.978)]"
5,C0006,"[(C0158, 0.972), (C0187, 0.934), (C0148, 0.934)]"
6,C0007,"[(C0140, 0.976), (C0080, 0.955), (C0078, 0.954)]"
7,C0008,"[(C0109, 0.973), (C0098, 0.94), (C0139, 0.921)]"
8,C0009,"[(C0010, 0.976), (C0060, 0.974), (C0199, 0.951)]"
9,C0010,"[(C0199, 0.992), (C0009, 0.976), (C0062, 0.944)]"


In [29]:
output_df.to_csv("Lookalike.csv", index=False)
print("Lookalike.csv generated successfully!")

Lookalike.csv generated successfully!
