In [1]:
import pandas as pd

In [3]:
#loading dataset
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [5]:
#displaying dataset
print("Customers Data:\n", customers.head(), "\n")
print("Products Data:\n", products.head(), "\n")
print("Transactions Data:\n", transactions.head())

Customers Data:
   CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15 

Products Data:
   ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31 

Transactions Data:
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C

In [9]:
#checking missing values 
print("Missing Values:\n")
print("Customers:\n", customers.isnull().sum(), "\n")
print("Products:\n", products.isnull().sum(), "\n")
print("Transactions:\n", transactions.isnull().sum(), "\n")

Missing Values:

Customers:
 CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64 

Products:
 ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64 

Transactions:
 TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64 



In [11]:
#merge data here 
merged_df = transactions.merge(customers, on="CustomerID", how="left")
merged_df = merged_df.merge(products, on="ProductID", how="left")
print(merged_df.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067   2024-04-25 7:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [13]:
#feature engineering 
#total amount spent
customer_spending = merged_df.groupby("CustomerID")["TotalValue"].sum().reset_index()
customer_spending.columns = ["CustomerID", "TotalSpent"]

In [15]:
#total transactions made
transaction_count = merged_df.groupby("CustomerID")["TransactionID"].count().reset_index()
transaction_count.columns = ["CustomerID", "TransactionCount"]

In [17]:
#unique products purchased
unique_products = merged_df.groupby("CustomerID")["ProductID"].nunique().reset_index()
unique_products.columns = ["CustomerID", "UniqueProductsBought"]

In [19]:
#unique categories purchased
unique_categories = merged_df.groupby("CustomerID")["Category"].nunique().reset_index()
unique_categories.columns = ["CustomerID", "UniqueCategoriesBought"]

In [21]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
customers["RegionEncoded"] = encoder.fit_transform(customers["Region"])

In [23]:
#merge all features
customer_features = customers[["CustomerID", "RegionEncoded"]]
customer_features = customer_features.merge(customer_spending, on="CustomerID", how="left")
customer_features = customer_features.merge(transaction_count, on="CustomerID", how="left")
customer_features = customer_features.merge(unique_products, on="CustomerID", how="left")
customer_features = customer_features.merge(unique_categories, on="CustomerID", how="left")

# Fill missing values with 0
customer_features = customer_features.fillna(0)

print(customer_features.head())

  CustomerID  RegionEncoded  TotalSpent  TransactionCount  \
0      C0001              3     3354.52               5.0   
1      C0002              0     1862.74               4.0   
2      C0003              3     2725.38               4.0   
3      C0004              3     5354.88               8.0   
4      C0005              0     2034.24               3.0   

   UniqueProductsBought  UniqueCategoriesBought  
0                   5.0                     3.0  
1                   4.0                     2.0  
2                   4.0                     3.0  
3                   8.0                     3.0  
4                   3.0                     2.0  


In [25]:
#calculating similarities using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

feature_matrix = customer_features.drop("CustomerID", axis=1)
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features["CustomerID"], columns=customer_features["CustomerID"])

print(similarity_df.head())  


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.999999  1.000000  1.000000  1.000000  1.000000   
C0002       0.999999  1.000000  0.999999  0.999999  1.000000  0.999998   
C0003       1.000000  0.999999  1.000000  1.000000  0.999999  1.000000   
C0004       1.000000  0.999999  1.000000  1.000000  1.000000  1.000000   
C0005       1.000000  1.000000  0.999999  1.000000  1.000000  0.999999   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.999999  0.999999  0.999996  0.999999  ...  1.000000  1.000000   
C0002       0.999999  1.000000  0.999997  1.000000  ...  0.999999  0.999999   
C0003       0.999999  0.999999  0.999996  0.999999  ...  1.000000  1.000000   
C0004       1.000000  0.999999  0.999995  0.999999  ...  1.000000  0.999999   
C0005  

In [31]:
#Finding top 3 similar customers

def get_top_3_lookalikes(customer_id):
    scores = similarity_df[customer_id].drop(customer_id)  
    top_3 = scores.nlargest(3)  
    return list(zip(top_3.index, top_3.values)) 

lookalike_results = {}

for customer in customer_features["CustomerID"][:20]: 
    lookalike_results[customer] = get_top_3_lookalikes(customer)

lookalike_df = pd.DataFrame([
    [cust] + [sim[0] for sim in lookalike_results[cust]] + [sim[1] for sim in lookalike_results[cust]]
    for cust in lookalike_results
], columns=["CustomerID", "Similar1", "Similar2", "Similar3", "Score1", "Score2", "Score3"])


lookalike_df.to_csv("LookalikeCustomers.csv", index=False)

print("LookalikeCustomers file created successfully!")


LookalikeCustomers file created successfully!
