1. Load and Merge Data
Start by loading the datasets into pandas DataFrames and merging them.

In [1]:
import pandas as pd

# Read the uploaded CSV files
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Display the first few rows to ensure data is loaded correctly
print(customers.head())
print(products.head())
print(transactions.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [3]:
import pandas as pd

# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge datasets
merged_data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

# Display a sample of the merged data
print(merged_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

2. Feature Engineering
Generate relevant features for each customer, such as:

Total Spending: Total value of transactions.
Transaction Count: Number of transactions.
Favorite Category: The product category most frequently purchased.

In [5]:
# Aggregate transaction data for each customer
customer_features = merged_data.groupby("CustomerID").agg({
    "TotalValue": "sum",  # Total spending
    "TransactionID": "count",  # Number of transactions
    "Category": lambda x: x.mode()[0]  # Most purchased category
}).rename(columns={"TotalValue": "TotalSpending", "TransactionID": "TransactionCount"})

# Reset the index for easy handling
customer_features.reset_index(inplace=True)
print(customer_features.head())


  CustomerID  TotalSpending  TransactionCount     Category
0      C0001        3354.52                 5  Electronics
1      C0002        1862.74                 4     Clothing
2      C0003        2725.38                 4   Home Decor
3      C0004        5354.88                 8        Books
4      C0005        2034.24                 3  Electronics


3. Normalize Features
Scale numerical features to ensure fair comparison.

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Normalize numerical features
scaler = MinMaxScaler()
customer_features[["TotalSpending", "TransactionCount"]] = scaler.fit_transform(
    customer_features[["TotalSpending", "TransactionCount"]]
)
print(customer_features.head())


  CustomerID  TotalSpending  TransactionCount     Category
0      C0001       0.308942               0.4  Electronics
1      C0002       0.168095               0.3     Clothing
2      C0003       0.249541               0.3   Home Decor
3      C0004       0.497806               0.7        Books
4      C0005       0.184287               0.2  Electronics


4. Calculate Similarity
Use cosine similarity to measure similarity between customers based on their features.

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(
    customer_features[["TotalSpending", "TransactionCount"]]
)

# Convert the similarity matrix to a DataFrame for easier handling
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features["CustomerID"],
    columns=customer_features["CustomerID"]
)
print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.989226  0.999346  0.999221  0.996229  0.966611   
C0002       0.989226  1.000000  0.983283  0.994234  0.972792  0.918682   
C0003       0.999346  0.983283  1.000000  0.997139  0.998715  0.975247   
C0004       0.999221  0.994234  0.997139  1.000000  0.992027  0.955742   
C0005       0.996229  0.972792  0.998715  0.992027  1.000000  0.985199   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.978093  0.970479  0.958036  0.983415  ...  0.998494  0.995197   
C0002       0.937079  0.995332  0.989679  0.999372  ...  0.995768  0.998806   
C0003       0.984983  0.961120  0.947041  0.976211  ...  0.995855  0.991005   
C0004       0.969113  0.979243  0.968605  0.989808  ...  0.999881  0.998286   
C0005  

In [None]:
5. Find Top 3 Similar Customers
Extract the top 3 similar customers for each of the first 20 customers (C0001 to C0020).

In [13]:
# Function to find top 3 similar customers for a given customer
def get_top_similar(customers, similarity_matrix, n=3):
    results = {}
    for customer_id in customers:
        # Get similarity scores for the customer and sort them
        scores = similarity_matrix.loc[customer_id].sort_values(ascending=False)
        top_customers = scores.iloc[1:n+1]  # Exclude self (highest similarity)
        results[customer_id] = list(zip(top_customers.index, top_customers.values.round(2)))
    return results

# Get the first 20 customers
top_20_customers = customer_features["CustomerID"][:20]
top_similar_customers = get_top_similar(top_20_customers, similarity_df)

# Display results
print(top_similar_customers)


{'C0001': [('C0173', 1.0), ('C0145', 1.0), ('C0137', 1.0)], 'C0002': [('C0103', 1.0), ('C0024', 1.0), ('C0034', 1.0)], 'C0003': [('C0155', 1.0), ('C0132', 1.0), ('C0107', 1.0)], 'C0004': [('C0164', 1.0), ('C0156', 1.0), ('C0021', 1.0)], 'C0005': [('C0193', 1.0), ('C0092', 1.0), ('C0100', 1.0)], 'C0006': [('C0138', 1.0), ('C0079', 1.0), ('C0148', 1.0)], 'C0007': [('C0082', 1.0), ('C0085', 1.0), ('C0171', 1.0)], 'C0008': [('C0047', 1.0), ('C0111', 1.0), ('C0157', 1.0)], 'C0009': [('C0019', 1.0), ('C0172', 1.0), ('C0161', 1.0)], 'C0010': [('C0084', 1.0), ('C0109', 1.0), ('C0184', 1.0)], 'C0011': [('C0022', 1.0), ('C0197', 1.0), ('C0046', 1.0)], 'C0012': [('C0102', 1.0), ('C0198', 1.0), ('C0136', 1.0)], 'C0013': [('C0100', 1.0), ('C0073', 1.0), ('C0064', 1.0)], 'C0014': [('C0014', 1.0), ('C0110', 1.0), ('C0097', 1.0)], 'C0015': [('C0149', 1.0), ('C0087', 1.0), ('C0053', 1.0)], 'C0016': [('C0099', 1.0), ('C0105', 1.0), ('C0048', 1.0)], 'C0017': [('C0086', 1.0), ('C0118', 1.0), ('C0192', 1.0

6. Save Output as Lookalike.csv
Create a CSV file with the required format.

In [15]:
# Convert the dictionary to a DataFrame
lookalike_data = [
    {"CustomerID": cust_id, "Lookalikes": str(similarities)}
    for cust_id, similarities in top_similar_customers.items()
]
lookalike_df = pd.DataFrame(lookalike_data)

# Save the output
lookalike_df.to_csv("FirstName_LastName_Lookalike.csv", index=False)
print("Lookalike.csv saved!")


Lookalike.csv saved!
