In [2]:
import pandas as pd
import numpy as np


In [3]:
# Load the three raw tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [4]:
# Convert the date columns of both tables to pandas datetime values.
for frame, date_column in (
    (transactions, 'TransactionDate'),
    (customers, 'SignupDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [5]:
# Merge datasets to form a unified view with one row per transaction.
#
# Both lookups are many-to-one (many transactions per product / per customer).
# validate="many_to_one" makes pandas raise a MergeError if a ProductID or
# CustomerID appears more than once in its lookup table, instead of silently
# duplicating transaction rows and inflating the per-customer totals that are
# computed from this frame.

# Merge transactions with products to include product details.
# "Price" exists in both tables, so pandas suffixes it: Price_x comes from
# transactions (left side) and Price_y from products (right side).
transactions_with_products = pd.merge(
    transactions,
    products,
    on="ProductID",
    how="left",
    validate="many_to_one",
)

# Merge the resulting dataset with customers to include customer details
full_data = pd.merge(
    transactions_with_products,
    customers,
    on="CustomerID",
    how="left",
    validate="many_to_one",
)

# Display the first few rows of the unified dataset
full_data.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [6]:
# Dimensions of the merged frame as (rows, columns).
full_data.shape


(1000, 13)

In [7]:
# Total spend per customer: sum TotalValue over each customer's transactions
# and label the resulting Series "TotalSpend".
customer_total_spend = (
    full_data
    .groupby("CustomerID")["TotalValue"]
    .sum()
    .rename("TotalSpend")
)


In [8]:

# Preview the first five per-customer spend totals.
customer_total_spend.head()

CustomerID
C0001    3354.52
C0002    1862.74
C0003    2725.38
C0004    5354.88
C0005    2034.24
Name: TotalSpend, dtype: float64

In [9]:
# NOTE(review): this repeats the preview from the previous cell and shows the
# same five rows; it can be removed when the notebook is next cleaned up.
customer_total_spend.head()

CustomerID
C0001    3354.52
C0002    1862.74
C0003    2725.38
C0004    5354.88
C0005    2034.24
Name: TotalSpend, dtype: float64

In [10]:
# Feature engineering: build one row of aggregated features per customer.

# Total spend: sum of TotalValue across the customer's transactions.
customer_total_spend = (
    full_data
    .groupby("CustomerID")["TotalValue"]
    .sum()
    .rename("TotalSpend")
)

# Transaction count: number of distinct TransactionIDs for the customer.
customer_transaction_count = (
    full_data
    .groupby("CustomerID")["TransactionID"]
    .nunique()
    .rename("TransactionCount")
)

# Preferred category: the category with the largest total Quantity.
# Step 1 - total quantity bought per (customer, category) pair.
quantity_by_category = (
    full_data.groupby(["CustomerID", "Category"])["Quantity"].sum().reset_index()
)
# Step 2 - within each customer, put the highest-quantity category first.
ranked_categories = quantity_by_category.sort_values(
    by=["CustomerID", "Quantity"], ascending=[True, False]
)
# Step 3 - keep only that first (top) row per customer and name the column.
top_category_rows = ranked_categories.drop_duplicates(subset="CustomerID", keep="first")
customer_preferred_category = top_category_rows[["CustomerID", "Category"]].rename(
    columns={"Category": "PreferredCategory"}
)

# Join the three per-customer features on CustomerID.
spend_and_count = pd.merge(customer_total_spend, customer_transaction_count, on="CustomerID")
customer_features = spend_and_count.merge(customer_preferred_category, on="CustomerID")

# Show the first rows of the engineered feature table.
customer_features.head()


Unnamed: 0,CustomerID,TotalSpend,TransactionCount,PreferredCategory
0,C0001,3354.52,5,Electronics
1,C0002,1862.74,4,Home Decor
2,C0003,2725.38,4,Home Decor
3,C0004,5354.88,8,Home Decor
4,C0005,2034.24,3,Electronics


In [11]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Normalize numerical features (TotalSpend and TransactionCount)
scaler = MinMaxScaler()
numerical_features = customer_features[["TotalSpend", "TransactionCount"]]
normalized_numerical = scaler.fit_transform(numerical_features)

# One-hot encode the categorical feature (PreferredCategory).
# The encoder's `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and removed in 1.4, so neither spelling works on every
# version. Encoding with the default (sparse) output and densifying the result
# gives the same dense array on both sides of the rename.
encoder = OneHotEncoder(handle_unknown="ignore")
encoded_category = encoder.fit_transform(customer_features[["PreferredCategory"]]).toarray()

# Combine normalized numerical features and encoded categorical features.
# Rows are labelled by CustomerID and keep the row order of customer_features.
final_features = pd.DataFrame(
    data=np.hstack([normalized_numerical, encoded_category]),
    index=customer_features["CustomerID"]
)

# Compute cosine similarity matrix for all customers (row i / column j follow
# the row order of final_features).
similarity_matrix = cosine_similarity(final_features)

# Get the first 20 customers whose ID starts with "C00" (intended: C0001 - C0020)
target_customers = customer_features[customer_features["CustomerID"].str.startswith("C00")][:20]

# Find the top 3 lookalike customers for each target customer
n_lookalikes = 3
lookalike_results = {}
for cust_id in target_customers["CustomerID"]:
    # Locate the customer's row by ID instead of by loop counter, so the right
    # similarity row is used even if the targets are not rows 0..19.
    row_position = final_features.index.get_loc(cust_id)
    customer_similarities = similarity_matrix[row_position]

    # Rank all customers from most to least similar. A stable sort makes the
    # order of tied scores reproducible (earlier row wins).
    ranked_positions = np.argsort(-customer_similarities, kind="stable")

    # Exclude the customer explicitly by position. Assuming "self is always
    # the single highest score" breaks when another customer has an identical
    # feature vector: self could then be reported as its own lookalike.
    similar_indices = ranked_positions[ranked_positions != row_position][:n_lookalikes]
    similar_customers = final_features.index[similar_indices]

    # Store plain Python floats: under NumPy 2 a np.float64 is written to the
    # CSV as "np.float64(...)", which ast.literal_eval cannot parse back.
    similar_scores = [float(score) for score in customer_similarities[similar_indices]]

    # Save results in a dictionary: CustomerID -> [(lookalike_id, score), ...]
    lookalike_results[cust_id] = list(zip(similar_customers, similar_scores))

# Convert results into a DataFrame for export
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_results),
    "Lookalikes": list(lookalike_results.values())
})

# Save the results to Lookalike.csv
lookalike_csv_path = "Lookalike.csv"
lookalike_df.to_csv(lookalike_csv_path, index=False)

lookalike_csv_path




'Lookalike.csv'

In [12]:


# NOTE(review): this cell repeats the lookalike computation of the preceding
# cell, reusing the already computed normalized_numerical / encoded_category
# arrays; consider keeping only one copy.

# Combine normalized numerical features and encoded categorical features.
# Rows are labelled by CustomerID and keep the row order of customer_features.
final_features = pd.DataFrame(
    data=np.hstack([normalized_numerical, encoded_category]),
    index=customer_features["CustomerID"]
)

# Compute cosine similarity matrix for all customers (row i / column j follow
# the row order of final_features).
similarity_matrix = cosine_similarity(final_features)

# Get the first 20 customers whose ID starts with "C00" (intended: C0001 - C0020)
target_customers = customer_features[customer_features["CustomerID"].str.startswith("C00")][:20]

# Find the top 3 lookalike customers for each target customer
n_lookalikes = 3
lookalike_results = {}
for cust_id in target_customers["CustomerID"]:
    # Locate the customer's row by ID instead of by loop counter, so the right
    # similarity row is used even if the targets are not rows 0..19.
    row_position = final_features.index.get_loc(cust_id)
    customer_similarities = similarity_matrix[row_position]

    # Rank all customers from most to least similar. A stable sort makes the
    # order of tied scores reproducible (earlier row wins).
    ranked_positions = np.argsort(-customer_similarities, kind="stable")

    # Exclude the customer explicitly by position. Assuming "self is always
    # the single highest score" breaks when another customer has an identical
    # feature vector: self could then be reported as its own lookalike.
    similar_indices = ranked_positions[ranked_positions != row_position][:n_lookalikes]
    similar_customers = final_features.index[similar_indices]

    # Store plain Python floats: under NumPy 2 a np.float64 is written to the
    # CSV as "np.float64(...)", which ast.literal_eval cannot parse back.
    similar_scores = [float(score) for score in customer_similarities[similar_indices]]

    # Save results in a dictionary: CustomerID -> [(lookalike_id, score), ...]
    lookalike_results[cust_id] = list(zip(similar_customers, similar_scores))

# Convert results into a DataFrame for export
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_results),
    "Lookalikes": list(lookalike_results.values())
})

# Save the results to Lookalike.csv
lookalike_csv_path = "Lookalike.csv"
lookalike_df.to_csv(lookalike_csv_path, index=False)

lookalike_csv_path


'Lookalike.csv'

In [14]:
import ast

# Read the saved lookalike table back from disk. The CSV holds each list of
# (CustomerID, score) tuples as text, so every "Lookalikes" cell is parsed
# back into a Python list of tuples with ast.literal_eval.
lookalike_csv_path = "Lookalike.csv"
lookalike_data = pd.read_csv(lookalike_csv_path).assign(
    Lookalikes=lambda frame: frame["Lookalikes"].map(ast.literal_eval)
)

# Function to fetch lookalike customers for a given CustomerID
def get_lookalikes(customer_id, lookalike_df):
    """
    Look up the stored lookalikes of one customer.

    Args:
        customer_id (str): CustomerID to search for in the "CustomerID" column.
        lookalike_df (pd.DataFrame): Table with a "CustomerID" column and a
            "Lookalikes" column.

    Returns:
        The "Lookalikes" value of the first row whose CustomerID equals
        ``customer_id`` (in this notebook, a list of (CustomerID, score)
        tuples). If no row matches, a message string of the form
        "No lookalikes found for CustomerID <customer_id>" is returned instead.
    """
    is_match = lookalike_df["CustomerID"] == customer_id

    # Guard clause: nothing stored for this customer.
    if not is_match.any():
        return f"No lookalikes found for CustomerID {customer_id}"

    # First matching row wins if the ID appears more than once.
    return lookalike_df.loc[is_match, "Lookalikes"].iloc[0]



In [19]:
# Example usage: fetch the stored lookalikes for a single customer.
example_customer_id = "C0020"
get_lookalikes(customer_id=example_customer_id, lookalike_df=lookalike_data)


[('C0130', 0.9995444526367542),
 ('C0140', 0.9911734812068119),
 ('C0197', 0.9768575792178675)]