#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


#### Load the data

In [2]:
# Load the datasets
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Display the first few rows of each to ensure data is loaded correctly
print(customers_df.head())
print(products_df.head())
print(transactions_df.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067   2024-04-25 7:38:55         1   
3       

#### Data Preprocessing

Customer Profile Features: We can encode customer profile information, such as Region and calculate the number of days since the SignupDate.

In [10]:
# Encoding categorical data (Region)

customers_df['Region'] = customers_df['Region'].astype('category').cat.codes


# Calculate days since signup
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
customers_df['DaysSinceSignup'] = (pd.to_datetime('today') - customers_df['SignupDate']).dt.days


Transaction History Features: Aggregate transaction data to create a transaction profile per customer.

In [4]:
# Aggregating transaction data per customer
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])
transaction_agg = transactions_df.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    total_transactions=('TransactionID', 'count'),
    unique_products=('ProductID', 'nunique')
).reset_index()

# Merging transaction features with customer data
customer_data = customers_df.merge(transaction_agg, on='CustomerID', how='left')

# Features for the model
customer_features = customer_data[['Region', 'DaysSinceSignup', 'total_spend', 'total_transactions', 'unique_products']]



In [5]:
# Check for missing values
print(customer_features.isnull().sum())

Region                0
DaysSinceSignup       0
total_spend           1
total_transactions    1
unique_products       1
dtype: int64


In [6]:
# Option 1: Fill missing values with the mean
customer_features = customer_features.fillna(customer_features.mean())

#### Normalize the data (Scaling)

In [7]:
# Since different features (e.g., total_spend and Region) might have different scales, it's important to normalize the data.
# Normalize the features
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features)


#### Calculate cosine similarity

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(customer_features_scaled)

# Function to get top 3 similar customers for a given customer
def get_top_lookalikes(customer_id, cosine_sim_matrix, customer_data, top_n=3):
    customer_index = customer_data[customer_data['CustomerID'] == customer_id].index[0]
    similarity_scores = list(enumerate(cosine_sim_matrix[customer_index]))
    
    # Sort customers by similarity score in descending order, excluding self (index == customer_index)
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    lookalikes = []
    for idx, score in sorted_scores[1:top_n+1]:  # Skipping the first result (self)
        lookalikes.append((customer_data.iloc[idx]['CustomerID'], score))
    
    return lookalikes

# Generate Lookalikes for customers C0001 to C0020
lookalike_results = {}
for i in range(1, 21):
    customer_id = f'C{i:04d}'
    lookalikes = get_top_lookalikes(customer_id, cosine_sim, customer_data)
    lookalike_results[customer_id] = lookalikes


In [13]:

# Create a DataFrame to save the results
lookalike_df = pd.DataFrame({
    'cust_id': list(lookalike_results.keys()),
    'lookalikes': [str(lookalike_results[cust_id]) for cust_id in lookalike_results]
})

# Save the result to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display a preview of the saved results
print(lookalike_df.head())


  cust_id                                         lookalikes
0   C0001  [('C0152', 0.9950433733482509), ('C0174', 0.96...
1   C0002  [('C0027', 0.9445585739168655), ('C0134', 0.94...
2   C0003  [('C0031', 0.9773487825635152), ('C0052', 0.96...
3   C0004  [('C0108', 0.9813752769127972), ('C0102', 0.97...
4   C0005  [('C0159', 0.9998906371426377), ('C0007', 0.98...
