# Step 1 : Data Preprocessing

In [11]:
import pandas as pd

# Load datasets
customers = pd.read_csv('C:/Users/Admin/Desktop/Customers.csv')
products = pd.read_csv('C:/Users/Admin/Desktop/products.csv')
transactions = pd.read_csv('C:/Users/Admin/Desktop/transactions.csv')


In [12]:
# Merge datasets to get a comprehensive view
data = pd.merge(transactions, customers, on='CustomerID')
data = pd.merge(data, products, on='ProductID')

In [13]:
# Convert dates to datetime format
data['SignupDate'] = pd.to_datetime(data['SignupDate'])
data['TransactionDate'] = pd.to_datetime(data['TransactionDate'])

In [14]:
# Display the first few rows of the merged dataset
print(data.head())

  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00272      C0087      P067 2024-03-26 22:55:37         2   
4        T00363      C0070      P067 2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe 2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia 2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe 2024-04-04   
3      601.36   300.68  Travis Campbell  South America 2024-04-11   
4      902.04   300.68    Timothy Perez         Europe 2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker  Electronic

# Step 2: Feature Engineering

We will create features that capture customer behavior and product preferences.

In [15]:
# Feature 1: Total spending per customer
total_spending = data.groupby('CustomerID')['TotalValue'].sum().reset_index()
total_spending.columns = ['CustomerID', 'TotalSpending']

In [16]:
# Feature 2: Average transaction value per customer
avg_transaction_value = data.groupby('CustomerID')['TotalValue'].mean().reset_index()
avg_transaction_value.columns = ['CustomerID', 'AvgTransactionValue']


In [17]:
# Feature 3: Number of transactions per customer
num_transactions = data.groupby('CustomerID')['TransactionID'].count().reset_index()
num_transactions.columns = ['CustomerID', 'NumTransactions']


In [18]:
# Feature 4: Favorite product category (most purchased category)
favorite_category = data.groupby(['CustomerID', 'Category']).size().reset_index(name='Count')
favorite_category = favorite_category.loc[favorite_category.groupby('CustomerID')['Count'].idxmax()]
favorite_category = favorite_category[['CustomerID', 'Category']]
favorite_category.columns = ['CustomerID', 'FavoriteCategory']

In [19]:
# Merge all features into a single dataframe
customer_features = pd.merge(total_spending, avg_transaction_value, on='CustomerID')
customer_features = pd.merge(customer_features, num_transactions, on='CustomerID')
customer_features = pd.merge(customer_features, favorite_category, on='CustomerID')

In [21]:
# Display the first few rows of the customer features dataframe
print(customer_features.head())

  CustomerID  TotalSpending  AvgTransactionValue  NumTransactions  \
0      C0001        3354.52              670.904                5   
1      C0002        1862.74              465.685                4   
2      C0003        2725.38              681.345                4   
3      C0004        5354.88              669.360                8   
4      C0005        2034.24              678.080                3   

  FavoriteCategory  
0      Electronics  
1         Clothing  
2       Home Decor  
3            Books  
4      Electronics  


# Step 3: Similarity Calculation

We will use cosine similarity to calculate the similarity between customers.

In [28]:
print(customer_features.dtypes)

CustomerID              object
TotalSpending          float64
AvgTransactionValue    float64
NumTransactions          int64
FavoriteCategory        object
dtype: object


In [29]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical columns
categorical_columns = customer_features.select_dtypes(include=['object', 'category']).columns

# One-hot encode all categorical columns
encoder = OneHotEncoder(sparse=False, drop='first')  # drop='first' to avoid multicollinearity
encoded_features = encoder.fit_transform(customer_features[categorical_columns])

# Convert encoded features to a dataframe
encoded_features_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns))

# Drop original categorical columns and concatenate encoded columns
customer_features_encoded = customer_features.drop(categorical_columns, axis=1)
customer_features_encoded = pd.concat([customer_features_encoded, encoded_features_df], axis=1)

# Display the first few rows of the encoded dataframe
print(customer_features_encoded.head())

   TotalSpending  AvgTransactionValue  NumTransactions  CustomerID_C0002  \
0        3354.52              670.904                5               0.0   
1        1862.74              465.685                4               1.0   
2        2725.38              681.345                4               0.0   
3        5354.88              669.360                8               0.0   
4        2034.24              678.080                3               0.0   

   CustomerID_C0003  CustomerID_C0004  CustomerID_C0005  CustomerID_C0006  \
0               0.0               0.0               0.0               0.0   
1               0.0               0.0               0.0               0.0   
2               1.0               0.0               0.0               0.0   
3               0.0               1.0               0.0               0.0   
4               0.0               0.0               1.0               0.0   

   CustomerID_C0007  CustomerID_C0008  ...  CustomerID_C0194  \
0               

In [30]:
from sklearn.preprocessing import StandardScaler

# Normalize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features_encoded.iloc[:, 1:])  # Exclude 'CustomerID'

# Convert scaled features back to a dataframe (optional)
scaled_features_df = pd.DataFrame(scaled_features, columns=customer_features_encoded.columns[1:])

# Display the first few rows of the scaled features
print(scaled_features_df.head())

   AvgTransactionValue  NumTransactions  CustomerID_C0002  CustomerID_C0003  \
0            -0.070263        -0.011458         -0.071067         -0.071067   
1            -0.934933        -0.467494         14.071247         -0.071067   
2            -0.026271        -0.467494         -0.071067         14.071247   
3            -0.076769         1.356650         -0.071067         -0.071067   
4            -0.040028        -0.923530         -0.071067         -0.071067   

   CustomerID_C0004  CustomerID_C0005  CustomerID_C0006  CustomerID_C0007  \
0         -0.071067         -0.071067         -0.071067         -0.071067   
1         -0.071067         -0.071067         -0.071067         -0.071067   
2         -0.071067         -0.071067         -0.071067         -0.071067   
3         14.071247         -0.071067         -0.071067         -0.071067   
4         -0.071067         14.071247         -0.071067         -0.071067   

   CustomerID_C0008  CustomerID_C0009  ...  CustomerID_C0194  

In [32]:
# Calculate cosine similarity
similarity_matrix = cosine_similarity(scaled_features)

In [33]:
# Convert the similarity matrix to a dataframe
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])


In [34]:
# Display the first few rows of the similarity dataframe
print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.052239 -0.053536 -0.014362  0.125155 -0.017286   
C0002      -0.052239  1.000000 -0.012143 -0.009921 -0.011147 -0.013217   
C0003      -0.053536 -0.012143  1.000000 -0.010157 -0.011219 -0.006148   
C0004      -0.014362 -0.009921 -0.010157  1.000000 -0.013317 -0.004512   
C0005       0.125155 -0.011147 -0.011219 -0.013317  1.000000 -0.005321   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.123308 -0.051390 -0.050227 -0.051863  ... -0.013272  0.126515   
C0002      -0.014641 -0.013307  0.024039  0.020602  ... -0.005437 -0.009107   
C0003      -0.011303  0.011040 -0.010941 -0.012114  ... -0.006992 -0.012182   
C0004      -0.013590  0.008444 -0.012630 -0.009856  ... -0.000723 -0.010014   
C0005  

# Step 4: Model Development

We will implement the Lookalike Model to recommend similar customers.

In [49]:
def get_lookalikes(customer_id, similarity_df, top_n=3):
    # Get the similarity scores for the given customer
    
    similarities = similarity_df[customer_id]
    
    # Sort the similarities in descending order
    sorted_similarities = similarities.sort_values(ascending=False)
    
    # Exclude the customer itself and get the top N similar customers
    lookalikes = sorted_similarities.iloc[1:top_n+1]
    
    return lookalikes

# Example: Get lookalikes for customer C0001
lookalikes = get_lookalikes('C0001', similarity_df)
print(lookalikes)

CustomerID
C0119    0.128056
C0157    0.127366
C0199    0.126686
Name: C0001, dtype: float64


# Step 5: Model Evaluation

We will generate recommendations and similarity scores for the first 20 customers.

In [39]:
# Initialize a dictionary to store the results
lookalike_map = {}

In [41]:
# Get the first 20 customers
first_20_customers = customer_features['CustomerID'].head(20)
first_20_customers

0     C0001
1     C0002
2     C0003
3     C0004
4     C0005
5     C0006
6     C0007
7     C0008
8     C0009
9     C0010
10    C0011
11    C0012
12    C0013
13    C0014
14    C0015
15    C0016
16    C0017
17    C0018
18    C0019
19    C0020
Name: CustomerID, dtype: object

In [42]:
# Generate lookalikes for each customer
for customer_id in first_20_customers:
    lookalikes = get_lookalikes(customer_id, similarity_df)
    lookalike_map[customer_id] = lookalikes.to_dict()

In [43]:
# Display the lookalike map
print(lookalike_map)

{'C0001': {'C0119': 0.12805640750097697, 'C0157': 0.12736560475450195, 'C0199': 0.12668552955867923}, 'C0002': {'C0033': 0.0289363465792003, 'C0083': 0.025581969563973513, 'C0077': 0.024401428237390957}, 'C0003': {'C0014': 0.02041374940399066, 'C0151': 0.02039641208093111, 'C0097': 0.020367529582731914}, 'C0004': {'C0156': 0.01729541332433302, 'C0175': 0.014461937927148684, 'C0147': 0.011876157668777908}, 'C0005': {'C0001': 0.12515511413781072, 'C0130': 0.022820407571882955, 'C0020': 0.022510349487025245}, 'C0006': {'C0168': 0.018784394811009353, 'C0185': 0.015972986294934884, 'C0040': 0.015231004377908522}, 'C0007': {'C0001': 0.12330751361354267, 'C0066': 0.02702701366113619, 'C0020': 0.02616465025110942}, 'C0008': {'C0065': 0.038710855764238064, 'C0113': 0.03271922665220874, 'C0172': 0.027968912848337475}, 'C0009': {'C0033': 0.040626581851552816, 'C0083': 0.034230945393242285, 'C0058': 0.032227905340962745}, 'C0010': {'C0033': 0.030633111678387497, 'C0083': 0.02704253678503426, 'C007

# Final Output

In [44]:
# Convert the lookalike map to a dataframe
lookalike_df = pd.DataFrame.from_dict(lookalike_map, orient='index')

In [45]:
# Save the dataframe to a CSV file
lookalike_df.to_csv('Lookalike.csv')

In [46]:
# Display the first few rows of the lookalike dataframe
print(lookalike_df.head())

          C0119     C0157     C0199     C0033     C0083     C0077  C0014  \
C0001  0.128056  0.127366  0.126686       NaN       NaN       NaN    NaN   
C0002       NaN       NaN       NaN  0.028936  0.025582  0.024401    NaN   
C0009       NaN       NaN       NaN  0.040627  0.034231       NaN    NaN   
C0010       NaN       NaN       NaN  0.030633  0.027043  0.025663    NaN   
C0015       NaN       NaN       NaN  0.010415       NaN       NaN    NaN   

       C0151     C0097  C0156  ...  C0059  C0141  C0109  C0099     C0060  \
C0001    NaN       NaN    NaN  ...    NaN    NaN    NaN    NaN       NaN   
C0002    NaN       NaN    NaN  ...    NaN    NaN    NaN    NaN       NaN   
C0009    NaN       NaN    NaN  ...    NaN    NaN    NaN    NaN       NaN   
C0010    NaN       NaN    NaN  ...    NaN    NaN    NaN    NaN       NaN   
C0015    NaN  0.010483    NaN  ...    NaN    NaN    NaN    NaN  0.010969   

       C0200  C0082  C0121  C0161  C0140  
C0001    NaN    NaN    NaN    NaN    NaN  
