In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load datasets
customers = pd.read_csv("/content/drive/MyDrive/Zeotapfiles/Customers.csv")
transactions = pd.read_csv("/content/drive/MyDrive/Zeotapfiles/Transactions.csv")
products = pd.read_csv("/content/drive/MyDrive/Zeotapfiles/Products.csv")

In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [5]:
# Merge Customers and Transactions on CustomerID
merged_data = pd.merge(transactions, customers, on="CustomerID")


In [6]:
# Create customer profiles
customer_profiles = merged_data.groupby("CustomerID").agg(
    TotalSpend=("TotalValue", "sum"),
    AvgTransactionValue=("TotalValue", "mean"),
    FavoriteCategory=("ProductID", lambda x: x.mode()[0]),  # Most frequent category
    Region=("Region", "first"),  # Region is the same for each customer
    SignupDate=("SignupDate", "first")  # SignupDate is the same for each customer
).reset_index()

In [7]:
print(customer_profiles.columns)

Index(['CustomerID', 'TotalSpend', 'AvgTransactionValue', 'FavoriteCategory',
       'Region', 'SignupDate'],
      dtype='object')


In [8]:
#Display customer profiles
print(customer_profiles.head())


  CustomerID  TotalSpend  AvgTransactionValue FavoriteCategory         Region  \
0      C0001     3354.52              670.904             P022  South America   
1      C0002     1862.74              465.685             P004           Asia   
2      C0003     2725.38              681.345             P002  South America   
3      C0004     5354.88              669.360             P008  South America   
4      C0005     2034.24              678.080             P012           Asia   

   SignupDate  
0  2022-07-10  
1  2022-02-13  
2  2024-03-07  
3  2022-10-09  
4  2022-08-15  


**Step 2: Feature Engineering**

In [9]:
encoder = OneHotEncoder(sparse_output=False, drop="first")  # Use sparse_output
encoded_features = encoder.fit_transform(customer_profiles[["Region", "FavoriteCategory"]])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(["Region", "FavoriteCategory"]))


In [10]:
print(encoded_df)

     Region_Europe  Region_North America  Region_South America  \
0              0.0                   0.0                   1.0   
1              0.0                   0.0                   0.0   
2              0.0                   0.0                   1.0   
3              0.0                   0.0                   1.0   
4              0.0                   0.0                   0.0   
..             ...                   ...                   ...   
194            1.0                   0.0                   0.0   
195            1.0                   0.0                   0.0   
196            1.0                   0.0                   0.0   
197            1.0                   0.0                   0.0   
198            0.0                   0.0                   0.0   

     FavoriteCategory_P002  FavoriteCategory_P003  FavoriteCategory_P004  \
0                      0.0                    0.0                    0.0   
1                      0.0                    0.0      

In [11]:
# Normalize numerical features (TotalSpend, AvgTransactionValue)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profiles[["TotalSpend", "AvgTransactionValue"]])
scaled_df = pd.DataFrame(scaled_features, columns=["TotalSpend", "AvgTransactionValue"])


In [12]:
# Combine encoded and scaled features
features = pd.concat([customer_profiles["CustomerID"], encoded_df, scaled_df], axis=1)


In [13]:
# Display engineered features
print(features.head())

  CustomerID  Region_Europe  Region_North America  Region_South America  \
0      C0001            0.0                   0.0                   1.0   
1      C0002            0.0                   0.0                   0.0   
2      C0003            0.0                   0.0                   1.0   
3      C0004            0.0                   0.0                   1.0   
4      C0005            0.0                   0.0                   0.0   

   FavoriteCategory_P002  FavoriteCategory_P003  FavoriteCategory_P004  \
0                    0.0                    0.0                    0.0   
1                    0.0                    0.0                    1.0   
2                    1.0                    0.0                    0.0   
3                    0.0                    0.0                    0.0   
4                    0.0                    0.0                    0.0   

   FavoriteCategory_P005  FavoriteCategory_P006  FavoriteCategory_P007  ...  \
0                    0.0 

**Step 3: Build Similarity Model**

In [14]:
# Compute cosine similarity between customers
similarity_matrix = cosine_similarity(features.drop("CustomerID", axis=1))

In [15]:
# Convert similarity matrix to a DataFrame for easier lookup
similarity_df = pd.DataFrame(similarity_matrix, index=features["CustomerID"], columns=features["CustomerID"])

In [16]:
# Display similarity matrix
print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.052000  0.492369  0.379079  0.028406  0.284740   
C0002       0.052000  1.000000  0.159130 -0.292791  0.350937 -0.522098   
C0003       0.492369  0.159130  1.000000  0.226020  0.170637  0.250571   
C0004       0.379079 -0.292791  0.226020  1.000000 -0.361893  0.348686   
C0005       0.028406  0.350937  0.170637 -0.361893  1.000000 -0.142503   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001      -0.011197  0.019120  0.055186  0.047216  ...  0.670136  0.440304   
C0002      -0.117093  0.547018  0.658995  0.564052  ...  0.320620  0.467576   
C0003       0.091077 -0.055206  0.161619  0.139974  ...  0.690074  0.513417   
C0004      -0.239349  0.166690 -0.293212 -0.254958  ...  0.397026  0.086668   
C0005  

**Step 4: Generate Recommendations**

In [17]:
# Function to get top 3 similar customers
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    # Exclude the customer itself
    similar_customers = similarity_df[customer_id].drop(customer_id)
    # Sort by similarity score and get top N
    top_similar = similar_customers.sort_values(ascending=False).head(top_n).index.tolist()
    return top_similar

In [18]:
# Generate recommendations for the first 20 customers
lookalike_dict = {}
for customer_id in features["CustomerID"].head(20):
    lookalike_dict[customer_id] = get_top_similar_customers(customer_id, similarity_df)

In [19]:
# Convert dictionary to DataFrame for CSV export
lookalike_df = pd.DataFrame(lookalike_dict.items(), columns=["CustomerID", "Top3Lookalikes"])
lookalike_df["Top3Lookalikes"] = lookalike_df["Top3Lookalikes"].apply(lambda x: ", ".join(x))

In [22]:
# Save to CSV
lookalike_df.to_csv("Jatin_Gahlot_Lookalike.csv", index=False)


In [21]:
# Display recommendations
print(lookalike_df)

   CustomerID       Top3Lookalikes
0       C0001  C0025, C0191, C0137
1       C0002  C0030, C0097, C0060
2       C0003  C0181, C0031, C0191
3       C0004  C0175, C0022, C0165
4       C0005  C0123, C0130, C0150
5       C0006  C0040, C0168, C0096
6       C0007  C0112, C0080, C0078
7       C0008  C0030, C0024, C0139
8       C0009  C0019, C0060, C0121
9       C0010  C0019, C0060, C0121
10      C0011  C0171, C0191, C0048
11      C0012  C0104, C0099, C0165
12      C0013  C0101, C0082, C0188
13      C0014  C0019, C0060, C0009
14      C0015  C0036, C0058, C0144
15      C0016  C0046, C0018, C0026
16      C0017  C0075, C0105, C0124
17      C0018  C0026, C0079, C0117
18      C0019  C0060, C0119, C0121
19      C0020  C0080, C0078, C0110
