**Task 2: Lookalike Model**

**Step 1: Setting Up Google Colab**

**Connect to Google Drive**

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Navigate to Your Dataset Folder**

In [8]:
import os
os.listdir("/content/drive/My Drive/Zeotap/")

['Customers.csv',
 'Products.csv',
 'Transactions.csv',
 'EDA_Report.txt',
 'Merged file.csv']

**Step 2: Install Required Libraries**

In [9]:
!pip install pandas numpy matplotlib seaborn scikit-learn



**Step 3: Load the Data**

In [11]:
import pandas as pd

# Define the path to your dataset folder
dataset_folder = "/content/drive/My Drive/Zeotap/"

# Load the datasets using the full path
customers = pd.read_csv(dataset_folder + "Customers.csv")
products = pd.read_csv(dataset_folder + "Products.csv")
transactions = pd.read_csv(dataset_folder + "Transactions.csv")

# Display first few rows
print(customers.head())
print(products.head())
print(transactions.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067   2024-04-25 7:38:55         1   
3       

**Step 4: Data Preprocessing**

In [12]:
# Merge transactions with products to get product details
transactions = transactions.merge(products, on="ProductID", how="left")

# Merge transactions with customers to get customer details
data = transactions.merge(customers, on="CustomerID", how="left")

# Display first few rows
print(data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067   2024-04-25 7:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  202

**Step 5: Feature Engineering**

In [13]:
# Aggregating transaction history per customer
customer_profile = data.groupby("CustomerID").agg({
    "TotalValue": "sum",  # Total spending
    "Quantity": "sum",    # Total products purchased
    "Category": lambda x: list(x),  # List of categories purchased
    "Region": "first"  # Customer location
}).reset_index()

print(customer_profile.head())


  CustomerID  TotalValue  Quantity  \
0      C0001     3354.52        12   
1      C0002     1862.74        10   
2      C0003     2725.38        14   
3      C0004     5354.88        23   
4      C0005     2034.24         7   

                                            Category         Region  
0  [Books, Home Decor, Electronics, Electronics, ...  South America  
1       [Home Decor, Home Decor, Clothing, Clothing]           Asia  
2    [Home Decor, Home Decor, Clothing, Electronics]  South America  
3  [Books, Home Decor, Home Decor, Home Decor, Bo...  South America  
4             [Home Decor, Electronics, Electronics]           Asia  


**Step 6: Encode Categorical Variables**

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

# Encoding region
le = LabelEncoder()
customer_profile["Region"] = le.fit_transform(customer_profile["Region"])

# Encoding category using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
category_encoded = pd.DataFrame(mlb.fit_transform(customer_profile["Category"]),
                                columns=mlb.classes_)

# Concatenating encoded category data
customer_profile = pd.concat([customer_profile.drop("Category", axis=1), category_encoded], axis=1)

print(customer_profile.head())


  CustomerID  TotalValue  Quantity  Region  Books  Clothing  Electronics  \
0      C0001     3354.52        12       3      1         0            1   
1      C0002     1862.74        10       0      0         1            0   
2      C0003     2725.38        14       3      0         1            1   
3      C0004     5354.88        23       3      1         0            1   
4      C0005     2034.24         7       0      0         0            1   

   Home Decor  
0           1  
1           1  
2           1  
3           1  
4           1  


**Step 7: Compute Customer Similarity**

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrix
similarity_matrix = cosine_similarity(customer_profile.drop(columns=["CustomerID"]))

# Convert similarity matrix into a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profile["CustomerID"], columns=customer_profile["CustomerID"])

print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.999998  0.999999  1.000000  1.000000  1.000000   
C0002       0.999998  1.000000  0.999999  0.999999  0.999998  0.999996   
C0003       0.999999  0.999999  1.000000  0.999999  0.999998  0.999997   
C0004       1.000000  0.999999  0.999999  1.000000  0.999999  0.999999   
C0005       1.000000  0.999998  0.999998  0.999999  1.000000  0.999999   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.999999  0.999999  0.999999  0.999994  ...  1.000000  1.000000   
C0002       0.999997  1.000000  0.999996  0.999998  ...  0.999998  0.999997   
C0003       0.999997  1.000000  0.999998  0.999998  ...  0.999999  0.999999   
C0004       0.999999  1.000000  0.999998  0.999996  ...  1.000000  0.999999   
C0005  

**Step 8: Recommend Top 3 Lookalikes**

In [20]:
lookalikes = {}

for customer in customer_profile["CustomerID"]:
    similar_customers = similarity_df[customer].nlargest(4).iloc[1:]  # Top 3 excluding self
    formatted_lookalikes = ", ".join([f"{cust}:{score:.4f}" for cust, score in zip(similar_customers.index, similar_customers.values)])

    lookalikes[customer] = formatted_lookalikes

# Convert to DataFrame
lookalike_df = pd.DataFrame(lookalikes.items(), columns=["CustomerID", "Lookalikes"])

# Save as CSV
lookalike_df.to_csv("Lookalike_Corrected.csv", index=False)

# Display first few rows
print(lookalike_df.head())


  CustomerID                                Lookalikes
0      C0001  C0191:1.0000, C0035:1.0000, C0012:1.0000
1      C0002  C0134:1.0000, C0062:1.0000, C0109:1.0000
2      C0003  C0031:1.0000, C0147:1.0000, C0190:1.0000
3      C0004  C0169:1.0000, C0039:1.0000, C0165:1.0000
4      C0005  C0007:1.0000, C0069:1.0000, C0177:1.0000


**Step 9: Save the Lookalike Recommendations**

In [21]:
# Save as CSV
lookalike_df.to_csv("Lookalike.csv", index=False)

**Step 10: Download Lookalike.csv File**

In [23]:
from google.colab import files
files.download("Lookalike.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>