In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive so files
# stored in Drive can be opened with ordinary paths.
# Colab-only: google.colab is not importable outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Task 2: Lookalike Model Using Folder Path

# Importing necessary libraries
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import files
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import os

# Step 1: Directory on the mounted Drive that holds the input CSV files
folder_path = "/content/drive/MyDrive/ZeotapIntern"

# Step 2: Read the three source tables
customers_df = pd.read_csv(os.path.join(folder_path, 'Customers.csv'))
products_df = pd.read_csv(os.path.join(folder_path, 'Products.csv'))
transactions_df = pd.read_csv(os.path.join(folder_path, 'Transactions.csv'))

# Step 3: Build one feature row per customer
# Attach customer attributes to every transaction (inner join on CustomerID,
# so only transactions with a matching customer row are kept) ...
merged_df = transactions_df.merge(customers_df, on='CustomerID', how='inner')

# ... then attach product attributes (inner join on ProductID).
final_df = merged_df.merge(products_df, on='ProductID', how='inner')

# Per customer: summed TotalValue and Quantity, plus the most frequent Region
# and Category (mode()[0] takes the first mode when several values tie).
# The two categorical columns are then expanded into one-hot indicator columns.
features_df = (
    final_df
    .groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'Region': lambda values: values.mode()[0],
        'Category': lambda values: values.mode()[0],
    })
    .reset_index()
    .pipe(pd.get_dummies, columns=['Region', 'Category'])
)

# Step 4: Standardise every feature column
scaler = StandardScaler()
# Column 0 is CustomerID (placed first by reset_index); it is an identifier,
# not a feature, so it is left out of the scaled matrix.
feature_columns = features_df.columns[1:]
features_scaled = scaler.fit_transform(features_df[feature_columns])

# Step 5: Pairwise customer similarity
# Row i / column j holds the cosine similarity between the scaled feature
# vectors of the customers in rows i and j of features_df.
similarity_matrix = cosine_similarity(features_scaled)

# Step 6: Defining a function to find top N similar customers
def find_lookalikes(customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Reads the module-level ``features_df`` (must contain a ``CustomerID``
    column) and ``similarity_matrix`` (row/column ``i`` corresponds to row
    ``i`` of ``features_df``).

    Parameters
    ----------
    customer_id :
        Value to look up in ``features_df['CustomerID']``.
    top_n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 yield ``[]``.

    Returns
    -------
    list of (CustomerID, similarity score) tuples, ordered from most to
    least similar. The queried customer is never part of the result.

    Raises
    ------
    IndexError
        If ``customer_id`` does not occur in ``features_df['CustomerID']``.
    """
    customer_ids = features_df['CustomerID'].to_numpy()

    # Positional lookup: similarity_matrix is addressed by row position, so
    # we must not rely on features_df's index labels matching positions.
    matches = np.flatnonzero(customer_ids == customer_id)
    if matches.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in features_df")
    customer_index = matches[0]

    similarity_scores = np.asarray(similarity_matrix[customer_index])

    # Stable descending sort: ties keep their features_df order, which makes
    # the output deterministic.
    ranked = np.argsort(-similarity_scores, kind='stable')

    # Drop the customer itself explicitly. Simply skipping the first sorted
    # entry is wrong when another customer has an identical (or, through
    # floating-point rounding, marginally higher) score: the customer would
    # then be reported as its own lookalike and a real match would be lost.
    ranked = ranked[ranked != customer_index][:max(top_n, 0)]

    return [(customer_ids[i], similarity_scores[i]) for i in ranked]

# Step 7: Build Lookalike.csv for the first 20 customers in features_df
# Map each of those customers to its list of (similar customer, score) pairs.
lookalike_results = {
    customer_id: find_lookalikes(customer_id)
    for customer_id in features_df['CustomerID'][:20]
}

# Flatten the mapping into one [customer, similar customer, score] row per pair.
lookalike_data = [
    [customer_id, similar_customer, score]
    for customer_id, lookalikes in lookalike_results.items()
    for similar_customer, score in lookalikes
]

# Tabulate the rows and write them to 'Lookalike.csv' in the working directory.
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'],
)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Confirm the write and show the first rows.
print("Lookalike model results saved as Lookalike.csv")
print(lookalike_df.head())
# Hand the file to the browser for download (Colab helper).
files.download('Lookalike.csv')

Lookalike model results saved as Lookalike.csv
  CustomerID SimilarCustomerID  SimilarityScore
0      C0001             C0184         0.998284
1      C0001             C0048         0.995289
2      C0001             C0190         0.990616
3      C0002             C0088         0.998952
4      C0002             C0092         0.980115


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>