# Import Necessary Libraries
Importing essential libraries for data manipulation, feature extraction, similarity calculation, and scaling. These tools help preprocess the dataset and calculate similarity scores efficiently

In [4]:
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

#Load and Merge Dataset

This function loads data from the three provided CSV files—customers, products, and transactions—and merges them into a single cohesive dataset. By dropping redundant columns and joining datasets based on common keys (CustomerID and ProductID), it ensures a clean and unified data structure for further analysis

In [13]:
def load_and_merge_data(customers_path, products_path, transactions_path):
    customers = pd.read_csv('/content/Customers.csv')
    products = pd.read_csv('/content/Products.csv')
    transactions = pd.read_csv('/content/Transactions.csv')
    products.drop(columns=['Price'], inplace=True)
    customer_transactions = pd.merge(transactions, customers, on='CustomerID', how='left')
    customer_transactions = pd.merge(customer_transactions, products, on='ProductID', how='left')
    customer_transactions.to_csv('customer_transactions.csv',index=False)
    return customer_transactions, customers, products

# Feature Engineering

This function generates aggregated features for each customer from the merged transaction dataset. These features provide valuable insights into customer behavior and preferences, which can later be used for similarity analysis and lookalike recommendations

In [14]:
def create_customer_features(customer_transactions):
    customer_features = customer_transactions.groupby('CustomerID').agg({
        'Price': 'sum',
        'ProductID': 'nunique',
        'Region': 'first',
        'SignupDate': 'first'
    }).reset_index()
    scaler = StandardScaler()
    customer_features[['Price_y', 'ProductID']] = scaler.fit_transform(customer_features[['Price', 'ProductID']])

    return customer_features


# Calculate product Similarities

This function computes similarity scores between products based on their names, leveraging text-based techniques to capture semantic similarities. These scores help identify related or similar products, which can further enhance customer segmentation and recommendation models.

In [15]:
def get_product_similarity(customer_transactions):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(customer_transactions['ProductName'].fillna(''))
    product_similarity = cosine_similarity(tfidf_matrix)

    return product_similarity


# Calculate Customer Similarity

This function computes a similarity score between two customers by combining their profile-based similarity and product purchase behavior similarity. It provides a holistic view of customer similarity, essential for building accurate lookalike models.

In [16]:
def calculate_similarity(customer_id1, customer_id2, customer_features, product_similarity):
    idx1 = customer_features[customer_features['CustomerID'] == customer_id1].index[0]
    idx2 = customer_features[customer_features['CustomerID'] == customer_id2].index[0]
    profile_similarity = cosine_similarity(customer_features[['Price', 'ProductID']].iloc[idx1:idx1+1],
                                           customer_features[['Price', 'ProductID']].iloc[idx2:idx2+1])[0][0]
    product_similarity_score = product_similarity[idx1, idx2]
    combined_similarity = 0.5 * profile_similarity + 0.5 * product_similarity_score

    return combined_similarity


# Generate lookalikes for Customers

This function identifies the top 3 lookalike customers for each of the first 20 customers based on their similarity scores. It leverages the combined profile and product similarity to deliver precise recommendations.

In [17]:
def generate_lookalikes(customer_features, product_similarity):
    lookalikes = defaultdict(list)

    for customer_id in customer_features['CustomerID'][:20]:
        similarities = []

        for other_customer_id in customer_features['CustomerID']:
            if customer_id != other_customer_id:
                similarity_score = calculate_similarity(customer_id, other_customer_id, customer_features, product_similarity)
                similarities.append((other_customer_id, similarity_score))
        top_3_lookalikes = sorted(similarities, key=lambda x: x[1], reverse=True)[:3]
        lookalikes[customer_id] = top_3_lookalikes
        print(top_3_lookalikes)
    return lookalikes


# Saving Lookalikes to csv
This function exports the lookalike recommendations into a CSV file, which can be easily used for further analysis or presentation. Each customer's top 3 lookalikes, along with their similarity scores, are saved in a structured format

In [18]:
def save_lookalikes_to_csv(lookalikes, output_path):
    lookalike_data = []
    for cust_id, recommendations in lookalikes.items():
        row = [cust_id]
        for recommendation in recommendations:
            row.append(recommendation[0])
            row.append(recommendation[1])
        lookalike_data.append(row)
    header = ['CustomerID', 'Lookalike1_ID', 'Score1', 'Lookalike2_ID', 'Score2', 'Lookalike3_ID', 'Score3']
    with open(output_path, mode='w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(lookalike_data)

    print(f"Lookalikes saved to {output_path}")


# Main Function - Bringing Everything together
The main function orchestrates the entire lookalike generation process. It integrates data loading, feature creation, similarity calculation, and CSV saving in a streamlined workflow.

In [19]:
def main():
    customers_path = 'Customers.csv'
    products_path = 'Products.csv'
    transactions_path = 'Transactions.csv'
    customer_transactions, customers, products = load_and_merge_data(customers_path, products_path, transactions_path)
    customer_features = create_customer_features(customer_transactions)
    product_similarity = get_product_similarity(customer_transactions)
    lookalikes = generate_lookalikes(customer_features, product_similarity)
    output_path = 'Mithun_V_Lookalike.csv'
    save_lookalikes_to_csv(lookalikes, output_path)
main()


[('C0011', 0.9999999999992286), ('C0012', 0.9999999855247398), ('C0006', 0.9999999684300638)]
[('C0003', 0.9999999997063942), ('C0010', 0.9999999915076985), ('C0006', 0.9999999911434667)]
[('C0002', 0.9999999997063942), ('C0010', 0.9999999943721862), ('C0006', 0.9999999876247541)]
[('C0008', 0.9999999800623312), ('C0013', 0.9999999743404847), ('C0012', 0.9999999403501596)]
[('C0009', 0.9999999998456444), ('C0007', 0.999999992654997), ('C0010', 0.9999999719261146)]
[('C0002', 0.9999999911434667), ('C0003', 0.9999999876247541), ('C0011', 0.9999999687414058)]
[('C0009', 0.9999999946301884), ('C0010', 0.9999999933006363), ('C0005', 0.999999992654997)]
[('C0004', 0.9999999800623312), ('C0013', 0.9999999091660667), ('C0012', 0.9999998514406535)]
[('C0005', 0.9999999998456444), ('C0007', 0.9999999946301884), ('C0010', 0.9999999759351053)]
[('C0003', 0.9999999943721862), ('C0007', 0.9999999933006363), ('C0002', 0.9999999915076985)]
[('C0001', 0.9999999999992286), ('C0012', 0.9999999853126249),