<a href="https://colab.research.google.com/github/Gk787/DS-Project/blob/main/Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the three input tables from their absolute paths under /content.
# The generator is unpacked in order, so the files are read in the same
# sequence as the names on the left-hand side.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Customers.csv',
        '/content/Products.csv',
        '/content/Transactions.csv',
    )
)

In [None]:
# Attach customer and product attributes to every transaction row.
# Both joins are left joins, so all transactions are kept; rows whose
# CustomerID or ProductID has no match get NaN in the joined columns.
data = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)

In [None]:
# Collapse the transaction-level table into one row per customer:
#   Region     - first non-null region seen for the customer
#   Category   - every purchased category joined into one space-separated string
#   TotalValue - summed over the customer's transactions
#   Quantity   - summed over the customer's transactions
customer_profiles = data.groupby('CustomerID').agg({
    'Region': 'first',
    # The left merge on ProductID leaves Category as NaN for unmatched
    # products; str.join() raises TypeError on NaN, so drop missing labels
    # (and coerce the rest to str) before joining.
    'Category': lambda x: ' '.join(x.dropna().astype(str)),
    'TotalValue': 'sum',
    'Quantity': 'sum'
}).reset_index()

In [None]:
# Rescale the two numeric profile columns with MinMaxScaler so they can be
# combined with the TF-IDF weights. The scaled values overwrite the
# original columns in customer_profiles.
numeric_columns = ['TotalValue', 'Quantity']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_profiles[numeric_columns])
customer_profiles[numeric_columns] = scaled_values

In [None]:
# Build one text document per customer: the region followed by the
# space-separated purchase categories.
# A customer that has no match in the customers table ends up with a NaN
# Region after the left merge; NaN + str is NaN, which TfidfVectorizer
# rejects as an invalid document. Treat missing parts as empty text instead.
customer_profiles['Features'] = (
    customer_profiles['Region'].fillna('')
    + ' '
    + customer_profiles['Category'].fillna('')
)

In [None]:
# Turn each customer's 'Features' text into a TF-IDF weighted term vector.
# English stop words are removed by the vectorizer.
feature_documents = customer_profiles['Features']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(feature_documents)

In [None]:
# Put the dense TF-IDF weights and the scaled numeric columns side by side,
# one row per customer, indexed by CustomerID.
# Both frames use default integer column labels, so labels 0 and 1 occur
# twice in the combined frame.
profile_ids = customer_profiles['CustomerID']
numerical_features = customer_profiles[['TotalValue', 'Quantity']].values

text_frame = pd.DataFrame(tfidf_matrix.toarray(), index=profile_ids)
numeric_frame = pd.DataFrame(numerical_features, index=profile_ids)

combined_features = pd.concat([text_frame, numeric_frame], axis=1)

In [None]:
# Pairwise cosine similarity between all customer feature rows.
# With Y=None the rows of X are compared against each other, so entry
# [i, j] is the similarity between customer row i and customer row j.
similarity_matrix = cosine_similarity(X=combined_features, Y=None)

In [None]:
# Map every customer ID to its three most similar *other* customers as a
# list of (customer_id, similarity) tuples, best match first.
lookalike_map = {}

# Materialise the IDs once instead of calling .iloc for every candidate
# inside the loop.
customer_ids = customer_profiles['CustomerID'].tolist()

for idx, customer_id in enumerate(customer_ids):
    # Pair each candidate ID with its similarity to the current customer,
    # skipping the customer itself. sorted() is stable, so candidates with
    # equal scores keep their original row order.
    ranked_candidates = sorted(
        (
            (other_id, score)
            for other_id, score in zip(customer_ids, similarity_matrix[idx])
            if other_id != customer_id
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Convert the NumPy scalar to a built-in float before rounding: with
    # NumPy >= 2 the repr of a NumPy float is "np.float64(...)", which
    # would otherwise leak into any string built from these tuples.
    top_3 = [
        (other_id, round(float(score), 4))
        for other_id, score in ranked_candidates[:3]
    ]
    lookalike_map[customer_id] = top_3

In [None]:
# Collect the lookalike lists for the first 20 customers as one record per
# customer, with the lookalike list stored as its string representation.
lookalike_results = []

# IDs are assumed to be 'C' followed by a zero-padded 4-digit number
# (C0001 ... C0020), as the original 'C000' + digit prefix implies for
# 1-9 -- TODO confirm against Customers.csv.
# Padding to width 4 keeps 10-20 at five characters; the previous
# 'C000' + str(i) form produced 'C00010' ... 'C00020' for those.
# Built once as a set rather than rebuilt on every loop iteration.
target_ids = {'C' + str(i).zfill(4) for i in range(1, 21)}

for customer_id in customer_profiles['CustomerID']:
    if customer_id in target_ids:
        lookalike_results.append({
            'CustomerID': customer_id,
            'Lookalikes': str(lookalike_map[customer_id])
        })

In [None]:
import os

# Destination for the exported lookalike table.
output_dir = '/mnt/data'
output_path = '/mnt/data/Lookalike.csv'

# Create the target directory if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

# One row per selected customer; the index is not written to the file.
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv(output_path, index=False)

print("Lookalike.csv has been created successfully!")

Lookalike.csv has been created successfully!
