In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.neighbors import NearestNeighbors

In [2]:
# Read the three source tables (customers, products, transactions) from CSV.
# The paths are listed in the same order as the names they are unpacked into.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Customers.csv',
        '/content/Products.csv',
        '/content/Transactions.csv',
    )
)

In [3]:
# Size of the customers table as a (rows, columns) tuple.
customers.shape

(200, 4)

In [4]:
# Size of the products table as a (rows, columns) tuple.
products.shape

(100, 4)

In [5]:
# Size of the transactions table as a (rows, columns) tuple.
transactions.shape

(1000, 7)

In [6]:
# Preview the first rows of the customers table (ID, name, region, signup date).
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [7]:
# Preview the first rows of the products table (ID, name, category, price).
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [8]:
# Preview the first rows of the transactions table; each row links a
# CustomerID and a ProductID with a date, quantity, total value and unit price.
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [9]:
# Count missing values per column in the customers table.
customers.isnull().sum()

Unnamed: 0,0
CustomerID,0
CustomerName,0
Region,0
SignupDate,0


In [10]:
# Count missing values per column in the products table.
products.isnull().sum()

Unnamed: 0,0
ProductID,0
ProductName,0
Category,0
Price,0


In [11]:
# Count missing values per column in the transactions table.
transactions.isnull().sum()

Unnamed: 0,0
TransactionID,0
CustomerID,0
ProductID,0
TransactionDate,0
Quantity,0
TotalValue,0
Price,0


In [17]:
# Task 2: Lookalike Model
# Prepare customer profiles: one row per CustomerID holding the customer's
# total spend (TotalValue) and total units purchased (Quantity).
#
# The aggregation reads `transactions` directly. That table already carries
# CustomerID, TotalValue and Quantity, whereas the `data` frame used here
# before is not defined by any cell in this notebook, so the cell failed with
# a NameError on a fresh kernel (Restart & Run All).
# NOTE(review): if `data` was meant to be a filtered/merged view of the
# transactions, re-apply that filter here explicitly.
customer_profiles = transactions.groupby('CustomerID').agg({'TotalValue': 'sum', 'Quantity': 'sum'})

# Standardise both features so neither dominates the Euclidean distances
# computed by the nearest-neighbour search downstream.
# Row i of the scaled array corresponds to customer_profiles.index[i].
scaler = StandardScaler()
customer_profiles_scaled = scaler.fit_transform(customer_profiles)

In [18]:
# Fit a nearest-neighbour index over the standardised customer profiles.
# Four neighbours are requested per query; the lookup code that consumes
# this model keeps three of them as lookalikes (the query customer itself
# is expected to be among the four).
neighbor_settings = {'n_neighbors': 4, 'metric': 'euclidean'}
nn = NearestNeighbors(**neighbor_settings)
nn.fit(customer_profiles_scaled)

In [19]:
# Build the lookalike table: for each of the first 20 profiled customers,
# record the three nearest *other* customers together with their distances.
lookalike_results = {}

# Row i of customer_profiles_scaled belongs to customer_profiles.index[i]
# (the groupby output order), not to row i of `customers`. Labelling through
# `customers.iloc` attaches the wrong IDs whenever the two orderings differ,
# e.g. when some customer has no transactions and is absent from the profiles.
profile_ids = customer_profiles.index

# The fitted model must return at least 4 neighbours (1 self + 3 lookalikes).
# min() guards against having fewer than 20 profiled customers.
for i in range(min(20, len(profile_ids))):
    distances, indices = nn.kneighbors([customer_profiles_scaled[i]])
    # Pair every neighbour with its OWN distance (the previous code paired
    # neighbour k with the distance of neighbour k-1, so the first lookalike
    # always showed the self-distance 0). The query customer is removed by
    # position instead of assuming it sits in slot 0, because ties at
    # distance 0 can put an identical profile first.
    # float() keeps the CSV free of `np.float64(...)` reprs on NumPy >= 2.
    matches = [
        (profile_ids[idx], float(dist))
        for idx, dist in zip(indices[0], distances[0])
        if idx != i
    ]
    lookalike_results[profile_ids[i]] = matches[:3]

# Convert lookalike results into a DataFrame: one row per customer, each cell
# a (lookalike CustomerID, distance) tuple ordered from nearest to farthest.
lookalike_df = pd.DataFrame.from_dict(
    lookalike_results,
    orient='index',
    columns=['Lookalike1', 'Lookalike2', 'Lookalike3']
)

# Save to CSV
lookalike_df.to_csv('Pavankumar_Gogula_Lookalike.csv', index_label='CustomerID')