In [1]:
!pip install pandas numpy scikit-learn matplotlib seaborn

Collecting numpy
  Downloading numpy-1.24.4-cp39-cp39-win_amd64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp39-cp39-win_amd64.whl (14.9 MB)
   ---------------------------------------- 14.9/14.9 MB 3.4 MB/s eta 0:00:00
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-1.24.4


  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.6.0 requires daal==2021.4.0, which is not installed.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.24.4 which is incompatible.


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
customers = pd.read_csv(r"C:\Users\nanin\OneDrive\Desktop\Customers (1).csv")
products = pd.read_csv(r"C:\Users\nanin\OneDrive\Desktop\Products.csv")
transactions = pd.read_csv(r"C:\Users\nanin\OneDrive\Desktop\Transactions.csv")

In [4]:
print("Customers Dataset:\n", customers.head())
print("\nProducts Dataset:\n", products.head())
print("\nTransactions Dataset:\n", transactions.head())

Customers Dataset:
   CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Dataset:
   ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Dataset:
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166

In [5]:
print("\nMissing values in Customers:\n", customers.isnull().sum())
print("\nMissing values in Products:\n", products.isnull().sum())
print("\nMissing values in Transactions:\n", transactions.isnull().sum())


Missing values in Customers:
 CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64

Missing values in Products:
 ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64

Missing values in Transactions:
 TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


In [6]:
print("\nCustomers Dataset Statistics:\n", customers.describe(include='all'))
print("\nProducts Dataset Statistics:\n", products.describe(include='all'))
print("\nTransactions Dataset Statistics:\n", transactions.describe())


Customers Dataset Statistics:
        CustomerID      CustomerName         Region  SignupDate
count         200               200            200         200
unique        200               200              4         179
top         C0001  Lawrence Carroll  South America  2024-11-11
freq            1                 1             59           3

Products Dataset Statistics:
        ProductID            ProductName Category       Price
count        100                    100      100  100.000000
unique       100                     66        4         NaN
top         P001  ActiveWear Smartwatch    Books         NaN
freq           1                      4       26         NaN
mean         NaN                    NaN      NaN  267.551700
std          NaN                    NaN      NaN  143.219383
min          NaN                    NaN      NaN   16.080000
25%          NaN                    NaN      NaN  147.767500
50%          NaN                    NaN      NaN  292.875000
75%         

In [7]:
merged_data = pd.merge(transactions, customers, on='CustomerID', how='left')
merged_data = pd.merge(merged_data, products, on='ProductID', how='left')

In [8]:
print("\nMerged Dataset:\n", merged_data.head())


Merged Dataset:
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving

In [9]:
customer_summary = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum'
}).reset_index()

In [10]:
customer_data = pd.merge(customer_summary, customers, on='CustomerID', how='left')

In [11]:
customer_data_encoded = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)
scaler = StandardScaler()
numerical_features = ['TotalValue', 'Quantity']
customer_data_encoded[numerical_features] = scaler.fit_transform(customer_data_encoded[numerical_features])

In [12]:
similarity_matrix = cosine_similarity(customer_data_encoded.drop(columns=['CustomerID', 'CustomerName', 'SignupDate']))
similarity_df = pd.DataFrame(similarity_matrix, index=customer_data['CustomerID'], columns=customer_data['CustomerID'])

In [13]:
lookalike_results = {}
for customer_id in similarity_df.index:
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
    lookalike_results[customer_id] = list(zip(similar_customers.index, similar_customers.values))

In [14]:
lookalike_data = []
for customer_id, lookalikes in lookalike_results.items():
    for similar_id, score in lookalikes:
        lookalike_data.append([customer_id, similar_id, score])

lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'])
lookalike_df.to_csv("/mnt/data/Lookalike.csv", index=False)
print("\nLookalike recommendations saved to Lookalike.csv")


Lookalike recommendations saved to Lookalike.csv


In [15]:
merged_data.to_csv("/mnt/data/MergedDataset.csv", index=False)

In [16]:

lookalike_results = {}
for idx in range(20):  
    cust_id = customer_profiles.iloc[idx]['CustomerID']
    lookalikes = get_top_lookalikes(idx, similarity_matrix)
    lookalike_results[cust_id] = lookalikes

lookalike_csv_data = []
for cust_id, lookalikes in lookalike_results.items():
    row = [cust_id]
    for lookalike_id, score in lookalikes:
        row.extend([lookalike_id, score])
    lookalike_csv_data.append(row)

columns = ['CustomerID', 'Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3']
lookalike_df = pd.DataFrame(lookalike_csv_data, columns=columns)


lookalike_df.to_csv('Naresh_Goud_Lookalike.csv', index=False)

print("Naresh_Goud_Lookalike.csv saved successfully!")


NameError: name 'customer_profiles' is not defined

In [17]:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'], 
    'TransactionID': 'count',
    'ProductID': lambda x: x.nunique()
}).reset_index()


customer_features.columns = ['CustomerID', 'TotalSpending', 'AvgTransactionValue', 'TransactionCount', 'UniqueProducts']

customer_profiles = customers.merge(customer_features, on='CustomerID', how='left')
customer_profiles.fillna(0, inplace=True)

scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(customer_profiles[['TotalSpending', 'AvgTransactionValue', 'TransactionCount', 'UniqueProducts']])
customer_profiles['Features'] = list(scaled_features)

print(customer_profiles.head())


  CustomerID        CustomerName         Region  SignupDate  TotalSpending  \
0      C0001    Lawrence Carroll  South America  2022-07-10        3354.52   
1      C0002      Elizabeth Lutz           Asia  2022-02-13        1862.74   
2      C0003      Michael Rivera  South America  2024-03-07        2725.38   
3      C0004  Kathleen Rodriguez  South America  2022-10-09        5354.88   
4      C0005         Laura Weber           Asia  2022-08-15        2034.24   

   AvgTransactionValue  TransactionCount  UniqueProducts  \
0              670.904               5.0             5.0   
1              465.685               4.0             4.0   
2              681.345               4.0             4.0   
3              669.360               8.0             8.0   
4              678.080               3.0             3.0   

                                            Features  
0  [0.3142740168280109, 0.50705698594246, 0.45454...  
1  [0.17451402349850617, 0.351956215045095, 0.363...  
2  [0

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(scaled_features)

def get_top_lookalikes(customer_index, similarity_matrix, top_n=3):
    similarities = similarity_matrix[customer_index]
    similar_indices = np.argsort(-similarities)[1:top_n+1]  
    return [(customer_profiles.iloc[i]['CustomerID'], similarities[i]) for i in similar_indices]


lookalike_results = {}
for idx in range(20):
    cust_id = customer_profiles.iloc[idx]['CustomerID']
    lookalikes = get_top_lookalikes(idx, similarity_matrix)
    lookalike_results[cust_id] = lookalikes

lookalike_csv_data = []
for cust_id, lookalikes in lookalike_results.items():
    row = [cust_id]
    for lookalike_id, score in lookalikes:
        row.extend([lookalike_id, score])
    lookalike_csv_data.append(row)


columns = ['CustomerID', 'Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3']
lookalike_df = pd.DataFrame(lookalike_csv_data, columns=columns)

lookalike_df.to_csv('Naresh_Jeedi_Lookalike.csv', index=False)

print("Naresh_Jeedi_Lookalike.csv saved successfully!")


Naresh_Jeedi_Lookalike.csv saved successfully!
