In [51]:
import pandas as pd

customers_df = pd.read_csv('/content/Updated_Customers.csv')
products_df = pd.read_csv('/content/Products - Products.csv')
transactions_df = pd.read_csv('/content/Transactions - Transactions.csv')

customer_transactions_df = pd.merge(transactions_df, customers_df, on='CustomerID', how='left')

full_data_df = pd.merge(customer_transactions_df, products_df, on='ProductID', how='left')

full_data_df['TransactionDate'] = pd.to_datetime(full_data_df['TransactionDate'], errors='coerce')
print("Missing Values Before Cleaning:")
print(full_data_df.isnull().sum())

full_data_df.drop(columns=['Price_x'], inplace=True)
full_data_df.rename(columns={'Price_y': 'Price'}, inplace=True)

full_data_df.dropna(inplace=True)

full_data_df['Price'] = pd.to_numeric(full_data_df['Price'], errors='coerce')
full_data_df['Quantity'] = pd.to_numeric(full_data_df['Quantity'], errors='coerce')
full_data_df['TotalValue'] = pd.to_numeric(full_data_df['TotalValue'], errors='coerce')

print("Data Types After Cleaning:")
print(full_data_df.dtypes)
print("Missing Values After Cleaning:")
print(full_data_df.isnull().sum())
full_data_df.to_csv('Cleaned_Merged_Customer_Product_Transactions.csv', index=False)

print("Cleaned Data Sample:")
print(full_data_df.head())


Missing Values Before Cleaning:
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price_x            0
CustomerName       0
Region             0
SignupDate         0
ProductName        0
Category           0
Price_y            0
dtype: int64
Data Types After Cleaning:
TransactionID              object
CustomerID                 object
ProductID                  object
TransactionDate    datetime64[ns]
Quantity                    int64
TotalValue                float64
CustomerName               object
Region                     object
SignupDate                 object
ProductName                object
Category                   object
Price                     float64
dtype: object
Missing Values After Cleaning:
TransactionID        0
CustomerID           0
ProductID            0
TransactionDate      0
Quantity             0
TotalValue         257
CustomerName         0
Region               0
SignupDate        

  full_data_df['TransactionDate'] = pd.to_datetime(full_data_df['TransactionDate'], errors='coerce')  # Coerce errors to NaT


In [53]:
import pandas as pd
import numpy as np

full_data_df = pd.read_csv('/content/Cleaned_Merged_Customer_Product_Transactions.csv')

print(full_data_df.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue     CustomerName         Region  SignupDate  \
0      300.68   Andrea Jenkins         Europe  03/12/2022   
1      300.68  Brittany Harvey           Asia  04/09/2024   
2      300.68  Kathryn Stevens         Europe  04/04/2024   
3      601.36  Travis Campbell  South America  11/04/2024   
4      902.04    Timothy Perez         Europe  15/03/2022   

                       ProductName     Category   Price  
0  ComfortLiving Bluetooth Speaker  Electronics  300.68  
1  ComfortLiving Bluetooth Speaker  Electronics  300.68  
2  ComfortLiving Bluetooth Speak

In [73]:
"""For Predication """
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

full_data_df = pd.read_csv('/content/Cleaned_Merged_Customer_Product_Transactions.csv')

customer_features = full_data_df.groupby('CustomerID').agg({
    'Region': 'first',
    'SignupDate': 'first',
    'Quantity': 'sum',
    'TotalValue': 'sum',
    'Price': 'mean'
}).reset_index()

customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)
scaler = StandardScaler()
numeric_cols = ['Quantity', 'TotalValue', 'Price']
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])

similarity_matrix = cosine_similarity(customer_features[numeric_cols + list(customer_features.columns[2:])])

def get_top_3_lookalikes(customer_id, customer_features, similarity_matrix):
    customer_index = customer_features[customer_features['CustomerID'] == customer_id].index[0]
    similarities = similarity_matrix[customer_index]
    similar_customers = np.argsort(similarities)[::-1][1:4]
    similar_customer_ids = customer_features['CustomerID'].iloc[similar_customers].values
    scores = similarities[similar_customers]
    return list(zip(similar_customer_ids, scores))

customer_id_input = input("Enter CustomerID (e.g., C0001): ")
top_3_lookalikes = get_top_3_lookalikes(customer_id_input, customer_features, similarity_matrix)

print(f"Top 3 Lookalikes for Customer {customer_id_input}:")
for i, (lookalike, score) in enumerate(top_3_lookalikes, 1):
    print(f"Lookalike {i}: {lookalike}, Similarity Score: {score}")

actual_lookalikes = [
    ('C0076', 0.9446), ('C0011', 0.9432), ('C0137', 0.9298)
]

def calculate_mse(actual_lookalikes, predicted_lookalikes):
    actual_scores = [score for _, score in actual_lookalikes]
    predicted_scores = [score for _, score in predicted_lookalikes]

    mse = mean_squared_error(actual_scores, predicted_scores)
    return mse

mse = calculate_mse(actual_lookalikes, top_3_lookalikes)
print(f"Mean Squared Error (MSE) between actual and predicted lookalikes: {mse}")



Enter CustomerID (e.g., C0001): C0199
Top 3 Lookalikes for Customer C0199:
Lookalike 1: C0073, Similarity Score: 0.9850059276267502
Lookalike 2: C0132, Similarity Score: 0.974943257219227
Lookalike 3: C0019, Similarity Score: 0.9196573480364263
Mean Squared Error (MSE) between actual and predicted lookalikes: 0.0009143822517061226


In [68]:
""" For Look a like csv"""
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

full_data_df = pd.read_csv('/content/Cleaned_Merged_Customer_Product_Transactions.csv')
customer_features = full_data_df.groupby('CustomerID').agg({
    'Region': 'first',
    'SignupDate': 'first',
    'Quantity': 'sum',
    'TotalValue': 'sum',
    'Price': 'mean'
}).reset_index()

customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

scaler = StandardScaler()
numeric_cols = ['Quantity', 'TotalValue', 'Price']
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])

similarity_matrix = cosine_similarity(customer_features[numeric_cols + list(customer_features.columns[2:])])

top_3_lookalikes = {}

for i, customer_id in enumerate(customer_features['CustomerID'].head(20)):
    similarities = similarity_matrix[i]
    similar_customers = np.argsort(similarities)[::-1][1:4]
    similar_customer_ids = customer_features['CustomerID'].iloc[similar_customers].values
    scores = similarities[similar_customers]
    top_3_lookalikes[customer_id] = list(zip(similar_customer_ids, scores))


lookalike_data = []

for customer_id, lookalikes in top_3_lookalikes.items():
    lookalike_data.append([customer_id] + [f"{lookalike[0]}:{lookalike[1]:.4f}" for lookalike in lookalikes])
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'Lookalike_1', 'Lookalike_2', 'Lookalike_3'])
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Top 3 Lookalikes for First 20 Customers:")
print(lookalike_df.head())


Top 3 Lookalikes for First 20 Customers:
  CustomerID   Lookalike_1   Lookalike_2   Lookalike_3
0      C0001  C0076:0.9446  C0011:0.9432  C0137:0.9298
1      C0002  C0025:0.8750  C0157:0.8581  C0121:0.8524
2      C0003  C0190:0.9546  C0091:0.9086  C0174:0.9045
3      C0004  C0175:0.9434  C0109:0.9369  C0101:0.9319
4      C0005  C0186:0.9127  C0103:0.8282  C0131:0.8167
