##  importing libraraies

In [2]:
import pandas as pd
import numpy as np

## Loading datasets

In [24]:
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')
df=products_df
unique_categories = df['Category'].unique()
print(unique_categories)

['Books' 'Electronics' 'Home Decor' 'Clothing']


In [25]:
merged_df = pd.merge(transactions_df, customers_df, on='CustomerID', how='left')
merged_df = pd.merge(merged_df, products_df, on='ProductID', how='left')

In [26]:
merged_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


## Feature Engineering

In [31]:
# Extract features like total spending, purchase frequency, and product categories
customer_features = merged_df.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spend per customer
    'TransactionDate': ['min', 'max', 'count'],  # Recency and frequency of transactions
    'Category': lambda x: ', '.join(x.unique())  # Product categories bought
}).reset_index()

# Flatten the column headers after aggregation
customer_features.columns = ['CustomerID', 'TotalSpend', 'FirstPurchase', 'LastPurchase', 'PurchaseCount', 'Category']

# Add a recency feature (difference between the most recent and first purchase)
customer_features['Recency'] = (pd.to_datetime('today') - pd.to_datetime(customer_features['LastPurchase'])).dt.days
customer_features.head()


Unnamed: 0,CustomerID,TotalSpend,FirstPurchase,LastPurchase,PurchaseCount,Category,Recency
0,C0001,3354.52,2024-01-19 03:12:55,2024-11-02 17:04:16,5,"Books, Home Decor, Electronics",82
1,C0002,1862.74,2024-02-28 07:44:21,2024-12-03 01:41:41,4,"Home Decor, Clothing",51
2,C0003,2725.38,2024-02-18 02:50:37,2024-08-24 18:54:04,4,"Home Decor, Clothing, Electronics",152
3,C0004,5354.88,2024-02-28 10:16:35,2024-12-23 14:13:52,8,"Books, Home Decor, Electronics",31
4,C0005,2034.24,2024-03-15 04:08:59,2024-11-04 00:30:22,3,"Home Decor, Electronics",80


In [34]:
df = customer_features

# Create a mapping of unique categories to integers
category_mapping = {Category: idx for idx, Category in enumerate(df['Category'].unique())}

# Map the categories to integer values
df['Category'] = df['Category'].map(category_mapping)
df.head(10)

Unnamed: 0,CustomerID,TotalSpend,FirstPurchase,LastPurchase,PurchaseCount,Category,Recency
0,C0001,3354.52,2024-01-19 03:12:55,2024-11-02 17:04:16,5,0,82
1,C0002,1862.74,2024-02-28 07:44:21,2024-12-03 01:41:41,4,1,51
2,C0003,2725.38,2024-02-18 02:50:37,2024-08-24 18:54:04,4,2,152
3,C0004,5354.88,2024-02-28 10:16:35,2024-12-23 14:13:52,8,0,31
4,C0005,2034.24,2024-03-15 04:08:59,2024-11-04 00:30:22,3,3,80
5,C0006,4227.57,2024-01-25 09:29:44,2024-10-07 04:07:35,4,4,108
6,C0007,2579.82,2024-02-20 09:22:52,2024-08-25 08:05:44,3,5,151
7,C0008,4271.61,2024-01-22 19:40:43,2024-12-17 04:05:00,10,6,37
8,C0009,896.5,2024-03-16 17:26:03,2024-10-12 06:41:00,3,7,103
9,C0010,1717.55,2024-02-22 18:44:05,2024-11-16 18:14:23,4,8,68


In [35]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Normalize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[['TotalSpend', 'PurchaseCount', 'Recency','Category']])

# Compute Cosine Similarity
similarity_matrix = cosine_similarity(scaled_features)

# Convert the similarity matrix to a DataFrame for easier interpretation
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])
similarity_df.head()


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.82963,0.733849,0.63393,0.75344,0.847021,0.603447,0.439055,0.540976,0.70967,...,0.623291,0.278662,-0.63908,-0.763293,-0.816423,0.020349,0.714788,-0.178509,0.800846,-0.744075
C0002,0.82963,1.0,0.634334,0.259953,0.939968,0.579975,0.620043,0.138148,0.841848,0.975859,...,0.801057,0.338134,-0.597578,-0.784055,-0.691837,-0.016235,0.961344,0.320928,0.959142,-0.631266
C0003,0.733849,0.634334,1.0,0.043618,0.765262,0.776794,0.95636,-0.100365,0.740555,0.62898,...,0.078118,0.841251,0.039473,-0.871827,-0.990502,-0.414627,0.505603,0.236038,0.801561,-0.94899
C0004,0.63393,0.259953,0.043618,1.0,0.016689,0.434179,-0.187358,0.904644,-0.259467,0.061227,...,0.501417,-0.436107,-0.788261,-0.035801,-0.159944,0.294755,0.147991,-0.824566,0.097129,-0.156823
C0005,0.75344,0.939968,0.765262,0.016689,1.0,0.663282,0.81796,-0.166914,0.925985,0.946213,...,0.595236,0.540767,-0.349475,-0.924797,-0.795509,-0.039681,0.933924,0.505181,0.970369,-0.677712


In [40]:
import pandas as pd

# Create the dictionary to store lookalikes and their scores
lookalike_dict = {}

for customer_id in customer_features['CustomerID'][:20]:  # Customers C0001 to C0020
    # Get similarity scores and sort them in descending order
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]  # Exclude self
    lookalike_dict[customer_id] = []
    
    # Prepare lookalike list with customer ID and similarity scores
    for idx, score in zip(similar_customers.index, similar_customers.values):
        lookalike_dict[customer_id].append([idx, score])
    
# Flatten the lookalike dictionary into a structured format
lookalike_data = []
for customer_id, lookalikes in lookalike_dict.items():
    # Flatten the list of lookalikes into individual rows with their similarity scores
    for i, (lookalike_id, score) in enumerate(lookalikes):
        lookalike_data.append([customer_id, lookalike_id, score])

# Convert to DataFrame
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])

# Save to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the first few rows
lookalike_df.head()


Unnamed: 0,CustomerID,LookalikeID,SimilarityScore
0,C0001,C0016,0.906555
1,C0001,C0008,0.896547
2,C0001,C0013,0.863856
3,C0002,C0012,0.872651
4,C0002,C0015,0.848944


In [37]:
lookalike_df.to_csv('dLookalike.csv', index_label='CustomerID')


  CustomerID LookalikeID  SimilarityScore
0      C0001       C0016         0.906555
1      C0001       C0008         0.896547
2      C0001       C0013         0.863856
3      C0002       C0012         0.872651
4      C0002       C0015         0.848944


In [13]:
import pandas as pd

# Create the dictionary to store lookalikes and their scores
lookalike_dict = {}

for customer_id in customer_features['CustomerID'][:20]:  # Customers C0001 to C0020
    # Get similarity scores and sort them in descending order
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]  # Exclude self
    lookalike_dict[customer_id] = []
    
    # Prepare lookalike list with customer ID and similarity scores
    for idx, score in zip(similar_customers.index, similar_customers.values):
        lookalike_dict[customer_id].append([idx, score])
    
# Prepare the data for the desired format
lookalike_data = []

for customer_id, lookalikes in lookalike_dict.items():
    row = [customer_id]  # Start with the customer_id
    for lookalike in lookalikes:
        row.extend(lookalike)  # Add the lookalike_id and similarity_score
    lookalike_data.append(row)

# Convert to DataFrame with appropriate column names
columns = ['CustomerID', 'Similar1_ID', 'Similar1_Score', 'Similar2_ID', 'Similar2_Score', 'Similar3_ID', 'Similar3_Score']
lookalike_df = pd.DataFrame(lookalike_data, columns=columns)

# Save to CSV
lookalike_df.to_csv('dummylike.csv', index=False)



In [47]:
import pandas as pd

# Create the dictionary to store lookalikes and their scores
lookalike_dict = {}

for customer_id in customer_features['CustomerID'][:20]:  # Customers C0001 to C0020
    # Get similarity scores and sort them in descending order
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]  # Exclude self
    lookalike_dict[customer_id] = []
    
    # Prepare lookalike list with customer ID and similarity scores
    for idx, score in zip(similar_customers.index, similar_customers.values):
        lookalike_dict[customer_id].append([idx, score])
    
# Prepare the data for the desired format
lookalike_data = []

for customer_id, lookalikes in lookalike_dict.items():
    row = [customer_id]  # Start with the customer_id
    for lookalike in lookalikes:
        row.append(f"{lookalike[0]}, {lookalike[1]}")  # Merge ID and Score with a comma
    lookalike_data.append(row)

# Convert to DataFrame with appropriate column names
columns = ['CustomerID', 'Similar1', 'Similar2', 'Similar3']
lookalike_df = pd.DataFrame(lookalike_data, columns=columns)

# Save to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the first few rows
print(lookalike_df)


   CustomerID                   Similar1                   Similar2  \
0       C0001   C0016, 0.906555499221179   C0008, 0.896546595851063   
1       C0002  C0012, 0.8726506554473953  C0015, 0.8489435553129182   
2       C0003  C0020, 0.9518744768327362  C0006, 0.7351940221225949   
3       C0004  C0014, 0.9295293167921905  C0008, 0.8919233550156721   
4       C0005  C0010, 0.9342139979247938  C0019, 0.8820414102298896   
5       C0006  C0020, 0.9088437184127384   C0009, 0.855803342392611   
6       C0007  C0004, 0.8209932298479351  C0020, 0.8155238187685688   
7       C0008  C0008, 0.9194826137446735  C0001, 0.8917730007820798   
8       C0009  C0012, 0.9473705904889242  C0019, 0.9040443929009577   
9       C0010  C0018, 0.9594333408334251   C0006, 0.952749011516985   
10      C0011  C0014, 0.9774951397444468  C0004, 0.9764594650133958   
11      C0012  C0014, 0.8765052453165908  C0008, 0.8681260573682142   
12      C0013  C0004, 0.9767610881903371  C0015, 0.8221177331942455   
13    