In [None]:
import pandas as pd

# Read the three raw CSV tables (customers, products, transactions) from the
# working directory, in that order.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)


In [None]:
# Attach customer attributes to every transaction. The inner join keeps only
# transactions whose CustomerID is present in the customers table.
transactions_customers = pd.merge(
    transactions,
    customers,
    how="inner",
    on="CustomerID",
)


In [None]:
# Attach product attributes to the transaction/customer rows. Both sides carry
# a Price column, so pandas' default suffixes yield Price_x (left side) and
# Price_y (products) in the result.
merged_data = pd.merge(
    transactions_customers,
    products,
    how="inner",
    on="ProductID",
)


In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so
# it is called bare; wrapping it in print() emitted a stray "None" line.
merged_data.info()
print(merged_data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   CustomerName     1000 non-null   object 
 8   Region           1000 non-null   object 
 9   SignupDate       1000 non-null   object 
 10  ProductName      1000 non-null   object 
 11  Category         1000 non-null   object 
 12  Price_y          1000 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB
None
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23       

In [None]:
# Start the per-customer feature table: one row per CustomerID holding the sum
# of TotalValue over all of that customer's transactions.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(TotalSpending=('TotalValue', 'sum'))
    .reset_index()
)


In [None]:
# Mean transaction value per customer. The result is looked up by CustomerID
# instead of being pasted in positionally via `.values`, so it stays correct
# even if customer_features is not in the groupby's sorted-key order.
average_spending = merged_data.groupby('CustomerID')['TotalValue'].mean()
customer_features['AverageSpending'] = customer_features['CustomerID'].map(average_spending)


In [None]:
# Total units bought per customer. The result is looked up by CustomerID
# instead of being pasted in positionally via `.values`, so it stays correct
# even if customer_features is not in the groupby's sorted-key order.
total_quantity = merged_data.groupby('CustomerID')['Quantity'].sum()
customer_features['TotalQuantity'] = customer_features['CustomerID'].map(total_quantity)


In [None]:
# Mean units per transaction for each customer. The result is looked up by
# CustomerID instead of being pasted in positionally via `.values`, so it stays
# correct even if customer_features is not in the groupby's sorted-key order.
average_quantity = merged_data.groupby('CustomerID')['Quantity'].mean()
customer_features['AverageQuantity'] = customer_features['CustomerID'].map(average_quantity)


In [None]:
# Count transactions per (customer, category) pair.
category_counts = (
    merged_data
    .groupby(['CustomerID', 'Category'])
    .size()
    .reset_index(name='Count')
)

# Keep, for each customer, the row with the highest count; idxmax returns the
# first such row when several categories tie.
top_category_rows = category_counts.groupby('CustomerID')['Count'].idxmax()
favorite_category = category_counts.loc[top_category_rows]

# Attach the winning category to the feature table as FavoriteCategory.
customer_features = (
    customer_features
    .merge(favorite_category[['CustomerID', 'Category']], how='left', on='CustomerID')
    .rename(columns={'Category': 'FavoriteCategory'})
)


In [None]:
# Number of distinct ProductID values each customer has bought.
unique_products = (
    merged_data
    .groupby('CustomerID')['ProductID']
    .nunique()
    .rename('UniqueProductsPurchased')
    .reset_index()
)
customer_features = pd.merge(customer_features, unique_products, how='left', on='CustomerID')


In [None]:
# Distinct (CustomerID, Region) pairs taken from the merged rows, joined onto
# the feature table by CustomerID.
region_info = merged_data.loc[:, ['CustomerID', 'Region']].drop_duplicates()
customer_features = pd.merge(customer_features, region_info, how='left', on='CustomerID')


In [None]:
# Number of distinct TransactionID values recorded for each customer.
num_transactions = (
    merged_data
    .groupby('CustomerID')['TransactionID']
    .nunique()
    .rename('NumberOfTransactions')
    .reset_index()
)
customer_features = pd.merge(customer_features, num_transactions, how='left', on='CustomerID')


In [None]:
# One-hot encode the two categorical features.
# NOTE(review): drop_first=True omits the first level of each variable, so that
# level is represented by all-zero indicator columns - confirm this is intended
# for the similarity computation that consumes these columns.
categorical_columns = ['Region', 'FavoriteCategory']
customer_features = pd.get_dummies(
    data=customer_features,
    columns=categorical_columns,
    drop_first=True,
)


In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the numeric features in place; the indicator columns produced by
# the one-hot encoding are left as they are.
numerical_columns = [
    'TotalSpending',
    'AverageSpending',
    'TotalQuantity',
    'AverageQuantity',
    'UniqueProductsPurchased',
    'NumberOfTransactions',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_columns])
customer_features[numerical_columns] = scaled_values


In [None]:
# Show the feature table after scaling: heading line first, then the frame.
print("Scaled Dataset:", customer_features, sep="\n")


Scaled Dataset:
    CustomerID  TotalSpending  AverageSpending  TotalQuantity  \
0        C0001      -0.061701        -0.070263      -0.122033   
1        C0002      -0.877744        -0.934933      -0.448000   
2        C0003      -0.405857        -0.026271       0.203934   
3        C0004       1.032547        -0.076769       1.670787   
4        C0005      -0.783929        -0.040028      -0.936951   
..         ...            ...              ...            ...   
194      C0196       0.829053         2.351666      -0.122033   
195      C0197      -0.841689        -0.188326      -0.610984   
196      C0198      -1.386975        -0.933964      -1.588886   
197      C0199      -0.813993        -0.812176      -0.610984   
198      C0200       0.706367         1.112926       0.529902   

     AverageQuantity  UniqueProductsPurchased  NumberOfTransactions  \
0          -0.233464                 0.050047             -0.011458   
1          -0.054969                -0.424204             -0.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


In [None]:
# Feature matrix for the similarity step: every column except the identifier.
features = customer_features.drop('CustomerID', axis=1)


In [None]:
# Pairwise cosine similarity between all rows of the feature matrix.
similarity_matrix = cosine_similarity(features)

# Label both axes with CustomerID so scores can be looked up by customer.
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)


In [None]:
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Label of the customer to look up; it must be both a
            column and an index label of ``similarity_df``.
        similarity_df: Square DataFrame of pairwise similarity scores whose
            rows and columns are labelled by customer ID.
        top_n: Maximum number of neighbours to return (default 3).

    Returns:
        A Series of similarity scores indexed by customer ID, sorted from
        most to least similar, with ``customer_id`` itself removed.

    Raises:
        KeyError: If ``customer_id`` is not a label of ``similarity_df``.
    """
    # Rank every customer against the target, best match first.
    ranked_scores = similarity_df[customer_id].sort_values(ascending=False)

    # Remove the target's own entry by label, then keep the leading matches.
    other_customers = ranked_scores.drop(customer_id)
    return other_customers.iloc[:top_n]


In [None]:
# Example lookup: the three nearest neighbours of customer C0001.
top_similar = get_top_similar_customers('C0001', similarity_df)
print("Top 3 similar customers for C0001:", top_similar, sep="\n")


Top 3 similar customers for C0001:
CustomerID
C0048    0.943015
C0190    0.838230
C0181    0.825221
Name: C0001, dtype: float64


In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between every pair of customers and label both
# axes with CustomerID.
features = customer_features.drop(columns=['CustomerID'])
similarity_df = pd.DataFrame(
    cosine_similarity(features),
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID']
)

# Generate the top 3 similar customers for the first 20 rows of
# customer_features.
# The customer's own entry is removed by label before ranking. Slicing the
# sorted scores with [1:4] assumed the self-score always sorts first, which
# breaks when another customer ties it (identical feature vectors) or when
# floating-point rounding leaves the self-score marginally below a neighbour's;
# the customer would then appear in their own list and a real match be lost.
recommendations = {
    customer_id: list(
        similarity_df[customer_id]
        .drop(customer_id)
        .sort_values(ascending=False)
        .head(3)
        .items()
    )
    for customer_id in customer_features['CustomerID'][:20]
}

# Convert recommendations to a DataFrame (one row per customer, with the list
# of (similar_customer, score) pairs in a single column) and save to CSV.
recommendations_df = pd.DataFrame([
    {'CustomerID': customer_id, 'TopSimilarCustomers': similar_customers}
    for customer_id, similar_customers in recommendations.items()
])
recommendations_df.to_csv("Lookalike_Recommendations.csv", index=False)

# Display the resulting DataFrame
print(recommendations_df)


   CustomerID                                TopSimilarCustomers
0       C0001  [(C0048, 0.9430145596375978), (C0190, 0.838230...
1       C0002  [(C0077, 0.9133133542758101), (C0029, 0.880803...
2       C0003  [(C0151, 0.7370360855695157), (C0027, 0.724940...
3       C0004  [(C0175, 0.9216310250055183), (C0113, 0.917089...
4       C0005  [(C0186, 0.9823633130729911), (C0130, 0.934945...
5       C0006  [(C0168, 0.9735712885666749), (C0171, 0.900887...
6       C0007  [(C0140, 0.9752242389376314), (C0115, 0.939306...
7       C0008  [(C0090, 0.9306165868681694), (C0194, 0.882565...
8       C0009  [(C0198, 0.9538870167320774), (C0083, 0.928757...
9       C0010  [(C0111, 0.9094167940467867), (C0062, 0.828612...
10      C0011  [(C0153, 0.7380936228464436), (C0187, 0.712365...
11      C0012  [(C0104, 0.9474047926849681), (C0113, 0.936041...
12      C0013  [(C0099, 0.9833800109326489), (C0108, 0.909613...
13      C0014  [(C0060, 0.9814150450435234), (C0128, 0.932274...
14      C0015  [(C0131, 0