In [1]:
import pandas as pd
import numpy as np
import pickle
import os

# scikit-learn utilities for feature preprocessing (scaling, one-hot encoding, column-wise composition)

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load the cleaned dataset from its CSV file.
# The path is relative to the notebook's working directory.
cleaned_csv_file = '../data/cleaned_data.csv'
data = pd.read_csv(cleaned_csv_file)

# Bare last expression so the notebook renders the loaded frame.
data

Unnamed: 0,year,month,day,order,country,session_id,page1_main_category,page2_clothing_model,colour,location,...,price,price_2,page,total_clicks,avg_price,unique_products,browsing_depth,weekday,weekend,high_price_preference
0,2008,6,22,21,29,15648,3,C20,13,1,...,48,1,2,84,46.928571,55,4,6,1,1
1,2008,5,19,6,29,10018,2,B26,13,3,...,57,1,2,9,57.666667,4,2,0,0,1
2,2008,7,15,2,29,19388,3,C13,9,5,...,48,1,1,10,38.900000,9,3,1,0,1
3,2008,5,2,2,29,7181,2,B11,2,4,...,43,2,1,6,51.666667,5,2,4,0,0
4,2008,6,9,16,29,13493,2,B31,9,5,...,57,1,2,15,52.333333,12,2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132374,2008,7,4,3,29,17622,4,P19,2,1,...,48,1,2,5,44.000000,5,5,4,0,1
132375,2008,6,19,9,29,15165,3,C26,14,3,...,28,2,2,33,40.818182,31,5,3,0,0
132376,2008,7,15,4,29,19359,1,A4,3,2,...,38,2,1,8,53.250000,8,1,1,0,0
132377,2008,7,28,16,29,21454,3,C50,9,5,...,20,2,3,18,43.500000,18,4,0,0,0


In [3]:
# Columns that are not needed for the model.
drop_columns = ['year', 'session_id', 'page2_clothing_model']

# Rebind `data` to the reduced frame instead of mutating it in place, so the
# frame object displayed by the loading cell is not altered after the fact.
# errors='ignore' keeps the cell re-runnable: labels that are already absent
# are skipped rather than raising KeyError.
data = data.drop(columns=drop_columns, errors='ignore')

In [4]:
# Pairwise correlation matrix of the columns in `data`.
# Pearson is the DataFrame.corr default; it is spelled out here for clarity.
corr = data.corr(method='pearson')
corr

Unnamed: 0,month,day,order,country,page1_main_category,colour,location,model_photography,price,price_2,page,total_clicks,avg_price,unique_products,browsing_depth,weekday,weekend,high_price_preference
month,1.0,-0.072782,0.025934,0.064741,0.01554,-0.03429,-0.016841,0.010027,0.012838,-0.01262,0.021586,0.030659,0.026027,0.052018,0.047197,0.007097,0.016774,0.001603
day,-0.072782,1.0,-0.020046,0.012805,-0.002359,0.006454,-0.000348,-0.000466,-0.001474,0.004514,0.011297,-0.026783,-0.002988,-0.023631,0.005316,-0.000411,0.020367,-0.000344
order,0.025934,-0.020046,1.0,-0.064632,0.150397,0.04598,0.014626,0.058337,-0.064713,0.000752,0.203583,0.790185,-0.074438,0.768191,0.337144,0.023876,0.018803,-0.042149
country,0.064741,0.012805,-0.064632,1.0,0.135064,0.01074,-0.007948,0.02869,-0.045595,-0.018437,0.027037,-0.081657,-0.092438,-0.059349,0.038653,0.022451,0.008422,-0.037973
page1_main_category,0.01554,-0.002359,0.150397,0.135064,1.0,0.231335,0.016009,0.21507,-0.363875,-0.139646,0.348498,0.082294,-0.338717,0.098411,0.288409,0.01341,0.008467,-0.188974
colour,-0.03429,0.006454,0.04598,0.01074,0.231335,1.0,0.041225,0.075038,-0.090333,-0.080437,0.250075,0.009721,-0.07414,0.01387,0.082665,0.001184,0.00291,0.101356
location,-0.016841,-0.000348,0.014626,-0.007948,0.016009,0.041225,1.0,0.066082,-0.083217,0.071155,-0.092601,-0.001209,-0.033406,-0.000627,-0.029565,-0.001788,-0.003041,-0.041891
model_photography,0.010027,-0.000466,0.058337,0.02869,0.21507,0.075038,0.066082,1.0,-0.214295,0.073775,0.260689,0.025399,-0.156236,0.033523,0.146381,-0.001319,-0.003131,-0.144506
price,0.012838,-0.001474,-0.064713,-0.045595,-0.363875,-0.090333,-0.083217,-0.214295,1.0,-0.743123,-0.149675,-0.044293,0.493255,-0.050307,-0.128663,-0.003135,8e-06,0.832856
price_2,-0.01262,0.004514,0.000752,-0.018437,-0.139646,-0.080437,0.071155,0.073775,-0.743123,1.0,0.029368,0.008804,-0.271765,0.006161,0.002055,-0.004148,-0.003991,-0.750995


In [5]:
# Keep every unordered column pair exactly once: mask out everything except
# the strict upper triangle (k=1 also removes the diagonal), then flatten the
# surviving cells into one row per pair.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pair_corr = corr.where(upper_triangle).stack().reset_index()

# After reset_index the correlation values sit in the column labelled 0.
# Select pairs whose absolute correlation exceeds 0.65; the sort is on the
# signed value, so strongly negative pairs end up at the bottom.
is_strong = pair_corr[0].abs() > 0.65
corr65 = pair_corr[is_strong].sort_values(by=0, ascending=False)

# Readable column labels for the pair table.
corr65.columns = ['Primary', 'Secondary', 'Score']
corr65

Unnamed: 0,Primary,Secondary,Score
133,total_clicks,unique_products,0.972211
116,price,high_price_preference,0.832856
41,order,total_clicks,0.790185
43,order,unique_products,0.768191
150,weekday,weekend,0.76604
108,price,price_2,-0.743123
124,price_2,high_price_preference,-0.750995


In [6]:
# Select the columns to remove: every column that appears as the 'Primary'
# member of at least one highly correlated pair, ordered by the number of
# such pairs it takes part in (most first). Nothing is written to disk here.
primary_pair_counts = corr65.groupby('Primary')['Secondary'].count()
groups = primary_pair_counts.sort_values(ascending=False).index
columns_to_drop = groups.tolist()
columns_to_drop

['order', 'price', 'price_2', 'total_clicks', 'weekday']

In [7]:
# Drop the columns that are highly correlated with others. `columns=` already
# identifies the axis, so the redundant `axis=1` is omitted, and `data` is
# rebound rather than mutated in place. errors='ignore' skips labels that are
# already gone, which keeps the cell safe to re-run.
data = data.drop(columns=columns_to_drop, errors='ignore')

# Feature groups handed to the preprocessing transformer.
# NOTE(review): `month` and `day` appear in the correlation matrix and are not
# dropped above, yet they are in neither list - confirm the omission is intended.
numeric_features = [
    'browsing_depth',
    'avg_price',
    'unique_products',
    'weekend',
]
categorical_features = [
    'page1_main_category',
    'colour',
    'location',
    'model_photography',
    'page',
    'country',
]

In [8]:
# One transformer per feature group:
#   "num" - StandardScaler standardises the numeric features.
#   "cat" - OneHotEncoder expands the categorical features; with
#           handle_unknown='ignore' a category that was not seen during fit
#           does not raise at transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# (name, transformer, columns) triples consumed by ColumnTransformer.
# No `remainder` is passed, so the library default applies to unlisted columns.
column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [9]:
# Feature frame: `data` without the target column (errors='ignore' tolerates
# the column already being absent).
target_column = 'high_price_preference'
X = data.drop(columns=[target_column], errors='ignore')

# Fit the column-wise preprocessor on the feature frame.
# NOTE(review): this fits on every row of `data`; if a train/test split is made
# downstream, confirm whether the preprocessor should be fitted on the
# training portion only.
preprocessor.fit(X)

In [10]:
# --- Reduced dataset, written as CSV -----------------------------------
# Note: `data` here is the column-reduced frame, not the output of
# `preprocessor` (no scaling or encoding has been applied to it).
processed_csv_path = '../data'
processed_csv_file = os.path.join(processed_csv_path, "processed_data.csv")
os.makedirs(processed_csv_path, exist_ok=True)  # no error if the folder already exists
data.to_csv(processed_csv_file, index=False)    # index=False: do not write the row index
print(f"Processed data saved successfully at '{processed_csv_file}'")

# --- Location of the pickled preprocessor ------------------------------
# `data_file` is consumed by the cell that pickles `preprocessor`.
preprocessed_data_path = '../models'
data_file = os.path.join(preprocessed_data_path, "preprocessed_data.pkl")
os.makedirs(preprocessed_data_path, exist_ok=True)

Processed data saved successfully at '../data\processed_data.csv'


In [11]:
# Serialise the preprocessor object to `data_file`. Despite the
# "preprocessed_data" file name, the pickle holds the transformer, not a dataset.
with open(data_file, "wb") as pkl_handle:
    pickle.dump(preprocessor, pkl_handle)

print(f"Preprocessing model saved successfully at '{data_file}'")

Preprocessing model saved successfully at '../models\preprocessed_data.pkl'
