In [1]:
import pandas as pd
from utils.regression import RegressionAnalysis
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from utils.rfe import apply_rfe

In [2]:
df = pd.read_csv('../data/cleaned_data_cluster.csv')
# select df without cluster column
df_without_cluster = df.drop(columns=['cluster'])

## Recursive feature elimination - 'Review Rating'

In [3]:
seleselected_features_rating = apply_rfe(df_without_cluster, 'Review Rating')

In [4]:
seleselected_features_rating

['Size',
 'Frequency of Purchases',
 'Age',
 'Purchase Amount (USD)',
 'Previous Purchases',
 'Gender_Male',
 'Category_Clothing',
 'Season_Spring',
 'Season_Summer',
 'Discount Applied_Yes',
 'Review Rating']

## Regression on features selected in RFE - 'Review Rating'

### Prepare datasets

In [5]:
df_0 = df[df['cluster'] == 0]
df_0 = df_0.drop(columns=['cluster'])
df_0_selected_features = df_0[seleselected_features_rating]

df_1 = df[df['cluster'] == 1]
df_1 = df_1.drop(columns=['cluster'])
df_1_selected_features = df_1[seleselected_features_rating]

### Perform regression

In [6]:
# Declare the regression analysis object
r1_df0 = RegressionAnalysis(data = df_0_selected_features, target_column = 'Review Rating')
r1_df1 = RegressionAnalysis(data = df_1_selected_features, target_column = 'Review Rating')

In [7]:
r1_df0.prepare_data()
r1_df0.fit_linear_regression()
r1_df0.tune_ridge_regression()
r1_df0.tune_lasso_regression()
r1_df0.tune_xgboost()
r1_df0.tune_random_forest()
r1_df_0_evaluation = r1_df0.evaluate_models()

In [8]:
r1_df1.prepare_data()
r1_df1.fit_linear_regression()
r1_df1.tune_ridge_regression()
r1_df1.tune_lasso_regression()
r1_df1.tune_xgboost()
r1_df1.tune_random_forest()
r1_df_1_evaluation = r1_df1.evaluate_models()

### Models evaluation

In [9]:
r1_df_0_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
1,ridge,0.286592,3082892000000.0,-0.01223
2,lasso,0.286878,3252254000000.0,-0.014253
0,linear,0.286917,2976683000000.0,-0.014529
3,xgboost,0.287297,3152886000000.0,-0.017221
4,random_forest,0.294252,3115572000000.0,-0.067069


In [10]:
r1_df_1_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
3,xgboost,0.280828,41031810000000.0,-0.000416
2,lasso,0.281128,40773200000000.0,-0.002554
1,ridge,0.2817,41301400000000.0,-0.006637
0,linear,0.282319,41363980000000.0,-0.011069
4,random_forest,0.287313,40249040000000.0,-0.047155


### Check coefficients

In [11]:
r1_df_0_coefficients = r1_df0.get_coefficients(model_name='xgboost')
r1_df_0_coefficients.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Season_Spring,0.148326
Purchase Amount (USD),0.133833
Size,0.131832
Previous Purchases,0.120794
Gender_Male,0.114983
Age,0.108035
Frequency of Purchases,0.096419
Discount Applied_Yes,0.074251
Season_Summer,0.071528
Category_Clothing,0.0


In [12]:
r1_df_1_coefficients = r1_df1.get_coefficients(model_name='xgboost')
r1_df_1_coefficients.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Size,0.239664
Age,0.131583
Purchase Amount (USD),0.118664
Season_Spring,0.117237
Season_Summer,0.110827
Previous Purchases,0.099604
Category_Clothing,0.092562
Frequency of Purchases,0.089858
Gender_Male,0.0
Discount Applied_Yes,0.0


## Recursive feature elimination - 'Frequency of Purchases'

In [13]:
selecelected_features_frequency = apply_rfe(df_without_cluster, 'Frequency of Purchases')

In [14]:
selecelected_features_frequency

['Size',
 'Age',
 'Purchase Amount (USD)',
 'Review Rating',
 'Previous Purchases',
 'Gender_Male',
 'Category_Clothing',
 'Season_Spring',
 'Shipping Type_Standard',
 'Discount Applied_Yes',
 'Frequency of Purchases']

## Regression on features selected in RFE 'Frequency od Purchases'

### Prepare datasets

In [15]:
df_0_r2 = df[df['cluster'] == 0]
df_0_r2 = df_0_r2.drop(columns=['cluster'])
df_0_r2_selected_features = df_0_r2[selecelected_features_frequency]

df_1_r2 = df[df['cluster'] == 1]
df_1_r2 = df_1_r2.drop(columns=['cluster'])
df_1_r2_selected_features = df_1_r2[selecelected_features_frequency]

### Perform regression

In [16]:
# Declare the regression analysis object
r2_df0 = RegressionAnalysis(data = df_0_selected_features, target_column = 'Review Rating')
r2_df1 = RegressionAnalysis(data = df_1_selected_features, target_column = 'Review Rating')

In [17]:
r2_df0.prepare_data()
r2_df0.fit_linear_regression()
r2_df0.tune_ridge_regression()
r2_df0.tune_lasso_regression()
r2_df0.tune_xgboost()
r2_df0.tune_random_forest()
r2_df_0_evaluation = r2_df0.evaluate_models()

In [18]:
r2_df1.prepare_data()
r2_df1.fit_linear_regression()
r2_df1.tune_ridge_regression()
r2_df1.tune_lasso_regression()
r2_df1.tune_xgboost()
r2_df1.tune_random_forest()
r2_df_1_evaluation = r2_df1.evaluate_models()

### Models evaluation

In [19]:
r2_df_0_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
1,ridge,0.286592,3082892000000.0,-0.01223
2,lasso,0.286878,3252254000000.0,-0.014253
0,linear,0.286917,2976683000000.0,-0.014529
3,xgboost,0.287297,3152886000000.0,-0.017221
4,random_forest,0.294252,3115572000000.0,-0.067069


In [20]:
r2_df_1_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
3,xgboost,0.280828,41031810000000.0,-0.000416
2,lasso,0.281128,40773200000000.0,-0.002554
1,ridge,0.2817,41301400000000.0,-0.006637
0,linear,0.282319,41363980000000.0,-0.011069
4,random_forest,0.287313,40249040000000.0,-0.047155


### Check coefficients

In [21]:
r2_df_0_coefficients = r2_df0.get_coefficients(model_name='xgboost')
r2_df_0_coefficients.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Season_Spring,0.148326
Purchase Amount (USD),0.133833
Size,0.131832
Previous Purchases,0.120794
Gender_Male,0.114983
Age,0.108035
Frequency of Purchases,0.096419
Discount Applied_Yes,0.074251
Season_Summer,0.071528
Category_Clothing,0.0


In [22]:
r2_df_1_coefficients = r2_df1.get_coefficients(model_name='xgboost')
r2_df_1_coefficients.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Size,0.239664
Age,0.131583
Purchase Amount (USD),0.118664
Season_Spring,0.117237
Season_Summer,0.110827
Previous Purchases,0.099604
Category_Clothing,0.092562
Frequency of Purchases,0.089858
Gender_Male,0.0
Discount Applied_Yes,0.0
