In [4]:
import pandas as pd
from utils.regression import RegressionAnalysis
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from utils.rfe import apply_rfe

In [5]:
df = pd.read_csv('../data/cleaned_data_cluster.csv')
# select df without cluster column
df_without_cluster = df.drop(columns=['cluster'])

## Recursive feature elimination - 'Review Rating'

In [6]:
selecelected_features_rating = apply_rfe(df_without_cluster, 'Review Rating')

In [7]:
selecelected_features_rating

['Size',
 'Frequency of Purchases',
 'Age',
 'Purchase Amount (USD)',
 'Previous Purchases',
 'Gender_Male',
 'Category_Clothing',
 'Season_Spring',
 'Season_Winter',
 'Discount Applied_Yes',
 'Review Rating']

## Regression on features selected in RFE - 'Review Rating'

### Prepare datasets

In [8]:
df_0 = df[df['cluster'] == 0]
df_0 = df_0.drop(columns=['cluster'])
df_0_selected_features = df_0[selecelected_features_rating]

df_1 = df[df['cluster'] == 1]
df_1 = df_1.drop(columns=['cluster'])
df_1_selected_features = df_1[selecelected_features_rating]

### Perform regression

In [9]:
# Declare the regression analysis object
r1_df0 = RegressionAnalysis(data = df_0_selected_features, target_column = 'Review Rating')
r1_df1 = RegressionAnalysis(data = df_1_selected_features, target_column = 'Review Rating')

In [10]:
r1_df0.prepare_data()
#r1_df0.fit_linear_regression()
#r1_df0.tune_ridge_regression()
#r1_df0.tune_lasso_regression()
r1_df0.tune_xgboost()
#r1_df0.tune_random_forest()
r1_df_0_evaluation = r1_df0.evaluate_models()

In [14]:
r1_df1.prepare_data()
#r1_df1.fit_linear_regression()
#r1_df1.tune_ridge_regression()
#r1_df1.tune_lasso_regression()
r1_df1.tune_xgboost()
#r1_df1.tune_random_forest()
r1_df_1_evaluation = r1_df1.evaluate_models()

### Models evaluation

In [12]:
r1_df_0_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
0,xgboost,0.287302,3148615000000.0,-0.017256


In [15]:
r1_df_1_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
0,xgboost,0.280952,41138450000000.0,-0.001303


### Check coefficients

In [16]:
r1_df_0_coefficients = r1_df0.get_coefficients(model_name='xgboost').sort_values(by='Importance', ascending=False)
r1_df_0_coefficients

Unnamed: 0,Importance
Season_Spring,0.152822
Purchase Amount (USD),0.136249
Size,0.134627
Previous Purchases,0.117241
Gender_Male,0.113718
Age,0.102707
Frequency of Purchases,0.093735
Discount Applied_Yes,0.078882
Season_Winter,0.070019
Category_Clothing,0.0


In [17]:
r1_df_1_coefficients = r1_df1.get_coefficients(model_name='xgboost').sort_values(by='Importance', ascending=False)
r1_df_1_coefficients

Unnamed: 0,Importance
Age,0.151293
Purchase Amount (USD),0.144764
Season_Winter,0.137296
Size,0.133915
Season_Spring,0.121975
Previous Purchases,0.116768
Category_Clothing,0.10221
Frequency of Purchases,0.091778
Gender_Male,0.0
Discount Applied_Yes,0.0


## Recursive feature elimination - 'Frequency of Purchases'

In [18]:
selecelected_features_frequency = apply_rfe(df_without_cluster, 'Frequency of Purchases')

In [19]:
selecelected_features_frequency

['Size',
 'Age',
 'Purchase Amount (USD)',
 'Review Rating',
 'Previous Purchases',
 'Gender_Male',
 'Category_Clothing',
 'Season_Spring',
 'Shipping Type_Free Shipping',
 'Shipping Type_Standard',
 'Frequency of Purchases']

## Regression on features selected in RFE 'Frequency od Purchases'

### Prepare datasets

In [20]:
df_0_r2 = df[df['cluster'] == 0]
df_0_r2 = df_0_r2.drop(columns=['cluster'])
df_0_r2_selected_features = df_0_r2[selecelected_features_frequency]

df_1_r2 = df[df['cluster'] == 1]
df_1_r2 = df_1_r2.drop(columns=['cluster'])
df_1_r2_selected_features = df_1_r2[selecelected_features_frequency]

### Perform regression

In [21]:
# Declare the regression analysis object
r2_df0 = RegressionAnalysis(data = df_0_selected_features, target_column = 'Frequency of Purchases')
r2_df1 = RegressionAnalysis(data = df_1_selected_features, target_column = 'Frequency of Purchases')

In [22]:
r2_df0.prepare_data()
#r2_df0.fit_linear_regression()
#r2_df0.tune_ridge_regression()
#r2_df0.tune_lasso_regression()
r2_df0.tune_xgboost()
#r2_df0.tune_random_forest()
r2_df_0_evaluation = r2_df0.evaluate_models()

In [23]:
r2_df1.prepare_data()
#r2_df1.fit_linear_regression()
#r2_df1.tune_ridge_regression()
#r2_df1.tune_lasso_regression()
r2_df1.tune_xgboost()
#r2_df1.tune_random_forest()
r2_df_1_evaluation = r2_df1.evaluate_models()

### Models evaluation

In [24]:
r2_df_0_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
0,xgboost,0.682884,0.313487,0.002976


In [25]:
r2_df_1_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
0,xgboost,0.834912,0.147462,-0.008593


### Check coefficients

In [26]:
r2_df_0_coefficients = r2_df0.get_coefficients(model_name='xgboost').sort_values(by='Importance', ascending=False)
r2_df_0_coefficients

Unnamed: 0,Importance
Purchase Amount (USD),0.194307
Review Rating,0.184881
Age,0.177713
Previous Purchases,0.16514
Size,0.158515
Season_Spring,0.119444
Gender_Male,0.0
Category_Clothing,0.0
Season_Winter,0.0
Discount Applied_Yes,0.0


In [27]:
r2_df_1_coefficients = r2_df1.get_coefficients(model_name='xgboost').sort_values(by='Importance', ascending=False)
r2_df_1_coefficients

Unnamed: 0,Importance
Season_Winter,0.165219
Gender_Male,0.156775
Age,0.136669
Purchase Amount (USD),0.126254
Season_Spring,0.123808
Previous Purchases,0.123127
Review Rating,0.100373
Category_Clothing,0.067776
Size,0.0
Discount Applied_Yes,0.0


## Conclusion

The best model in regression 1 (y - 'Review Rating'):
* df_0 - ridge
* df_1 - xgboost

The best model in regression 2 (y - 'Frequency od Purchases'):
* df_0 - xgboost
* df_1 - ridge

Finally we take everywhere xgboost to compare Importance score

### Save results

In [28]:
# convert index of r1_df_0_coefficients to column
r1_df_0_coefficients.reset_index(level=0, inplace=True)
r1_df_1_coefficients.reset_index(level=0, inplace=True)

r2_df_0_coefficients.reset_index(level=0, inplace=True)
r2_df_1_coefficients.reset_index(level=0, inplace=True)

In [29]:
# change name of columns from 'Index' to 'Feature'
r1_df_0_coefficients.rename(columns={'index': 'Feature'}, inplace=True)
r1_df_1_coefficients.rename(columns={'index': 'Feature'}, inplace=True)

r2_df_0_coefficients.rename(columns={'index': 'Feature'}, inplace=True)
r2_df_1_coefficients.rename(columns={'index': 'Feature'}, inplace=True)

In [30]:
r1_df_0_coefficients['regression_id'] = 1
r1_df_0_coefficients['df_id'] = 0
r1_df_0_coefficients['model'] = 'xgboost'

r1_df_1_coefficients['regression_id'] = 1
r1_df_1_coefficients['df_id'] = 1
r1_df_1_coefficients['model'] = 'xgboost'

r2_df_0_coefficients['regression_id'] = 2
r2_df_0_coefficients['df_id'] = 0
r2_df_0_coefficients['model'] = 'xgboost'

r2_df_1_coefficients['regression_id'] = 2
r2_df_1_coefficients['df_id'] = 1
r2_df_1_coefficients['model'] = 'xgboost'

In [31]:
r1_df_0_coefficients.to_csv('../results/regression/r1_df_0_coefficients.csv', index = False)
r1_df_1_coefficients.to_csv('../results/regression/r1_df_1_coefficients.csv', index = False)

r2_df_0_coefficients.to_csv('../results/regression/r2_df_0_coefficients.csv', index = False)
r2_df_1_coefficients.to_csv('../results/regression/r2_df_1_coefficients.csv', index = False)

In [32]:
# merge all coefficients
all_coefficients = pd.concat([r1_df_0_coefficients, r1_df_1_coefficients, r2_df_0_coefficients, r2_df_1_coefficients])
all_coefficients.to_csv('../results/regression/all_coefficients.csv', index = False)