In [1]:
import pandas as pd
from utils.regression import RegressionAnalysis
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from utils.rfe import apply_rfe

In [2]:
df = pd.read_csv('../data/cleaned_data_cluster.csv')
# select df without cluster column
df_without_cluster = df.drop(columns=['cluster'])

## Recursive feature elimination - 'Review Rating'

In [3]:
seleselected_features_rating = apply_rfe(df_without_cluster, 'Review Rating')

In [4]:
seleselected_features_rating

['Size',
 'Frequency of Purchases',
 'Age',
 'Purchase Amount (USD)',
 'Previous Purchases',
 'Gender_Male',
 'Category_Clothing',
 'Season_Spring',
 'Season_Summer',
 'Discount Applied_Yes',
 'Review Rating']

## Regression on features selected in RFE - 'Review Rating'

### Prepare datasets

In [5]:
df_0 = df[df['cluster'] == 0]
df_0 = df_0.drop(columns=['cluster'])
df_0_selected_features = df_0[seleselected_features_rating]

df_1 = df[df['cluster'] == 1]
df_1 = df_1.drop(columns=['cluster'])
df_1_selected_features = df_1[seleselected_features_rating]

### Perform regression

In [6]:
# Declare the regression analysis object
r1_df0 = RegressionAnalysis(data = df_0_selected_features, target_column = 'Review Rating')
r1_df1 = RegressionAnalysis(data = df_1_selected_features, target_column = 'Review Rating')

In [7]:
r1_df0.prepare_data()
r1_df0.fit_linear_regression()
r1_df0.tune_ridge_regression()
r1_df0.tune_lasso_regression()
r1_df0.tune_xgboost()
r1_df0.tune_random_forest()
r1_df_0_evaluation = r1_df0.evaluate_models()

In [8]:
r1_df1.prepare_data()
r1_df1.fit_linear_regression()
r1_df1.tune_ridge_regression()
r1_df1.tune_lasso_regression()
r1_df1.tune_xgboost()
r1_df1.tune_random_forest()
r1_df_1_evaluation = r1_df1.evaluate_models()

### Models evaluation

In [9]:
r1_df_0_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
1,ridge,0.286592,3082892000000.0,-0.01223
2,lasso,0.286878,3252254000000.0,-0.014253
0,linear,0.286917,2976683000000.0,-0.014529
3,xgboost,0.287297,3152886000000.0,-0.017221
4,random_forest,0.294252,3115572000000.0,-0.067069


In [10]:
r1_df_1_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
3,xgboost,0.280828,41031810000000.0,-0.000416
2,lasso,0.281128,40773200000000.0,-0.002554
1,ridge,0.2817,41301400000000.0,-0.006637
0,linear,0.282319,41363980000000.0,-0.011069
4,random_forest,0.287313,40249040000000.0,-0.047155


### Check coefficients

In [101]:
r1_df_0_coefficients = r1_df0.get_coefficients(model_name='ridge').sort_values(by='Coefficient', ascending=False)
r1_df_0_coefficients

Unnamed: 0,Coefficient
Purchase Amount (USD),0.039061
Season_Spring,0.011705
Season_Summer,0.008859
Gender_Male,0.005944
Size,0.002664
Category_Clothing,0.000997
Frequency of Purchases,0.000618
Discount Applied_Yes,-0.007269
Previous Purchases,-0.012278
Age,-0.032116


In [102]:
r1_df_1_coefficients = r1_df1.get_coefficients(model_name='xgboost').sort_values(by='Importance', ascending=False)
r1_df_1_coefficients

Unnamed: 0,Importance
Size,0.239664
Age,0.131583
Purchase Amount (USD),0.118664
Season_Spring,0.117237
Season_Summer,0.110827
Previous Purchases,0.099604
Category_Clothing,0.092562
Frequency of Purchases,0.089858
Gender_Male,0.0
Discount Applied_Yes,0.0


## Recursive feature elimination - 'Frequency of Purchases'

In [13]:
selecelected_features_frequency = apply_rfe(df_without_cluster, 'Frequency of Purchases')

In [14]:
selecelected_features_frequency

['Size',
 'Age',
 'Purchase Amount (USD)',
 'Review Rating',
 'Previous Purchases',
 'Gender_Male',
 'Category_Clothing',
 'Season_Spring',
 'Shipping Type_Standard',
 'Discount Applied_Yes',
 'Frequency of Purchases']

## Regression on features selected in RFE 'Frequency od Purchases'

### Prepare datasets

In [73]:
df_0_r2 = df[df['cluster'] == 0]
df_0_r2 = df_0_r2.drop(columns=['cluster'])
df_0_r2_selected_features = df_0_r2[selecelected_features_frequency]

df_1_r2 = df[df['cluster'] == 1]
df_1_r2 = df_1_r2.drop(columns=['cluster'])
df_1_r2_selected_features = df_1_r2[selecelected_features_frequency]

### Perform regression

In [74]:
# Declare the regression analysis object
r2_df0 = RegressionAnalysis(data = df_0_selected_features, target_column = 'Frequency of Purchases')
r2_df1 = RegressionAnalysis(data = df_1_selected_features, target_column = 'Frequency of Purchases')

In [75]:
r2_df0.prepare_data()
r2_df0.fit_linear_regression()
r2_df0.tune_ridge_regression()
r2_df0.tune_lasso_regression()
r2_df0.tune_xgboost()
r2_df0.tune_random_forest()
r2_df_0_evaluation = r2_df0.evaluate_models()

In [76]:
r2_df1.prepare_data()
r2_df1.fit_linear_regression()
r2_df1.tune_ridge_regression()
r2_df1.tune_lasso_regression()
r2_df1.tune_xgboost()
r2_df1.tune_random_forest()
r2_df_1_evaluation = r2_df1.evaluate_models()

### Models evaluation

In [77]:
r2_df_0_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
3,xgboost,0.682884,0.313487,0.002976
2,lasso,0.684146,0.311927,-0.000711
1,ridge,0.68478,0.313905,-0.002567
0,linear,0.685118,0.315572,-0.003558
4,random_forest,0.701065,0.338365,-0.050819


In [78]:
r2_df_1_evaluation.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,RMSE,MAPE,R2
2,lasso,0.831559,0.144026,-0.000507
1,ridge,0.834967,0.146315,-0.008725
3,xgboost,0.835019,0.147532,-0.008851
0,linear,0.837687,0.148145,-0.015308
4,random_forest,0.84952,0.151833,-0.044195


### Check coefficients

In [103]:
r2_df_0_coefficients = r2_df0.get_coefficients(model_name='xgboost').sort_values(by='Importance', ascending=False)
r2_df_0_coefficients

Unnamed: 0,Importance
Purchase Amount (USD),0.194307
Review Rating,0.184881
Age,0.177713
Previous Purchases,0.16514
Size,0.158515
Season_Spring,0.119444
Gender_Male,0.0
Category_Clothing,0.0
Season_Summer,0.0
Discount Applied_Yes,0.0


In [104]:
r2_df_1_coefficients = r2_df1.get_coefficients(model_name='ridge').sort_values(by='Coefficient', ascending=False)
r2_df_1_coefficients

Unnamed: 0,Coefficient
Age,0.06572
Review Rating,0.059374
Category_Clothing,0.056276
Gender_Male,0.018821
Previous Purchases,0.012866
Size,0.007504
Season_Spring,0.001404
Discount Applied_Yes,-0.009311
Season_Summer,-0.010969
Purchase Amount (USD),-0.052104


## Conclusion

The best model in regression 1 (y - 'Review Rating'):
* df_0 - ridge
* df_1 - xgboost

The best model in regression 2 (y - 'Frequency od Purchases'):
* df_0 - xgboost
* df_1 - ridge

In [39]:
r1_df_0_coefficients.to_csv('../results/regression/r1_df_0_coefficients.csv')

### Save results

In [105]:
# convert index of r1_df_0_coefficients to column
r1_df_0_coefficients.reset_index(level=0, inplace=True)
r1_df_1_coefficients.reset_index(level=0, inplace=True)

r2_df_0_coefficients.reset_index(level=0, inplace=True)
r2_df_1_coefficients.reset_index(level=0, inplace=True)

In [106]:
r2_df_0_coefficients

Unnamed: 0,index,Importance
0,Purchase Amount (USD),0.194307
1,Review Rating,0.184881
2,Age,0.177713
3,Previous Purchases,0.16514
4,Size,0.158515
5,Season_Spring,0.119444
6,Gender_Male,0.0
7,Category_Clothing,0.0
8,Season_Summer,0.0
9,Discount Applied_Yes,0.0


In [109]:
# change name of columns from 'Index' to 'Feature'
r1_df_0_coefficients.rename(columns={'index': 'Feature'}, inplace=True)
r1_df_1_coefficients.rename(columns={'index': 'Feature'}, inplace=True)

r2_df_0_coefficients.rename(columns={'index': 'Feature'}, inplace=True)
r2_df_1_coefficients.rename(columns={'index': 'Feature'}, inplace=True)

In [116]:
r1_df_0_coefficients['regression_id'] = 1
r1_df_0_coefficients['df_id'] = 0
r1_df_0_coefficients['model'] = 'ridge'

r1_df_1_coefficients['regression_id'] = 1
r1_df_1_coefficients['df_id'] = 1
r1_df_1_coefficients['model'] = 'xgboost'

r2_df_0_coefficients['regression_id'] = 2
r2_df_0_coefficients['df_id'] = 0
r2_df_0_coefficients['model'] = 'xgboost'

r2_df_1_coefficients['regression_id'] = 2
r2_df_1_coefficients['df_id'] = 1
r2_df_1_coefficients['model'] = 'ridge'

In [122]:
r1_df_0_coefficients.to_csv('../results/regression/r1_df_0_coefficients.csv', index = False)
r1_df_1_coefficients.to_csv('../results/regression/r1_df_1_coefficients.csv', index = False)

r2_df_0_coefficients.to_csv('../results/regression/r2_df_0_coefficients.csv', index = False)
r2_df_1_coefficients.to_csv('../results/regression/r2_df_1_coefficients.csv', index = False)

In [118]:
# merge all coefficients
all_coefficients = pd.concat([r1_df_0_coefficients, r1_df_1_coefficients, r2_df_0_coefficients, r2_df_1_coefficients])
all_coefficients.to_csv('../results/regression/all_coefficients.csv', index = False)