### _Imports_

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model  import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import AdaBoostRegressor

from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

import seaborn as sns
import matplotlib.pylab as plt

import functions as fn # importing pre-defined functions from .py file (inside notebooks folder)

### Upload diamonds train data from csv

In [3]:
# Load the training data. The path is relative, so the notebook must be run
# from a folder that is a sibling of `data/`.
diamonds_train = pd.read_csv('../data/diamonds_train.csv')
#diamonds_train.head()

**Remark**: An exploratory data analysis (EDA) of the diamonds dataset was carried out beforehand; see the [EDA repository](https://github.com/Kristinawk/EDA_Diamonds).  
  
This dataset has no null values, so we can proceed directly to feature engineering and encoding.

## 1. Features selection and engineering

During the EDA we saw that some diamonds have zeros in the size parameters x, y and z. These rows are better removed before training a machine learning model.

In [4]:
# Summary statistics of the numeric columns; the `min` row exposes zero-valued x, y and z.
diamonds_train.describe()

Unnamed: 0,price,carat,depth,table,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,3928.444469,0.797706,61.752841,57.446133,5.729392,5.732819,3.537154
std,3992.416147,0.475544,1.431725,2.233535,1.124453,1.14665,0.697062
min,326.0,0.2,43.0,43.0,0.0,0.0,0.0
25%,945.0,0.4,61.0,56.0,4.71,4.72,2.91
50%,2397.0,0.7,61.8,57.0,5.69,5.71,3.52
75%,5331.0,1.04,62.5,59.0,6.54,6.54,4.035
max,18823.0,4.5,79.0,95.0,10.23,58.9,8.06


In [5]:
# Number of rows in which at least one of the dimensions x, y, z is recorded as zero.
diamonds_train.query('x == 0 or y == 0 or z == 0').shape[0]

16

In total, there are 16 rows with this issue. Let's drop these rows.

In [6]:
# Remove, in place, every row where any of x, y, z equals zero.
zero_size_index = diamonds_train.query('x == 0 or y == 0 or z == 0').index
diamonds_train.drop(index=zero_size_index, inplace=True)

In [7]:
# Re-check the summary statistics after the zero-size rows were dropped.
diamonds_train.describe()

Unnamed: 0,price,carat,depth,table,x,y,z
count,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0,40439.0
mean,3926.535448,0.797425,61.753013,57.445543,5.729616,5.732897,3.538553
std,3990.024501,0.475257,1.431306,2.233055,1.122384,1.145002,0.693639
min,326.0,0.2,43.0,43.0,3.77,3.72,1.07
25%,945.0,0.4,61.0,56.0,4.71,4.72,2.91
50%,2396.0,0.7,61.8,57.0,5.69,5.71,3.52
75%,5329.5,1.04,62.5,59.0,6.54,6.54,4.04
max,18823.0,4.5,79.0,95.0,10.23,58.9,8.06


Additionally, we can run an outlier analysis on these variables.

In [8]:
# IQR-based outlier report for the three size columns.
# A value counts as an outlier when it lies on or beyond q1 - 1.5*IQR / q3 + 1.5*IQR.
for col in ['x', 'y', 'z']:
    column_values = diamonds_train[col]
    q1 = column_values.quantile(0.25)
    q3 = column_values.quantile(0.75)
    iqr = q3 - q1

    # Fences of the usual 1.5 * IQR rule.
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr

    # Boolean masks (inclusive comparison) converted to NumPy arrays so `.sum()` counts the hits.
    upper_array = (column_values >= upper).to_numpy()
    lower_array = (column_values <= lower).to_numpy()

    print(col, "Upper bound:", upper, "Lower bound:", lower)
    print(col, "Upper outliers", upper_array.sum(), "Lower outliers", lower_array.sum(), '\n')

x Upper bound: 9.285 Lower bound: 1.9649999999999999
x Upper outliers 18 Lower outliers 0 

y Upper bound: 9.27 Lower bound: 1.9899999999999993
y Upper outliers 18 Lower outliers 0 

z Upper bound: 5.734999999999999 Lower bound: 1.2150000000000003
z Upper outliers 19 Lower outliers 1 



In [9]:
# Number of rows above the upper fences in any of x, y, z.
# NOTE(review): the thresholds are hand-copied, rounded versions of the bounds printed by the
# IQR loop (9.285 / 9.27 / 5.735) and use a strict `>` where the loop counts with `>=`.
diamonds_train.query('x > 9.28 or y > 9.27 or z > 5.73').shape[0]

27

Let's also remove these 27 rows.

In [10]:
# Remove, in place, the rows above the upper fences in any of x, y, z.
# NOTE(review): thresholds are hard-coded; the single lower outlier reported for z is kept.
upper_outlier_index = diamonds_train.query('x > 9.28 or y > 9.27 or z > 5.73').index
diamonds_train.drop(index=upper_outlier_index, inplace=True)

### New features

In [11]:
#diamonds_train['volume'] = diamonds_train['x'] * diamonds_train['y'] * diamonds_train['z']

In [12]:
#diamonds_train['tot_depth_percentage'] = diamonds_train['z'] * diamonds_train['x']

In [13]:
# New feature: z minus the (depth / 100) share of z.
# NOTE(review): confirm the intended physical meaning of `fletin` against the EDA.
depth_share_of_z = (diamonds_train['depth'] / 100) * diamonds_train['z']
diamonds_train['fletin'] = diamonds_train['z'] - depth_share_of_z

In [14]:
# Preview the first rows to confirm the new `fletin` column was added.
diamonds_train.head()

Unnamed: 0,price,carat,city,depth,table,x,y,z,cut,color,clarity,fletin
0,4268,1.21,Dubai,62.4,58.0,6.83,6.79,4.25,Premium,J,VS2,1.598
1,505,0.32,Kimberly,63.0,57.0,4.35,4.38,2.75,Very Good,H,VS2,1.0175
2,2686,0.71,Las Vegas,65.5,55.0,5.62,5.53,3.65,Fair,G,VS1,1.25925
3,738,0.41,Kimberly,63.8,56.0,4.68,4.72,3.0,Good,D,SI1,1.086
4,4882,1.02,Dubai,60.5,59.0,6.55,6.51,3.95,Ideal,G,SI1,1.56025


In [None]:
# Split off the categorical columns so the remaining frame can be fed to `.corr()`.
cat_var_lst = [
    "city",
    "cut",
    "color",
    "clarity",
]
non_cat_var = diamonds_train.drop(columns=cat_var_lst)

In [None]:
# Pairwise correlation of the numeric columns (pandas default method), including the new `fletin`.
non_cat_var.corr()

In [None]:
# Heatmap of the numeric correlations on a fixed [-1, 1] colour scale with a diverging palette.
corr_palette = sns.diverging_palette(20, 220, as_cmap=True)
fig, ax = plt.subplots(figsize=(10, 5))
ax = sns.heatmap(non_cat_var.corr(), vmin=-1, vmax=1, cmap=corr_palette, ax=ax)
plt.tight_layout()

In [None]:
# Summary statistics of the numeric columns after outlier removal and feature creation.
non_cat_var.describe()

In [None]:
# Drop the raw size columns and `depth` in place; `fletin` (built from z and depth) stays.
diamonds_train.drop(columns=['x', 'y', 'z', 'depth'], inplace=True)

In [None]:
# Rebuild the numeric-only frame now that x, y, z and depth are gone.
cat_var_lst = [
    "city",
    "cut",
    "color",
    "clarity",
]
non_cat_var = diamonds_train.drop(columns=cat_var_lst)

In [None]:
# Correlations of the remaining numeric columns after the size columns were removed.
non_cat_var.corr()

## 2. Encoding

### Categorical variables overview

In [None]:
# Categorical columns that are passed to the one-hot encoding helper below.
cat_var_lst = ["city", "cut", "color", "clarity"]
#cat_var_overview = fn.cat_var(diamonds_train, cat_var_lst)
#cat_var_overview

### One hot encoding

In [None]:
# Encode the categorical columns with the project helper from functions.py.
# NOTE(review): presumably returns a frame where `cat_var_lst` is replaced by dummy columns — confirm.
baseline_train = fn.one_hot_encod(diamonds_train, cat_var_lst)

In [None]:
#baseline_train.head()

Checking that all features are numeric

In [None]:
#baseline_train.info()

### Save pre-processing to csv

In [None]:
#baseline_train.to_csv('../data/baseline_train.csv', index=False)

## 3. Model selection

In [None]:
# Target: price as float64. Features: every other column, in frame order, as a NumPy array.
y = baseline_train['price'].astype('float64').to_numpy()
X = baseline_train.drop(columns='price').to_numpy()

In [None]:
# Hold out 20% of the rows; the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
#print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
#print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

## 3. e) RandomForestRegressor

In [None]:
# Seed the forest so the cross-validation score is reproducible across kernel restarts.
# The notebook compares scores that differ by only a few points, so run-to-run noise from
# an unseeded estimator may be comparable to the differences being compared.
# Same seed as the train/test split above.
# NOTE(review): if fn.cross_val shuffles its folds it needs its own seed — check functions.py.
model_randomforest = RandomForestRegressor(random_state=42)

fn.cross_val(model_randomforest, X, y)

## 3. g) ExtraTreesRegressor

In [None]:
# Seed the estimator so both the cross-validation score and the final predictions
# (this same object is fitted and used for the submission further down) are reproducible.
# Same seed as the train/test split above.
# NOTE(review): if fn.cross_val shuffles its folds it needs its own seed — check functions.py.
model_extratrees = ExtraTreesRegressor(random_state=42)

fn.cross_val(model_extratrees, X, y)

**CONCLUSION**: the changes in features in this notebook improve model prediction: from 662 to 657 for RandomForest and from 613 to 612 for ExtraTrees.

### Grid Search: RandomForestRegressor

In [None]:
# Disabled: the RandomForest search grid is wrapped in a string literal, so `param_grid`
# is NOT defined by this cell (the cell only evaluates to the string itself).
'''param_grid = {'n_estimators': [100, 200, 300],  # Number of trees in the forest.
              'max_depth': [None, 3, 10],  # Maximum depth of the trees.
              'min_samples_split': [2, 10],  # Minimum number of samples required to split an internal node.
              'min_samples_leaf': [1, 4],  # Minimum number of samples required to be at a leaf node.
              'max_features': [None, 'sqrt', 'log2']  # Number of features to consider when looking for the best split.
              }'''

In [None]:
# Disabled: wrapped in a string literal, so no `grid_search` object is created here.
# To re-enable, un-quote this cell together with the `param_grid` cell above it.
'''grid_search = GridSearchCV(model_randomforest,
                           param_grid,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1)'''

In [None]:
#grid_search.fit(X, y)

#print('\n')
#print('Best hyperparameters: ', grid_search.best_params_, '\n')
#print('Best score: ', -grid_search.best_score_, '\n')

### Grid Search: ExtraTreesRegressor

In [None]:
# Disabled: the ExtraTrees grid (a single candidate per parameter) is wrapped in a string
# literal, so `param_grid` is NOT defined by this cell.
'''param_grid = {'n_estimators': [100],  # Number of trees in the forest.
              'max_depth': [None],  # Maximum depth of the trees.
              'min_samples_split': [6],  # Minimum number of samples required to split an internal node.
              'min_samples_leaf': [2],  # Minimum number of samples required to be at a leaf node.
              'max_features': [None]  # Number of features to consider when looking for the best split.
              }'''

In [None]:
# Disabled: wrapped in a string literal, so no `grid_search` object is created here.
# To re-enable, un-quote this cell together with the `param_grid` cell above it.
'''grid_search = GridSearchCV(model_extratrees,
                           param_grid,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1)'''

In [None]:
#grid_search.fit(X, y)

#print('\n')
#print('Best hyperparameters: ', grid_search.best_params_, '\n')
#print('Best score: ', -grid_search.best_score_, '\n')

### Fit Model

In [None]:
# Fit on the training split and predict the held-out split in one chain
# (`fit` returns the estimator itself, so `model_extratrees` ends up fitted as before).
y_pred = model_extratrees.fit(X_train, y_train).predict(X_test)

### Visual check

In [None]:
# Side-by-side table of actual vs. predicted prices on the held-out split, plus their difference.
check = pd.DataFrame({'Ground truth': y_test, 'Predictions': y_pred})
check['Diff'] = check['Ground truth'] - check['Predictions']
check

In [None]:
# Materialise the row number as an 'index' column for the plot's x axis.
# Guarded so the cell is idempotent: re-running an unguarded in-place reset would add a
# second index column (pandas falls back to the name 'level_0') and eventually raise.
if 'index' not in check.columns:
    check.reset_index(inplace=True)

In [None]:
# Line plot of actual vs. predicted price per held-out row; the trailing `;` hides the Axes repr.
check.plot(
    x='index',
    y=['Ground truth', 'Predictions'],
    kind='line',
    figsize=(12, 3),
);

## 4. Predictions

### Upload diamonds test data from csv

In [None]:
# Load the test data (same relative `../data` folder as the training file).
diamonds_test = pd.read_csv('../data/diamonds_test.csv')
#diamonds_test.head()

### Features engineering

In [None]:
#diamonds_test['volume'] = diamonds_test['x'] * diamonds_test['y'] * diamonds_test['z']

In [None]:
# Same `fletin` feature as on the training set: z minus the (depth / 100) share of z.
depth_share_of_z_test = (diamonds_test['depth'] / 100) * diamonds_test['z']
diamonds_test['fletin'] = diamonds_test['z'] - depth_share_of_z_test

In [None]:
# Mirror the training preprocessing: drop the raw size columns and `depth` in place.
diamonds_test.drop(columns=['x', 'y', 'z', 'depth'], inplace=True)

### One-hot-encoding

In [None]:
# Encode the test set with the same helper and the same categorical column list as the training set.
# NOTE(review): the encoding is computed independently per frame; verify the resulting dummy
# columns match the training ones (a category absent from the test file would change the column set).
baseline_test = fn.one_hot_encod(diamonds_test, cat_var_lst)

In [None]:
# `id` identifies the row and is not a model feature, so it is removed before predicting.
baseline_test = baseline_test.drop(columns=['id'])

In [None]:
#baseline_test.head()

### Predict: ExtraTreesRegressor

In [None]:
# The model was fitted on a plain NumPy array built from `baseline_train` (all columns except
# 'price', in frame order). Align the test frame to exactly those columns and that order before
# predicting, and pass an array as well: this avoids silently mis-ordered features and the
# scikit-learn "X has feature names" warning.
# Dummy columns absent from the test set are filled with 0 (category not present).
# NOTE(review): a missing non-dummy column would also be zero-filled — check the inputs if that can happen.
feature_cols = [col for col in baseline_train.columns if col != 'price']
X_submission = baseline_test.reindex(columns=feature_cols, fill_value=0).to_numpy()
extratrees_model_pred = model_extratrees.predict(X_submission)

In [None]:
# Display the predicted values for a quick sanity check.
extratrees_model_pred

### Save

In [None]:
# Hand the predictions to the project helper from functions.py; the last argument is
# presumably the label/name of this run's output — confirm in functions.py.
# NOTE(review): `diamonds_test` has had x, y, z and depth dropped in place by this point;
# verify save_pred only needs the remaining columns.
fn.save_pred(diamonds_test, extratrees_model_pred, 'encod01_featu04_vis3_model01')