In [None]:
from transformers import BertModel, BertTokenizer
import torch

In [None]:
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Encode one sample sentence to inspect what encode_plus returns.
# max_length=5 is presumably chosen small on purpose so that the overflow
# output requested below is non-empty — TODO confirm; newer transformers
# releases may also expect an explicit truncation argument here.
tokenized_dict = tokenizer.encode_plus(
    "hi my name is kashif",
    add_special_tokens=True,
    max_length=5,
    return_overflowing_tokens=True,
    return_special_tokens_mask=True
    )

In [None]:
tokenized_dict

In [None]:
# Load the pretrained encoder and run the sample token ids through it once.
bert_model = BertModel.from_pretrained("bert-base-uncased")
tokenized_text = torch.tensor(tokenized_dict["input_ids"])
with torch.no_grad():
    # `tokenized_text` is already a tensor: wrapping it in torch.tensor() again
    # only makes a redundant copy and raises a UserWarning. unsqueeze(0) adds
    # the leading batch axis the model is called with.
    embeddings = bert_model(tokenized_text.unsqueeze(0))

In [None]:
from typing import Callable, List, Optional, Tuple

import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch


class BertTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that maps raw strings to BERT embeddings.

    Every input string is tokenized and sent through the model on its own
    (batch of one); the model output is reduced to a single tensor by
    ``embedding_func`` and the per-string tensors are stacked.

    Parameters
    ----------
    bert_tokenizer
        Object with an ``encode_plus(text, add_special_tokens=..., max_length=...)``
        method whose result has an ``"input_ids"`` entry.
    bert_model
        Object callable as ``bert_model(input_ids, attention_mask)``.
        ``eval()`` is called on it when the transformer is constructed.
    max_length : int, default 200
        Forwarded to ``encode_plus`` as ``max_length``.
    embedding_func : callable, optional
        Reduces the raw model output to one tensor per string. When ``None``
        the first-position vector of the first model output is used.

    Notes
    -----
    Constructor arguments are stored unmodified under their own names, as
    ``BaseEstimator.get_params`` / ``sklearn.base.clone`` require. The
    ``tokenizer`` and ``model`` attributes remain available as aliases.
    """

    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 200,
            embedding_func: Optional[Callable[..., torch.Tensor]] = None,
    ):
        self.bert_tokenizer = bert_tokenizer
        self.bert_model = bert_model
        self.max_length = max_length
        # Kept exactly as passed (possibly None); the default is resolved at
        # call time so that cloning the estimator does not see a modified param.
        self.embedding_func = embedding_func
        self.bert_model.eval()

    @property
    def tokenizer(self):
        """Alias of ``bert_tokenizer`` (the attribute name used originally)."""
        return self.bert_tokenizer

    @tokenizer.setter
    def tokenizer(self, value):
        self.bert_tokenizer = value

    @property
    def model(self):
        """Alias of ``bert_model`` (the attribute name used originally)."""
        return self.bert_model

    @model.setter
    def model(self, value):
        self.bert_model = value

    @staticmethod
    def _first_token_embedding(model_output) -> torch.Tensor:
        """Default reduction: position 0 of the first model output, squeezed."""
        return model_output[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(input_ids, attention_mask)`` for one string, each with a leading batch axis."""
        # Tokenize the text with the provided tokenizer
        tokenized_text = self.tokenizer.encode_plus(text,
                                                    add_special_tokens=True,
                                                    max_length=self.max_length
                                                    )["input_ids"]

        # Create an attention mask telling BERT to use all words
        attention_mask = [1] * len(tokenized_text)

        # bert takes in a batch so we need to unsqueeze the rows
        return (
            torch.tensor(tokenized_text).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.Tensor:
        """Embed a single string: tokenize, run the model, apply the reduction."""
        tokenized, attention_mask = self._tokenize(text)

        embeddings = self.model(tokenized, attention_mask)
        reduce = self.embedding_func
        if reduce is None:
            reduce = self._first_token_embedding
        return reduce(embeddings)

    def transform(self, text: List[str]):
        """Embed every string in ``text`` and return the stacked tensor.

        A ``pandas.Series`` is converted to a list first. The model is run
        under ``torch.no_grad()``. ``torch.stack`` is applied to the
        per-string results, so they must all have the same shape and ``text``
        must not be empty.
        """
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

In [None]:
# Machine-specific root directory of the thesis outputs.
project_folder = "C:/Users/muhammadkashifkhan/Documents/ASDS_2nd/Thesis/output_kashif/"
# All CSV files are read from / written to the "output_kashif" folder below the root.
output_folder = f"{project_folder}output_kashif"

## Normalize price and split dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the preprocessed listings table.
df = pd.read_csv(output_folder+"/"+"all_after_preprocessing6.csv")#all_after_preprocessing6

# Rescale the target column to [0, 1].
# `price_scaler` keeps the fitted scaler reachable for inverse_transform:
# the generic name `scaler` is rebound to other scalers further down.
price_scaler = MinMaxScaler()
scaler = price_scaler
# Assign the flattened array instead of a freshly indexed DataFrame so the
# values are written positionally and cannot be misaligned by df's index.
df["price"] = price_scaler.fit_transform(df[["price"]]).ravel()



# split the dataset into train and test sets (done in a later cell)


In [None]:
# Listings without a description get a single space so the tokenizer always receives a string.
df["description"] = df["description"].fillna(" ")

In [None]:
df.loc[df['description'].isnull()]

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["description"],
    df["price"],
    test_size=0.1,
    random_state=13,
)

In [None]:
import numpy as np 
from sklearn import ensemble

# Hyper-parameters of the baseline gradient-boosting regressor.
# The former entry 'loss': 'ls' is dropped: least squares is the estimator's
# default loss, and the 'ls' spelling is rejected by current scikit-learn
# releases, so omitting the key gives the same model on old and new versions.
params = {'n_estimators': 1000,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01}
# 1-D view of the target for estimators that expect a flat array.
y_train_flat = np.ravel(y_train)

regressor = ensemble.GradientBoostingRegressor(**params)

In [None]:
from sklearn.pipeline import Pipeline


# Feature extractor: raw description strings -> BERT embeddings.
bert_transformer = BertTransformer(tokenizer, bert_model)

# Two-step estimator: embed the text, then regress on the embeddings.
model_bert_gb = Pipeline(steps=[
    ("vectorizer", bert_transformer),
    ("regressor", regressor),
])



## check what the transformed vectors look like

In [None]:
X_train[X_train.isnull()==True]

In [None]:
# Embed every training description once, so the grid searches below can
# reuse the vectors instead of re-running BERT for each fit.
BERT = BertTransformer(bert_tokenizer=tokenizer, bert_model=bert_model)
transformed_X_train = BERT.transform(X_train)

In [None]:
transformed_X_train

In [None]:
# Convert the stacked embeddings (a torch tensor returned by BERT.transform)
# into a float32 NumPy array for the estimators below. The name is rebound,
# so after this cell it refers to an ndarray, not a tensor.
transformed_X_train = np.asarray(transformed_X_train).astype(np.float32)
#X_test = np.asarray(X_test).astype(np.float32)

In [None]:
transformed_X_train.min()

In [None]:
transformed_X_train.max()

In [None]:
y_train.shape

## Grid Search
### Feed-forward neural network (Keras), when only description data is used

#### Function to create model, required for KerasRegressor

In [None]:
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD

def create_model(learn_rate=0.001, amsgrad=False, activation='relu', dropout_rate=0.0, neurons=50):
    """Build and compile the dense regression network tuned by the grid search.

    Architecture: Dense(neurons) -> Dropout(dropout_rate) -> Dense(2*neurons)
    -> Dropout(dropout_rate/2) -> Dense(neurons) -> Dense(1, sigmoid).
    The input width is not declared here.

    Args:
        learn_rate: learning rate handed to the Adam optimizer.
        amsgrad: forwarded to Adam's ``amsgrad`` option.
        activation: activation used by the three hidden Dense layers.
        dropout_rate: rate of the first Dropout layer; the second uses half of it.
        neurons: width of the first and third hidden layers (the middle one is twice as wide).

    Returns:
        The compiled ``Sequential`` model (MSE loss, MSE metric).
    """
    # The sigmoid output confines predictions to (0, 1).
    layer_stack = [
        Dense(neurons, activation=activation),
        Dropout(dropout_rate),
        Dense(neurons*2, activation=activation),
        Dropout(dropout_rate/2),
        Dense(neurons, activation=activation),
        Dense(1, activation='sigmoid'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(
        loss='mse',
        optimizer=Adam(learning_rate=learn_rate, amsgrad=amsgrad),
        metrics=['mse'],
    )
    return network

# Seed NumPy's global random generator.
# NOTE(review): this does not seed the Keras backend itself — confirm whether that is needed.
seed = 7
numpy.random.seed(seed)

# scikit-learn compatible wrapper around create_model.
model = KerasRegressor(build_fn=create_model, verbose=10)

# Candidate values for each tuned hyper-parameter.
batch_size = [10, 20]
epochs = [50, 75, 100]
learn_rate = [0.0001, 0.001, 0.01]
amsgrad = [False]
activation = ['relu', 'sigmoid']
dropout_rate = [0.1, 0.2]
neurons = [50, 100]

param_grid = {
    'batch_size': batch_size,
    'epochs': epochs,
    'learn_rate': learn_rate,
    'amsgrad': amsgrad,
    'activation': activation,
    'dropout_rate': dropout_rate,
    'neurons': neurons,
}

# 5-fold search scored on R2 and RMSE; the best-R2 configuration is refit at the end.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=('r2', 'neg_root_mean_squared_error'),
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid_result = grid.fit(transformed_X_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# Report of the neural-network grid search: best configuration first,
# then the mean cross-validated scores of every configuration.
rule = "-------------------------------------------------------------------"

print(rule)
print("Neural Network Grid Search Results")
print(rule)
print("\n The best estimator across ALL searched params:\n",grid.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid.best_score_)
print("\n The best parameters across ALL searched params:\n",grid.best_params_)
print("\n")
print(rule)
print("All Results:") 

param_com = grid.cv_results_['params']
r2_scores = grid.cv_results_['mean_test_r2']
rmse_scores = grid.cv_results_['mean_test_neg_root_mean_squared_error']
print(rule)
for item, r2_value, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_value))
    print("\n")
    # the scorer reports negated RMSE, so flip the sign for display
    print("test RMSE score:"+str(neg_rmse*-1))
    print(rule)


In [None]:
# Tabulate the search: one row per configuration, best R2 first
# (ties broken by the negated RMSE, also descending).
gd_result = (
    pd.DataFrame(grid.cv_results_)[[
        'param_batch_size',
        'param_epochs',
        'param_neurons',
        'param_activation',
        'param_learn_rate',
        'param_dropout_rate',
        'mean_test_r2',
        'mean_test_neg_root_mean_squared_error',
    ]]
    .sort_values(
        by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'],
        ascending=False,
    )
)
gd_result.head(50)

In [None]:
# Persist the ranked grid-search table as CSV (row index omitted).
gd_result.to_csv(output_folder+"/"+"gridsearch_NN_BERT_descriptiononly_5fold_bound.csv", index=False)

## Grid Search
### Random Forest, when only description data is used

In [None]:
import numpy as np
from sklearn import ensemble
# Random-forest grid search on the description embeddings.
# The variable names GBR / grid_GBR are reused from the gradient-boosting
# cells and kept because later cells read them.
GBR = ensemble.RandomForestRegressor()


parameters = {'bootstrap': [False], #True, 
              'max_depth': [30],  #5, 10, 20,, None
              'max_features': ['sqrt'], #'auto',
              'n_estimators': [500]} #32, 64, 100, 1000

y_train_flat=np.ravel(y_train)

from sklearn.model_selection import GridSearchCV
grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters,scoring=('r2', 'neg_root_mean_squared_error'),refit='r2', verbose=10, cv = 5, n_jobs=-1)
grid_GBR.fit(transformed_X_train, y_train_flat)

rule = "-------------------------------------------------------------------"
print(rule)
# Banner corrected: this cell searches a random forest, not gradient boosting.
print("Random Forest Grid Search Results")
print(rule)
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print(rule)
print("All Results:") 

param_com=grid_GBR.cv_results_['params']
r2_scores=grid_GBR.cv_results_['mean_test_r2']
rmse_scores=grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print(rule)
for item, r2_value, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_value))
    print("\n")
    # the scorer reports negated RMSE, so flip the sign for display
    print("test RMSE score:"+str(neg_rmse*-1))
    print(rule)


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)


gd_result=df_gridsearch_result[['param_n_estimators','param_max_depth','param_max_features','param_bootstrap','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]

In [None]:
# Best R2 first; ties broken by the negated RMSE (also descending).
gd_result = gd_result.sort_values(
    by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'],
    ascending=False,
)
gd_result.head(50)

In [None]:
# Persist the ranked random-forest grid-search table as CSV (row index omitted).
gd_result.to_csv(output_folder+"/"+"gridsearch_randomforest_BERT_descriptiononly_5fold_Lem.csv", index=False)

## Grid Search
### Gradient Boosting, using only description data

# NOTE(review): this code has no cell marker in the export — presumably a
# disabled (non-code) cell; confirm before relying on its results.
import numpy as np
from sklearn import ensemble
# Base estimator whose hyper-parameters are searched below.
GBR = ensemble.GradientBoostingRegressor()

# Candidate values per hyper-parameter (2 x 2 x 2 x 2 = 16 combinations).
parameters = {'n_estimators' : [500,1000], # 100 removed
              'max_depth'    : [4,6], # 3 removed
                                       #'min_samples_split': [2, 5, 8],
              'learning_rate': [0.01,0.02], # 0.005 removed
                                     #'loss': ['ls'], # remove huber loss
              'subsample'    : [1, 0.8] 
             }

# Flatten the target to 1-D.
y_train_flat=np.ravel(y_train)

from sklearn.model_selection import GridSearchCV
# 5-fold search scored on R2 and RMSE; the best-R2 configuration is refit.
grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters,scoring=('r2', 'neg_root_mean_squared_error'),refit='r2', verbose=10, cv = 5, n_jobs=-1)
grid_GBR.fit(transformed_X_train, y_train_flat)

print("-------------------------------------------------------------------")
print("Gradient Boosting Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

# Per-configuration mean scores across the folds, in matching order.
param_com=grid_GBR.cv_results_['params']
r2_scores=grid_GBR.cv_results_['mean_test_r2']
rmse_scores=grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
index=0
for item in param_com:
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_scores[index]))
    print("\n")
    # the scorer reports negated RMSE, so the sign is flipped for display
    print("test RMSE score:"+str(rmse_scores[index]*-1))
    print("-------------------------------------------------------------------")
    index=index+1


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)


# Keep only the searched parameters and the two mean test scores.
gd_result=df_gridsearch_result[['param_n_estimators','param_max_depth','param_learning_rate','param_subsample','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]

# Best R2 first; ties broken by the negated RMSE (also descending).
gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

# Persist the ranked table as CSV (row index omitted).
gd_result.to_csv(output_folder+"/"+"gridsearch_gradientboosting_BERT_descriptiononly_5fold.csv", index=False)

## use only the description data to predict

## Check how long it takes to finish the cross-validation
# NOTE(review): this code has no cell marker in the export — presumably a
# disabled (non-code) cell; confirm before relying on its results.
import time
# Start of the wall-clock measurement.
tic = time.perf_counter()

from sklearn.model_selection import cross_validate
# 10-fold cross-validation of `regressor` on the description embeddings,
# collecting R2 and negated RMSE for both the train and the test folds.
scores = cross_validate(regressor, transformed_X_train, y_train, scoring=('r2', 'neg_root_mean_squared_error'), cv=10, return_train_score=True)

# RMSE is reported negated by the scorer, hence the multiplication by -1.
print("RMSE training Score using cv: {:0.5f}".format(scores['train_neg_root_mean_squared_error'].mean() * -1))

print("RMSE test Score using cv: {:0.5f}".format(scores['test_neg_root_mean_squared_error'].mean() * -1))

# R2 is printed as returned (no sign flip).
print("R2 training Score using cv: {:0.5f}".format(scores['train_r2'].mean()))

print("R2 test Score using cv: {:0.5f}".format(scores['test_r2'].mean()))

# Elapsed time in minutes.
toc = time.perf_counter()
print(f"Finish cross validation in  {(toc - tic)/60:0.2f} minutes")

## Try to use all features to predict

## description word vectors


In [None]:
# Embed the description of every row (not only the training split), so the
# vectors can be joined with the other feature groups before splitting.
BERT = BertTransformer(bert_tokenizer=tokenizer, bert_model=bert_model)
transformed_X_train = BERT.transform(df["description"])

In [None]:
# Turn the stacked embeddings into a DataFrame with one row per listing.
# np.asarray accepts both the torch tensor returned by BERT.transform and an
# ndarray, so re-running this cell no longer fails with
# "'numpy.ndarray' object has no attribute 'numpy'".
transformed_X_train = np.asarray(transformed_X_train)
df_desc = pd.DataFrame(transformed_X_train)

In [None]:
df_desc.shape

In [None]:
df_desc.head()

## numerical features

In [None]:
numerical_features=["bedrooms","baths", 'size', 'longitude', "latitude"]

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_num=df[numerical_features]
X_num.head()

## Normalization for numerical data (exclude longitude and latitude) using MinMaxScaler


In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every numerical column to [0, 1].
X_num_columns = X_num.columns
scaler = MinMaxScaler()

# fit_transform returns a bare ndarray, so wrap it back into a DataFrame
# carrying the original column names.
X_num = pd.DataFrame(scaler.fit_transform(X_num), columns=X_num_columns)



## Normalization for longitude and latitude separately

In [None]:
# Longitude/latitude are scaled by a fixed factor instead of min-max.
normed_long = df["longitude"] * 0.01
normed_lat = df["latitude"] * 0.01
# DataFrame.drop returns a new frame; the original call discarded it, so the
# min-max-scaled longitude/latitude stayed in X_num and the concat produced
# duplicate "longitude"/"latitude" columns. Drop them for real before
# appending the rescaled versions.
X_num = pd.concat(
    [X_num.drop(columns=["longitude", "latitude"]), normed_long, normed_lat],
    axis=1,
)

## Boolean features

## Convert categorical data with string values into numerical values

In [None]:
# Take an explicit copy: the next cell assigns into X_category, which would
# otherwise trigger pandas' SettingWithCopyWarning on a slice of df.
X_category = df[['location']].copy()

In [None]:
## replace each categorical column by its integer category codes
cate_features = ['location']
for col in cate_features:
    X_category[col] = X_category[col].astype('category').cat.codes

In [None]:
X_category.head()

## Normalize the categorical data 

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the encoded categorical columns to [0, 1].
X_category_columns = X_category.columns
scaler = MinMaxScaler()

# fit_transform returns a bare ndarray, so wrap it back into a DataFrame
# carrying the original column names.
X_category = pd.DataFrame(scaler.fit_transform(X_category), columns=X_category_columns)

X_category.head()

## Use numerical, categorical, and description data to predict

In [None]:
# Place the three feature groups side by side (rows are matched on the index).
X_all = pd.concat(
    [X_num, X_category, df_desc],
    axis=1,
)
X_all.head()

## divide dataset

In [None]:
# Same 90/10 split and seed as for the description-only experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    df["price"],
    random_state=13,
    test_size=0.1,
)

In [None]:
# Convert the training features to a float32 NumPy array for the estimators
# below. The name is rebound, so X_train is no longer a DataFrame afterwards.
X_train = np.asarray(X_train).astype(np.float32)
#X_test = np.asarray(X_test).astype(np.float32)

## Grid Search
### Feed-forward neural network (Keras), when all features are used

#### Function to create model, required for KerasRegressor

In [None]:
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD

def create_model(learn_rate=0.001, amsgrad=False, activation='relu', dropout_rate=0.0, neurons=50):
    """Build and compile the dense regression network tuned by the grid search.

    Architecture: Dense(neurons) -> Dropout(dropout_rate) -> Dense(2*neurons)
    -> Dropout(dropout_rate/2) -> Dense(neurons) -> Dense(1, sigmoid).
    The input width is not declared here.

    Args:
        learn_rate: learning rate handed to the Adam optimizer.
        amsgrad: forwarded to Adam's ``amsgrad`` option.
        activation: activation used by the three hidden Dense layers.
        dropout_rate: rate of the first Dropout layer; the second uses half of it.
        neurons: width of the first and third hidden layers (the middle one is twice as wide).

    Returns:
        The compiled ``Sequential`` model (MSE loss, MSE metric).
    """
    # The sigmoid output confines predictions to (0, 1).
    layer_stack = [
        Dense(neurons, activation=activation),
        Dropout(dropout_rate),
        Dense(neurons*2, activation=activation),
        Dropout(dropout_rate/2),
        Dense(neurons, activation=activation),
        Dense(1, activation='sigmoid'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(
        loss='mse',
        optimizer=Adam(learning_rate=learn_rate, amsgrad=amsgrad),
        metrics=['mse'],
    )
    return network

# Seed NumPy's global random generator.
# NOTE(review): this does not seed the Keras backend itself — confirm whether that is needed.
seed = 7
numpy.random.seed(seed)

# scikit-learn compatible wrapper around create_model.
model = KerasRegressor(build_fn=create_model, verbose=10)

# Candidate values for each tuned hyper-parameter.
batch_size = [10, 20]
epochs = [50, 75, 100]
learn_rate = [0.0001, 0.001, 0.01]
amsgrad = [False]
activation = ['relu', 'sigmoid']
dropout_rate = [0.1, 0.2]
neurons = [50, 100]

param_grid = {
    'batch_size': batch_size,
    'epochs': epochs,
    'learn_rate': learn_rate,
    'amsgrad': amsgrad,
    'activation': activation,
    'dropout_rate': dropout_rate,
    'neurons': neurons,
}

# 5-fold search scored on R2 and RMSE; the best-R2 configuration is refit at the end.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=('r2', 'neg_root_mean_squared_error'),
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# Report of the neural-network grid search: best configuration first,
# then the mean cross-validated scores of every configuration.
rule = "-------------------------------------------------------------------"

print(rule)
print("Neural Network Grid Search Results")
print(rule)
print("\n The best estimator across ALL searched params:\n",grid.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid.best_score_)
print("\n The best parameters across ALL searched params:\n",grid.best_params_)
print("\n")
print(rule)
print("All Results:") 

param_com = grid.cv_results_['params']
r2_scores = grid.cv_results_['mean_test_r2']
rmse_scores = grid.cv_results_['mean_test_neg_root_mean_squared_error']
print(rule)
for item, r2_value, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_value))
    print("\n")
    # the scorer reports negated RMSE, so flip the sign for display
    print("test RMSE score:"+str(neg_rmse*-1))
    print(rule)


In [None]:
# Tabulate the search: one row per configuration, best R2 first
# (ties broken by the negated RMSE, also descending).
gd_result = (
    pd.DataFrame(grid.cv_results_)[[
        'param_batch_size',
        'param_epochs',
        'param_neurons',
        'param_activation',
        'param_learn_rate',
        'param_dropout_rate',
        'mean_test_r2',
        'mean_test_neg_root_mean_squared_error',
    ]]
    .sort_values(
        by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'],
        ascending=False,
    )
)
gd_result.head(50)

In [None]:
# Persist the ranked grid-search table as CSV (row index omitted).
gd_result.to_csv(output_folder+"/"+"gridsearch_NN_BERT_all_5fold_bound.csv", index=False)

## Grid Search
### Random Forest, when all features are used

In [None]:
import numpy as np
from sklearn import ensemble
# Random-forest grid search on the combined feature matrix.
# The variable names GBR / grid_GBR are reused from the gradient-boosting
# cells and kept because later cells read them.
GBR = ensemble.RandomForestRegressor()


# max_features: None replaces the former 'auto'. For a forest *regressor*
# both mean "use all features", but the 'auto' spelling is rejected by
# current scikit-learn releases while None is accepted by old and new ones.
parameters = {'bootstrap': [True, False],
              'max_depth': [10, 20, 30, None], #5
              'max_features': [None, 'sqrt'],
              'n_estimators': [32, 64, 100, 500]} #1000

y_train_flat=np.ravel(y_train)

from sklearn.model_selection import GridSearchCV
grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters,scoring=('r2', 'neg_root_mean_squared_error'),refit='r2', verbose=10, cv = 5, n_jobs=-1)
grid_GBR.fit(X_train, y_train_flat)

rule = "-------------------------------------------------------------------"
print(rule)
# Banner corrected: this cell searches a random forest, not gradient boosting.
print("Random Forest Grid Search Results")
print(rule)
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print(rule)
print("All Results:") 

param_com=grid_GBR.cv_results_['params']
r2_scores=grid_GBR.cv_results_['mean_test_r2']
rmse_scores=grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print(rule)
for item, r2_value, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_value))
    print("\n")
    # the scorer reports negated RMSE, so flip the sign for display
    print("test RMSE score:"+str(neg_rmse*-1))
    print(rule)


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)


gd_result=df_gridsearch_result[['param_n_estimators','param_max_depth','param_max_features','param_bootstrap','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]

In [None]:
# Best R2 first; ties broken by the negated RMSE (also descending).
gd_result = gd_result.sort_values(
    by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'],
    ascending=False,
)
gd_result.head(50)

In [None]:
# Persist the ranked random-forest grid-search table as CSV (row index omitted).
gd_result.to_csv(output_folder+"/"+"gridsearch_randomforest_BERT_all_5fold.csv", index=False)

## Grid Search
### Gradient Boosting, using all data

In [None]:
import numpy as np
from sklearn import ensemble
# Gradient-boosting grid search on the combined feature matrix.
GBR = ensemble.GradientBoostingRegressor()

# Candidate values per hyper-parameter (2 x 2 x 2 x 2 = 16 combinations).
parameters = {
    'n_estimators': [500, 1000],
    'max_depth': [4, 6],
    'learning_rate': [0.01, 0.02],
    'subsample': [1, 0.8],
}

y_train_flat = np.ravel(y_train)

from sklearn.model_selection import GridSearchCV
# 5-fold search scored on R2 and RMSE; the best-R2 configuration is refit.
grid_GBR = GridSearchCV(
    estimator=GBR,
    param_grid=parameters,
    scoring=('r2', 'neg_root_mean_squared_error'),
    refit='r2',
    cv=5,
    verbose=10,
)
grid_GBR.fit(X_train, y_train_flat)

rule = "-------------------------------------------------------------------"
print(rule)
print("Gradient Boosting Grid Search Results")
print(rule)
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print(rule)
print("All Results:") 

param_com = grid_GBR.cv_results_['params']
r2_scores = grid_GBR.cv_results_['mean_test_r2']
rmse_scores = grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print(rule)
for item, r2_value, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_value))
    print("\n")
    # the scorer reports negated RMSE, so flip the sign for display
    print("test RMSE score:"+str(neg_rmse*-1))
    print(rule)

df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)

# Keep only the searched parameters and the two mean test scores.
gd_result = df_gridsearch_result[[
    'param_n_estimators',
    'param_max_depth',
    'param_learning_rate',
    'param_subsample',
    'mean_test_r2',
    'mean_test_neg_root_mean_squared_error',
]]

In [None]:
# Best R2 first; ties broken by the negated RMSE (also descending).
gd_result = gd_result.sort_values(
    by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'],
    ascending=False,
)
gd_result.head(50)

In [None]:
# Persist the ranked gradient-boosting grid-search table as CSV (row index omitted).
gd_result.to_csv(output_folder+"/"+"gridsearch_gradientboosting_BERT_all_5fold.csv", index=False)

## create gradient boosting model

In [None]:
import numpy as np 
# Hyper-parameters of the final gradient-boosting model.
# The former entry 'loss': 'ls' is dropped: least squares is the estimator's
# default loss, and the 'ls' spelling is rejected by current scikit-learn
# releases, so omitting the key gives the same model on old and new versions.
params = {'n_estimators': 1000,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01}

reg = ensemble.GradientBoostingRegressor(**params)

# Fit on the combined training features with a flat (1-D) target.
y_train_flat = np.ravel(y_train)
reg.fit(X_train, y_train_flat)

## do cross validation

In [None]:
import time

tic = time.perf_counter()

In [None]:
from sklearn.model_selection import cross_validate
# 10-fold cross-validation of the final model, collecting R2 and negated RMSE
# for both the train and the test folds.
scores = cross_validate(
    reg,
    X_train,
    y_train_flat,
    scoring=('r2', 'neg_root_mean_squared_error'),
    cv=10,
    return_train_score=True,
)

In [None]:
print("RMSE training Score using cv: {:0.5f}".format(scores['train_neg_root_mean_squared_error'].mean() * -1))

In [None]:
print("RMSE test Score using cv: {:0.5f}".format(scores['test_neg_root_mean_squared_error'].mean() * -1))

In [None]:
# R2 is not a negated scorer (unlike neg_root_mean_squared_error), so it must
# be printed as returned; the former "* -1" flipped its sign.
print("R2 training Score using cv: {:0.5f}".format(scores['train_r2'].mean()))

In [None]:
# R2 is not a negated scorer (unlike neg_root_mean_squared_error), so it must
# be printed as returned; the former "* -1" flipped its sign.
print("R2 test Score using cv: {:0.5f}".format(scores['test_r2'].mean()))

In [None]:
toc = time.perf_counter()
print(f"Finish cross validation in  {(toc - tic)/60:0.4f} minutes")