In [1]:
from transformers import BertModel, BertTokenizer
import torch

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenized_dict = tokenizer.encode_plus(
    "hi my name is nicolas",
    add_special_tokens=True,
    max_length=5,
    return_overflowing_tokens=True,
    return_special_tokens_mask=True
    )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [3]:
tokenized_dict

{'overflowing_tokens': [9473, 2003], 'num_truncated_tokens': 2, 'input_ids': [101, 7632, 2026, 2171, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'special_tokens_mask': [1, 0, 0, 0, 1], 'attention_mask': [1, 1, 1, 1, 1]}

In [4]:
bert_model = BertModel.from_pretrained("bert-base-uncased")
tokenized_text = torch.tensor(tokenized_dict["input_ids"])
with torch.no_grad():
    embeddings = bert_model(torch.tensor(tokenized_text.unsqueeze(0)))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  embeddings = bert_model(torch.tensor(tokenized_text.unsqueeze(0)))


In [5]:
from typing import Callable, List, Optional, Tuple

import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch


class BertTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 200,
            embedding_func: Optional[Callable[[torch.tensor], torch.tensor]] = None,
    ):
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.tensor, torch.tensor]:
        # Tokenize the text with the provided tokenizer
        tokenized_text = self.tokenizer.encode_plus(text,
                                                    add_special_tokens=True,
                                                    max_length=self.max_length
                                                    )["input_ids"]

        # Create an attention mask telling BERT to use all words
        attention_mask = [1] * len(tokenized_text)

        # bert takes in a batch so we need to unsqueeze the rows
        return (
            torch.tensor(tokenized_text).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.tensor:
        tokenized, attention_mask = self._tokenize(text)

        embeddings = self.model(tokenized, attention_mask)
        return self.embedding_func(embeddings)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

In [6]:
project_folder="C:/Users/hanson/OneDrive/Grad/Grad Project/data_realtorCA/"
output_folder=project_folder+"output"

## Normalize price and split dataset

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv(output_folder+"/"+"all_after_preprocessing6.csv")#all_after_preprocessing6

scaler = MinMaxScaler()
df["price"] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(df["price"])))



# split train dataset into train, validation and test sets


In [8]:
df.loc[df['description'].isnull(), "description"] = ' '

In [9]:
df.loc[df['description'].isnull()]

Unnamed: 0,MLS,price,city,communityName,address,postal,description,typeBuilding,title,bedroomAboveGrade,...,GolfNearby,HospitalNearby,PlaygroundNearby,ShoppingNearby,PublicTransitNearby,HighwayNearby,bedroom,bathroom,longitude,latitude


In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['description'], df['price'], 
                                                                    random_state=13, 
                                                                    test_size=0.1)

In [11]:
import numpy as np 
from sklearn import ensemble

params = {'n_estimators': 1000,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'loss': 'ls'}
y_train_flat=np.ravel(y_train)


regressor = ensemble.GradientBoostingRegressor(**params)#**params

In [12]:
from sklearn.pipeline import Pipeline


bert_transformer = BertTransformer(tokenizer, bert_model)

model_bert_gb = Pipeline(
    [
        ("vectorizer", bert_transformer),
        ("regressor", regressor),
    ]
)



## check what the transformed vectors look like

In [13]:
X_train[X_train.isnull()==True]

Series([], Name: description, dtype: object)

In [14]:
BERT=BertTransformer(tokenizer, bert_model)
transformed_X_train=BERT.transform(X_train)

In [15]:
transformed_X_train

tensor([[-0.5048, -0.3873,  0.8852,  ..., -0.2312,  0.2080,  0.3050],
        [-0.2495, -0.2922,  0.7757,  ..., -0.3828,  0.3285, -0.0985],
        [-0.1385, -0.3944,  0.7950,  ..., -0.3175,  0.4449,  0.2223],
        ...,
        [-0.4830, -0.2967,  0.6361,  ..., -0.4736,  0.1251, -0.0283],
        [-0.5622, -0.1283,  1.1870,  ..., -0.6192,  0.2669, -0.1938],
        [-0.2557,  0.1742,  0.9036,  ..., -0.3457,  0.4156, -0.1141]])

In [16]:
transformed_X_train = np.asarray(transformed_X_train).astype(np.float32)
#X_test = np.asarray(X_test).astype(np.float32)

In [17]:
transformed_X_train.min()

-8.161861

In [18]:
transformed_X_train.max()

4.273582

In [19]:
y_train.shape

(9225,)

## Grid Search
### Long Short Term Memory, when only description data is used

#### Function to create model, required for KerasClassifier

In [20]:
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD

def create_model(learn_rate=0.001, amsgrad=False, activation='relu', dropout_rate=0.0, neurons=50):
    # create model
    # The maximum number of words to be used. (most frequent)
    #MAX_NB_WORDS = 50000
    # embedding dimension
    #EMBEDDING_DIM = 100
    #model.add(Dense(1024, activation='relu', input_shape=(X_train.shape[1],)))
    #model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
    #model.add(LSTM(50))

    model = Sequential()
    model.add(Dense(neurons, activation=activation)) #input_shape=(X_train.shape[1],), return_sequences = True
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons*2, activation=activation))
    model.add(Dropout(dropout_rate/2))
    model.add(Dense(neurons, activation=activation))

    
    model.add(Dense(1, activation='sigmoid')) #

    # Compile model
    optimizer = Adam(learning_rate=learn_rate, amsgrad=amsgrad)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mse'])
    return model

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# split into input (X) and output (Y) variables
# create model
model = KerasRegressor(build_fn=create_model, verbose=10) #epochs=75, batch_size=10, verbose=10)
# define the grid search parameters
#optimizer = ['Adam'] 
batch_size = [10,20] # 5, 
epochs = [50, 75, 100] # ,
learn_rate = [0.0001,0.001,0.01] #0.0001, , 0.01
amsgrad = [False] # True,  #True,
activation = ['relu', 'sigmoid']#, 'softplus'] #, 'sigmoid','softplus'] #, , 'softsign', 'hard_sigmoid', 'softmax', #, 'linear' 
dropout_rate = [0.1,0.2] #,0.3]#, 0.2] #0.0,, 0.3, 0.5 0.4, 0.2,, 0.3, 0.4, 0.5, 0.7
neurons = [50, 100] #25, 50, 100, 150,300, 200


param_grid = dict(batch_size=batch_size, epochs=epochs, learn_rate=learn_rate, amsgrad=amsgrad, activation=activation, dropout_rate=dropout_rate, neurons=neurons)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5, verbose=10, scoring=('r2', 'neg_root_mean_squared_error'), refit='r2')
grid_result = grid.fit(transformed_X_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75
Best: 0.471183 using {'activation': 'relu', 'a

In [21]:
print("-------------------------------------------------------------------")
print("Neural Network Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid.best_score_)
print("\n The best parameters across ALL searched params:\n",grid.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

param_com=grid.cv_results_['params']
r2_scores=grid.cv_results_['mean_test_r2']
rmse_scores=grid.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
index=0
for item in param_com:
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_scores[index]))
    print("\n")
    print("test RMSE score:"+str(rmse_scores[index]*-1))
    print("-------------------------------------------------------------------")
    index=index+1


-------------------------------------------------------------------
Neural Network Grid Search Results
-------------------------------------------------------------------

 The best estimator across ALL searched params:
 <tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x000001E7051B34F0>

 The best r2 score across ALL searched params:
 0.471183213564523

 The best parameters across ALL searched params:
 {'activation': 'relu', 'amsgrad': False, 'batch_size': 20, 'dropout_rate': 0.1, 'epochs': 75, 'learn_rate': 0.0001, 'neurons': 50}


-------------------------------------------------------------------
All Results:
-------------------------------------------------------------------
parameter combinations:{'activation': 'relu', 'amsgrad': False, 'batch_size': 10, 'dropout_rate': 0.1, 'epochs': 50, 'learn_rate': 0.0001, 'neurons': 50}


test r2 score:0.3453264692566812


test RMSE score:0.02811095949452051
------------------------------------------------------------

In [22]:
gd_result = pd.DataFrame(grid.cv_results_)
gd_result=gd_result[['param_batch_size','param_epochs', 'param_neurons','param_activation','param_learn_rate', 'param_dropout_rate', 'mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]
gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

Unnamed: 0,param_batch_size,param_epochs,param_neurons,param_activation,param_learn_rate,param_dropout_rate,mean_test_r2,mean_test_neg_root_mean_squared_error
42,20,75,50,relu,0.0001,0.1,0.471183,-0.026268
7,10,75,100,relu,0.0001,0.1,0.460276,-0.026206
117,20,75,100,sigmoid,0.001,0.1,0.456469,-0.026634
43,20,75,100,relu,0.0001,0.1,0.450012,-0.026757
81,10,75,100,sigmoid,0.001,0.1,0.447835,-0.026724
111,20,50,100,sigmoid,0.001,0.1,0.443542,-0.026934
13,10,100,100,relu,0.0001,0.1,0.441177,-0.027102
85,10,100,100,sigmoid,0.0001,0.1,0.440743,-0.026952
49,20,100,100,relu,0.0001,0.1,0.439177,-0.027099
48,20,100,50,relu,0.0001,0.1,0.438428,-0.027007


In [23]:
gd_result.to_csv(output_folder+"/"+"gridsearch_NN_BERT_descriptiononly_5fold_bound.csv", index=False)

## Grid Search
### Random Forest, when only description data is used

In [17]:
import numpy as np
from sklearn import ensemble
GBR = ensemble.RandomForestRegressor()


parameters = {'bootstrap': [False], #True, 
              'max_depth': [30],  #5, 10, 20,, None
              'max_features': ['sqrt'], #'auto',
              'n_estimators': [500]} #32, 64, 100, 1000

y_train_flat=np.ravel(y_train)

from sklearn.model_selection import GridSearchCV
grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters,scoring=('r2', 'neg_root_mean_squared_error'),refit='r2', verbose=10, cv = 5, n_jobs=-1)
grid_GBR.fit(transformed_X_train, y_train_flat)

print("-------------------------------------------------------------------")
print("Gradient Boosting Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

param_com=grid_GBR.cv_results_['params']
r2_scores=grid_GBR.cv_results_['mean_test_r2']
rmse_scores=grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
index=0
for item in param_com:
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_scores[index]))
    print("\n")
    print("test RMSE score:"+str(rmse_scores[index]*-1))
    print("-------------------------------------------------------------------")
    index=index+1


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)


gd_result=df_gridsearch_result[['param_n_estimators','param_max_depth','param_max_features','param_bootstrap','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]

Fitting 5 folds for each of 1 candidates, totalling 5 fits
-------------------------------------------------------------------
Gradient Boosting Grid Search Results
-------------------------------------------------------------------

 The best estimator across ALL searched params:
 RandomForestRegressor(bootstrap=False, max_depth=30, max_features='sqrt',
                      n_estimators=500)

 The best r2 score across ALL searched params:
 0.2842663400160824

 The best parameters across ALL searched params:
 {'bootstrap': False, 'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 500}


-------------------------------------------------------------------
All Results:
-------------------------------------------------------------------
parameter combinations:{'bootstrap': False, 'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 500}


test r2 score:0.2842663400160824


test RMSE score:0.03158261971519232
-------------------------------------------------------------------


In [18]:
gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

Unnamed: 0,param_n_estimators,param_max_depth,param_max_features,param_bootstrap,mean_test_r2,mean_test_neg_root_mean_squared_error
0,500,30,sqrt,False,0.284266,-0.031583


In [None]:
gd_result.to_csv(output_folder+"/"+"gridsearch_randomforest_BERT_descriptiononly_5fold_Lem.csv", index=False)

## Grid Search
### Gradient Boosting, using only description data

import numpy as np
from sklearn import ensemble
GBR = ensemble.GradientBoostingRegressor()

parameters = {'n_estimators' : [500,1000], # 100 removed
              'max_depth'    : [4,6], # 3 removed
                                       #'min_samples_split': [2, 5, 8],
              'learning_rate': [0.01,0.02], # 0.005 removed
                                     #'loss': ['ls'], # remove huber loss
              'subsample'    : [1, 0.8] 
             }

y_train_flat=np.ravel(y_train)

from sklearn.model_selection import GridSearchCV
grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters,scoring=('r2', 'neg_root_mean_squared_error'),refit='r2', verbose=10, cv = 5, n_jobs=-1)
grid_GBR.fit(transformed_X_train, y_train_flat)

print("-------------------------------------------------------------------")
print("Gradient Boosting Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

param_com=grid_GBR.cv_results_['params']
r2_scores=grid_GBR.cv_results_['mean_test_r2']
rmse_scores=grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
index=0
for item in param_com:
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_scores[index]))
    print("\n")
    print("test RMSE score:"+str(rmse_scores[index]*-1))
    print("-------------------------------------------------------------------")
    index=index+1


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)


gd_result=df_gridsearch_result[['param_n_estimators','param_max_depth','param_learning_rate','param_subsample','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]

gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

gd_result.to_csv(output_folder+"/"+"gridsearch_gradientboosting_BERT_descriptiononly_5fold.csv", index=False)

## use only the description data to predict

## Check how long it takes to finish the cross-validation
import time
tic = time.perf_counter()

from sklearn.model_selection import cross_validate
scores = cross_validate(regressor, transformed_X_train, y_train, scoring=('r2', 'neg_root_mean_squared_error'), cv=10, return_train_score=True)

print("RMSE training Score using cv: {:0.5f}".format(scores['train_neg_root_mean_squared_error'].mean() * -1))

print("RMSE test Score using cv: {:0.5f}".format(scores['test_neg_root_mean_squared_error'].mean() * -1))

print("R2 training Score using cv: {:0.5f}".format(scores['train_r2'].mean()))

print("R2 test Score using cv: {:0.5f}".format(scores['test_r2'].mean()))

toc = time.perf_counter()
print(f"Finish cross validation in  {(toc - tic)/60:0.2f} minutes")

## Try to use all features to predict

## description word vectors


In [24]:
BERT=BertTransformer(tokenizer, bert_model)

transformed_X_train=BERT.transform(df["description"])

In [25]:
transformed_X_train=transformed_X_train.numpy()
df_desc=pd.DataFrame(transformed_X_train)

In [26]:
df_desc.shape

(10251, 768)

In [27]:
df_desc.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.484128,-0.233384,0.990137,-0.082705,0.607205,-0.423229,0.377716,0.344437,-0.00263,-0.604704,...,0.040921,-0.087954,-0.558054,-0.089476,0.255756,-0.243995,-0.561746,-0.450428,0.336876,-0.393164
1,-0.523328,-0.170666,1.065463,-0.206739,0.429469,0.0105,0.356779,0.492953,0.003819,-0.701718,...,-0.03326,0.191036,-0.480257,-0.36391,0.163412,-0.290906,-0.49853,-0.398669,0.423377,0.085806
2,-0.591282,0.13899,0.607158,-0.126244,0.423795,0.013943,0.605192,0.525164,-0.315374,-0.786659,...,0.370795,-0.083774,-0.365531,-0.251019,0.247597,-0.141465,-0.607626,-0.321752,-0.025957,0.425066
3,-0.499561,-0.059818,1.041867,-0.442185,0.512354,-0.115699,0.695242,0.066142,-0.140699,-0.540465,...,0.025969,0.223413,-0.151183,-0.387866,0.303887,-0.406554,-0.634158,-0.465336,0.431244,0.201161
4,0.070236,-0.173423,0.817306,-0.264483,0.454991,-0.283417,0.551238,0.100913,-0.46149,-0.38716,...,0.247772,-0.093069,-0.331828,-0.57485,-0.005,-0.219278,-0.379593,-0.287389,0.59845,0.15119


## numerical features

In [28]:
numerical_features=["bedroom","bedroomAboveGrade","bedroomBelowGrade","bathroom", "bathroomTotal","bathroomPartial", "totalParkingSpaces", "storeys", "maintenanceFees",  'landSize', 'longitude', "latitude"]

In [29]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_num=df[numerical_features]
X_num.head()

Unnamed: 0,bedroom,bedroomAboveGrade,bedroomBelowGrade,bathroom,bathroomTotal,bathroomPartial,totalParkingSpaces,storeys,maintenanceFees,landSize,longitude,latitude
0,5,4,1,4.0,4,0,4,2.0,670.51,2230.1475,-79.7953,43.7328
1,3,3,0,4.0,4,0,1,3.0,670.51,0.0,-79.7876,43.6249
2,7,4,3,4.0,4,0,8,2.0,670.51,4620.0,-79.7876,43.6249
3,6,4,2,4.0,4,0,4,2.0,670.51,5063.6808,-79.7876,43.6249
4,4,4,0,4.0,4,0,4,2.5,670.51,3169.0,-79.7876,43.6249


## Normalization for numerical data (exclude longitude and latitude) using MinMaxScaler


In [30]:
from sklearn.preprocessing import MinMaxScaler
# recaling the variables (both)
X_num_columns = X_num.columns
scaler = MinMaxScaler()
X_num = scaler.fit_transform(X_num)

# rename columns (since now its an np array)
X_num = pd.DataFrame(X_num)
X_num.columns = X_num_columns



## Normalization for longitude and latitude sepeparately

In [31]:
X_num.drop(["longitude", "latitude"], axis=1)
normed_long= df["longitude"] *0.01
normed_lat= df["latitude"] *0.01
X_num=pd.concat([X_num, normed_long, normed_lat], axis=1)

## Boolean features

In [32]:
boolean_features=['parkingAttachedGarage',
       'parkingUnderground', 'parkingInsideEntry', 'parkingSurfaced',
       'parkingOversize', 'parkingGravel', 'parkingGarage', 'parkingShared',
       'parkingDetachedGarage', 'parkingCarport', 'parkingInterlocked',
       'parkingVisitorParking','amenityClubhouse', 'amenityCarWash', 'amenityMusicRoom',
       'amenityStorageLocker', 'amenitySauna', 'amenityPartyRoom',
       'amenityRecreationCentre', 'amenityGuestSuite', 'amenityFurnished',
       'amenityLaundryFacility', 'amenityExerciseCentre',
       'amenityLaundryInSuite', 'amenitySecurity', 'amenityWhirlpool',
       'efinishWood', 'efinishBrick', 'efinishHardboard', 'efinishWoodsiding',
       'efinishLog', 'efinishMetal', 'efinishSteel', 'efinishStone',
       'efinishWoodshingles', 'efinishStucco', 'efinishSiding',
       'efinishConcrete', 'efinishShingles', 'efinishAluminumsiding',
       'efinishCedarshingles', 'efinishVinyl', 'efinishVinylsiding',
       'featurePetNotAllowed', 'AirportNearby',
       'GolfNearby', 'MarinaNearby', 'ShoppingNearby', 'WaterNearby',
       'WorshipPlaceNearby', 'RecreationNearby', 'PlaygroundNearby',
       'PublicTransitNearby', 'ParkNearby', 'SchoolsNearby', 'HospitalNearby',
       'HighwayNearby', 'SkiAreaNearby']

X_boo=df[boolean_features]

## Convert categorical data with string values into numerical values

In [33]:
X_category=df[['city', 'typeBuilding', 'title', 'styleAttach', 
       'cooling', 'basementType', 'basementFinish',
       'heatingType1', 'heatingType2', 'heatingEnergy1', 'heatingEnergy2', 'featureLotSlope', 'featureDriveway', 'featureLotPositionType',
       'featureOutdoorAreaType', 'featureOutdoorLandscape',
       'featureAdditionalFacility']]

In [34]:
## convert categorical data to numerical values
cate_features=['city', 'typeBuilding', 'title', 'styleAttach', 'cooling',  'basementType', 'basementFinish','heatingType1', 'heatingType2', 'heatingEnergy1', 'heatingEnergy2', 'featureLotSlope', 'featureDriveway', 'featureLotPositionType',
       'featureOutdoorAreaType', 'featureOutdoorLandscape',
       'featureAdditionalFacility']
for col in cate_features:
    X_category[col] = X_category[col].astype('category')
    X_category[col] = X_category[col].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_category[col] = X_category[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_category[col] = X_category[col].cat.codes


In [35]:
X_category.head()

Unnamed: 0,city,typeBuilding,title,styleAttach,cooling,basementType,basementFinish,heatingType1,heatingType2,heatingEnergy1,heatingEnergy2,featureLotSlope,featureDriveway,featureLotPositionType,featureOutdoorAreaType,featureOutdoorLandscape,featureAdditionalFacility
0,0,3,3,3,1,4,0,3,0,2,0,3,4,3,1,3,3
1,0,8,2,0,1,4,3,3,0,2,0,3,4,3,1,3,3
2,0,3,3,1,1,4,0,3,0,2,0,3,4,3,1,3,3
3,0,3,3,1,1,4,0,3,0,2,0,3,4,3,1,3,3
4,0,3,3,1,1,3,3,3,0,2,0,3,4,3,1,3,3


## Normalize the categorical data 

In [36]:
from sklearn.preprocessing import MinMaxScaler

# recaling the variables (both)
X_category_columns = X_category.columns
scaler = MinMaxScaler()
X_category = scaler.fit_transform(X_category)

# rename columns (since now its an np array)
X_category = pd.DataFrame(X_category)
X_category.columns = X_category_columns

X_category.head()

Unnamed: 0,city,typeBuilding,title,styleAttach,cooling,basementType,basementFinish,heatingType1,heatingType2,heatingEnergy1,heatingEnergy2,featureLotSlope,featureDriveway,featureLotPositionType,featureOutdoorAreaType,featureOutdoorLandscape,featureAdditionalFacility
0,0.0,0.3,0.6,0.428571,0.2,0.8,0.0,0.428571,0.0,0.4,0.0,0.6,0.666667,1.0,0.333333,0.272727,0.5
1,0.0,0.8,0.4,0.0,0.2,0.8,1.0,0.428571,0.0,0.4,0.0,0.6,0.666667,1.0,0.333333,0.272727,0.5
2,0.0,0.3,0.6,0.142857,0.2,0.8,0.0,0.428571,0.0,0.4,0.0,0.6,0.666667,1.0,0.333333,0.272727,0.5
3,0.0,0.3,0.6,0.142857,0.2,0.8,0.0,0.428571,0.0,0.4,0.0,0.6,0.666667,1.0,0.333333,0.272727,0.5
4,0.0,0.3,0.6,0.142857,0.2,0.6,1.0,0.428571,0.0,0.4,0.0,0.6,0.666667,1.0,0.333333,0.272727,0.5


## Use numerical, boolean, categorical, and description data to predict

In [37]:
X_all = pd.concat([X_num, X_boo, X_category, df_desc], axis=1)
X_all.head()

Unnamed: 0,bedroom,bedroomAboveGrade,bedroomBelowGrade,bathroom,bathroomTotal,bathroomPartial,totalParkingSpaces,storeys,maintenanceFees,landSize,...,758,759,760,761,762,763,764,765,766,767
0,0.277778,0.444444,0.111111,0.16,0.16,0.0,0.015385,0.5,0.068565,5.5e-05,...,0.040921,-0.087954,-0.558054,-0.089476,0.255756,-0.243995,-0.561746,-0.450428,0.336876,-0.393164
1,0.166667,0.333333,0.0,0.16,0.16,0.0,0.003846,0.75,0.068565,0.0,...,-0.03326,0.191036,-0.480257,-0.36391,0.163412,-0.290906,-0.49853,-0.398669,0.423377,0.085806
2,0.388889,0.444444,0.333333,0.16,0.16,0.0,0.030769,0.5,0.068565,0.000114,...,0.370795,-0.083774,-0.365531,-0.251019,0.247597,-0.141465,-0.607626,-0.321752,-0.025957,0.425066
3,0.333333,0.444444,0.222222,0.16,0.16,0.0,0.015385,0.5,0.068565,0.000125,...,0.025969,0.223413,-0.151183,-0.387866,0.303887,-0.406554,-0.634158,-0.465336,0.431244,0.201161
4,0.222222,0.444444,0.0,0.16,0.16,0.0,0.015385,0.625,0.068565,7.8e-05,...,0.247772,-0.093069,-0.331828,-0.57485,-0.005,-0.219278,-0.379593,-0.287389,0.59845,0.15119


## divide dataset

In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    X_all, df["price"], test_size=0.1, random_state=13) 

In [39]:
X_train = np.asarray(X_train).astype(np.float32)
#X_test = np.asarray(X_test).astype(np.float32)

## Grid Search
### Long Short Term Memory, when only description data is used

#### Function to create model, required for KerasClassifier

In [40]:
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD

def create_model(learn_rate=0.001, amsgrad=False, activation='relu', dropout_rate=0.0, neurons=50):
    # create model
    # The maximum number of words to be used. (most frequent)
    #MAX_NB_WORDS = 50000
    # embedding dimension
    #EMBEDDING_DIM = 100
    #model.add(Dense(1024, activation='relu', input_shape=(X_train.shape[1],)))
    #model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
    #model.add(LSTM(50))

    model = Sequential()
    model.add(Dense(neurons, activation=activation)) #input_shape=(X_train.shape[1],), return_sequences = True
    model.add(Dropout(dropout_rate))
    model.add(Dense(neurons*2, activation=activation))
    model.add(Dropout(dropout_rate/2))
    model.add(Dense(neurons, activation=activation))

    
    model.add(Dense(1, activation='sigmoid')) #

    # Compile model
    optimizer = Adam(learning_rate=learn_rate, amsgrad=amsgrad)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mse'])
    return model

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# split into input (X) and output (Y) variables
# create model
model = KerasRegressor(build_fn=create_model, verbose=10) #epochs=75, batch_size=10, verbose=10)
# define the grid search parameters
#optimizer = ['Adam'] 
batch_size = [10,20] # 5, 
epochs = [50, 75, 100] # ,
learn_rate = [0.0001,0.001,0.01] #0.0001, , 0.01
amsgrad = [False] # True,  #True,
activation = ['relu', 'sigmoid']#, 'softplus'] #, 'sigmoid','softplus'] #, , 'softsign', 'hard_sigmoid', 'softmax', #, 'linear' 
dropout_rate = [0.1,0.2] #,0.3]#, 0.2] #0.0,, 0.3, 0.5 0.4, 0.2,, 0.3, 0.4, 0.5, 0.7
neurons = [50, 100] #25, 50, 100, 150,300, 200


param_grid = dict(batch_size=batch_size, epochs=epochs, learn_rate=learn_rate, amsgrad=amsgrad, activation=activation, dropout_rate=dropout_rate, neurons=neurons)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5, verbose=10, scoring=('r2', 'neg_root_mean_squared_error'), refit='r2')
grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/1

In [41]:
print("-------------------------------------------------------------------")
print("Neural Network Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid.best_score_)
print("\n The best parameters across ALL searched params:\n",grid.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

param_com=grid.cv_results_['params']
r2_scores=grid.cv_results_['mean_test_r2']
rmse_scores=grid.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
index=0
for item in param_com:
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_scores[index]))
    print("\n")
    print("test RMSE score:"+str(rmse_scores[index]*-1))
    print("-------------------------------------------------------------------")
    index=index+1


-------------------------------------------------------------------
Neural Network Grid Search Results
-------------------------------------------------------------------

 The best estimator across ALL searched params:
 <tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x000001E7159649A0>

 The best r2 score across ALL searched params:
 0.599923801592848

 The best parameters across ALL searched params:
 {'activation': 'sigmoid', 'amsgrad': False, 'batch_size': 10, 'dropout_rate': 0.1, 'epochs': 100, 'learn_rate': 0.001, 'neurons': 100}


-------------------------------------------------------------------
All Results:
-------------------------------------------------------------------
parameter combinations:{'activation': 'relu', 'amsgrad': False, 'batch_size': 10, 'dropout_rate': 0.1, 'epochs': 50, 'learn_rate': 0.0001, 'neurons': 50}


test r2 score:0.504034729329053


test RMSE score:0.024728939252869276
--------------------------------------------------------

In [42]:
gd_result = pd.DataFrame(grid.cv_results_)
gd_result=gd_result[['param_batch_size','param_epochs', 'param_neurons','param_activation','param_learn_rate', 'param_dropout_rate', 'mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]
gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

Unnamed: 0,param_batch_size,param_epochs,param_neurons,param_activation,param_learn_rate,param_dropout_rate,mean_test_r2,mean_test_neg_root_mean_squared_error
87,10,100,100,sigmoid,0.001,0.1,0.599924,-0.022817
140,20,100,50,sigmoid,0.001,0.2,0.594718,-0.023028
110,20,50,50,sigmoid,0.001,0.1,0.591222,-0.022969
7,10,75,100,relu,0.0001,0.1,0.589087,-0.022863
43,20,75,100,relu,0.0001,0.1,0.582346,-0.023225
48,20,100,50,relu,0.0001,0.1,0.580031,-0.023348
1,10,50,100,relu,0.0001,0.1,0.577454,-0.0233
117,20,75,100,sigmoid,0.001,0.1,0.572924,-0.023103
123,20,100,100,sigmoid,0.001,0.1,0.572658,-0.023714
49,20,100,100,relu,0.0001,0.1,0.572616,-0.02307


In [43]:
gd_result.to_csv(output_folder+"/"+"gridsearch_NN_BERT_all_5fold_bound.csv", index=False)

## Grid Search
### Random Forest, when all features are used

In [32]:
import numpy as np
from sklearn import ensemble
GBR = ensemble.RandomForestRegressor()


parameters = {'bootstrap': [True, False],
              'max_depth': [10, 20, 30, None], #5
              'max_features': ['auto', 'sqrt'],
              'n_estimators': [32, 64, 100, 500]} #1000

y_train_flat=np.ravel(y_train)

from sklearn.model_selection import GridSearchCV
grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters,scoring=('r2', 'neg_root_mean_squared_error'),refit='r2', verbose=10, cv = 5, n_jobs=-1)
grid_GBR.fit(X_train, y_train_flat)

print("-------------------------------------------------------------------")
print("Gradient Boosting Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

param_com=grid_GBR.cv_results_['params']
r2_scores=grid_GBR.cv_results_['mean_test_r2']
rmse_scores=grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
index=0
for item in param_com:
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_scores[index]))
    print("\n")
    print("test RMSE score:"+str(rmse_scores[index]*-1))
    print("-------------------------------------------------------------------")
    index=index+1


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)


gd_result=df_gridsearch_result[['param_n_estimators','param_max_depth','param_max_features','param_bootstrap','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]

Fitting 5 folds for each of 64 candidates, totalling 320 fits
-------------------------------------------------------------------
Gradient Boosting Grid Search Results
-------------------------------------------------------------------

 The best estimator across ALL searched params:
 RandomForestRegressor(max_depth=30, n_estimators=64)

 The best r2 score across ALL searched params:
 0.5633821504273518

 The best parameters across ALL searched params:
 {'bootstrap': True, 'max_depth': 30, 'max_features': 'auto', 'n_estimators': 64}


-------------------------------------------------------------------
All Results:
-------------------------------------------------------------------
parameter combinations:{'bootstrap': True, 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 32}


test r2 score:0.49253986410132916


test RMSE score:0.026595481110528896
-------------------------------------------------------------------
parameter combinations:{'bootstrap': True, 'max_depth': 10, 'ma

In [33]:
gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

Unnamed: 0,param_n_estimators,param_max_depth,param_max_features,param_bootstrap,mean_test_r2,mean_test_neg_root_mean_squared_error
17,64,30.0,auto,True,0.563382,-0.024582
10,100,20.0,auto,True,0.559358,-0.024822
27,500,,auto,True,0.55854,-0.02481
63,500,,sqrt,False,0.557533,-0.024959
47,500,20.0,sqrt,False,0.557304,-0.024938
11,500,20.0,auto,True,0.556558,-0.024891
19,500,30.0,auto,True,0.553887,-0.024981
62,100,,sqrt,False,0.550845,-0.025209
55,500,30.0,sqrt,False,0.549361,-0.02521
53,64,30.0,sqrt,False,0.547385,-0.025172


In [34]:
gd_result.to_csv(output_folder+"/"+"gridsearch_randomforest_BERT_all_5fold.csv", index=False)

## Grid Search
### Gradient Boosting, using all data

In [33]:
import numpy as np
from sklearn import ensemble
GBR = ensemble.GradientBoostingRegressor()

parameters = {'n_estimators' : [500,1000], # 100 removed
              'max_depth'    : [4,6], # 3 removed
                                       #'min_samples_split': [2, 5, 8],
              'learning_rate': [0.01,0.02], # 0.005 removed
                                     #'loss': ['ls'], # remove huber loss
              'subsample'    : [1, 0.8] 
             }

y_train_flat=np.ravel(y_train)

from sklearn.model_selection import GridSearchCV
grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters,scoring=('r2', 'neg_root_mean_squared_error'),refit='r2', verbose=10, cv = 5) #, n_jobs=-1
grid_GBR.fit(X_train, y_train_flat)

print("-------------------------------------------------------------------")
print("Gradient Boosting Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

param_com=grid_GBR.cv_results_['params']
r2_scores=grid_GBR.cv_results_['mean_test_r2']
rmse_scores=grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
index=0
for item in param_com:
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(r2_scores[index]))
    print("\n")
    print("test RMSE score:"+str(rmse_scores[index]*-1))
    print("-------------------------------------------------------------------")
    index=index+1


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)


gd_result=df_gridsearch_result[['param_n_estimators','param_max_depth','param_learning_rate','param_subsample','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5; 1/16] START learning_rate=0.01, max_depth=4, n_estimators=500, subsample=1
[CV 1/5; 1/16] END learning_rate=0.01, max_depth=4, n_estimators=500, subsample=1; neg_root_mean_squared_error: (test=-0.018) r2: (test=0.654) total time=13.9min
[CV 2/5; 1/16] START learning_rate=0.01, max_depth=4, n_estimators=500, subsample=1
[CV 2/5; 1/16] END learning_rate=0.01, max_depth=4, n_estimators=500, subsample=1; neg_root_mean_squared_error: (test=-0.028) r2: (test=0.635) total time=13.7min
[CV 3/5; 1/16] START learning_rate=0.01, max_depth=4, n_estimators=500, subsample=1
[CV 3/5; 1/16] END learning_rate=0.01, max_depth=4, n_estimators=500, subsample=1; neg_root_mean_squared_error: (test=-0.020) r2: (test=0.466) total time=13.3min
[CV 4/5; 1/16] START learning_rate=0.01, max_depth=4, n_estimators=500, subsample=1
[CV 4/5; 1/16] END learning_rate=0.01, max_depth=4, n_estimators=500, subsample=1; neg_root_mean_squared_error: (test

[CV 4/5; 7/16] END learning_rate=0.01, max_depth=6, n_estimators=1000, subsample=1; neg_root_mean_squared_error: (test=-0.028) r2: (test=0.649) total time=38.8min
[CV 5/5; 7/16] START learning_rate=0.01, max_depth=6, n_estimators=1000, subsample=1
[CV 5/5; 7/16] END learning_rate=0.01, max_depth=6, n_estimators=1000, subsample=1; neg_root_mean_squared_error: (test=-0.019) r2: (test=0.739) total time=38.6min
[CV 1/5; 8/16] START learning_rate=0.01, max_depth=6, n_estimators=1000, subsample=0.8
[CV 1/5; 8/16] END learning_rate=0.01, max_depth=6, n_estimators=1000, subsample=0.8; neg_root_mean_squared_error: (test=-0.019) r2: (test=0.636) total time=30.6min
[CV 2/5; 8/16] START learning_rate=0.01, max_depth=6, n_estimators=1000, subsample=0.8
[CV 2/5; 8/16] END learning_rate=0.01, max_depth=6, n_estimators=1000, subsample=0.8; neg_root_mean_squared_error: (test=-0.027) r2: (test=0.656) total time=31.0min
[CV 3/5; 8/16] START learning_rate=0.01, max_depth=6, n_estimators=1000, subsample=0.

[CV 2/5; 14/16] END learning_rate=0.02, max_depth=6, n_estimators=500, subsample=0.8; neg_root_mean_squared_error: (test=-0.026) r2: (test=0.672) total time=15.4min
[CV 3/5; 14/16] START learning_rate=0.02, max_depth=6, n_estimators=500, subsample=0.8
[CV 3/5; 14/16] END learning_rate=0.02, max_depth=6, n_estimators=500, subsample=0.8; neg_root_mean_squared_error: (test=-0.019) r2: (test=0.502) total time=15.4min
[CV 4/5; 14/16] START learning_rate=0.02, max_depth=6, n_estimators=500, subsample=0.8
[CV 4/5; 14/16] END learning_rate=0.02, max_depth=6, n_estimators=500, subsample=0.8; neg_root_mean_squared_error: (test=-0.026) r2: (test=0.699) total time=15.5min
[CV 5/5; 14/16] START learning_rate=0.02, max_depth=6, n_estimators=500, subsample=0.8
[CV 5/5; 14/16] END learning_rate=0.02, max_depth=6, n_estimators=500, subsample=0.8; neg_root_mean_squared_error: (test=-0.020) r2: (test=0.706) total time=15.5min
[CV 1/5; 15/16] START learning_rate=0.02, max_depth=6, n_estimators=1000, subsa

In [34]:
gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

Unnamed: 0,param_n_estimators,param_max_depth,param_learning_rate,param_subsample,mean_test_r2,mean_test_neg_root_mean_squared_error
11,1000,4,0.02,0.8,0.670336,-0.021246
10,1000,4,0.02,1.0,0.66394,-0.021484
2,1000,4,0.01,1.0,0.663903,-0.021423
8,500,4,0.02,1.0,0.661582,-0.021608
3,1000,4,0.01,0.8,0.660426,-0.021455
7,1000,6,0.01,0.8,0.653139,-0.021775
13,500,6,0.02,0.8,0.647607,-0.022034
9,500,4,0.02,0.8,0.644412,-0.021997
15,1000,6,0.02,0.8,0.640238,-0.022225
0,500,4,0.01,1.0,0.629464,-0.022587


In [35]:
gd_result.to_csv(output_folder+"/"+"gridsearch_gradientboosting_BERT_all_5fold.csv", index=False)

## create gradient boosting model

In [29]:
import numpy as np 
params = {'n_estimators': 1000,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'loss': 'ls'}

reg = ensemble.GradientBoostingRegressor(**params)#**params

y_train_flat=np.ravel(y_train)
reg.fit(X_train, y_train_flat)

GradientBoostingRegressor(learning_rate=0.01, max_depth=4, min_samples_split=5,
                          n_estimators=1000)

## do cross validation

In [30]:
import time

tic = time.perf_counter()

In [31]:
from sklearn.model_selection import cross_validate
scores = cross_validate(reg, X_train, y_train_flat, scoring=('r2', 'neg_root_mean_squared_error'), cv=10, return_train_score=True)

In [32]:
print("RMSE training Score using cv: {:0.5f}".format(scores['train_neg_root_mean_squared_error'].mean() * -1))

RMSE training Score using cv: 0.00769


In [33]:
print("RMSE test Score using cv: {:0.5f}".format(scores['test_neg_root_mean_squared_error'].mean() * -1))

RMSE test Score using cv: 0.02072


In [34]:
print("R2 training Score using cv: {:0.5f}".format(scores['train_r2'].mean() * -1))

R2 training Score using cv: -0.96042


In [35]:
print("R2 test Score using cv: {:0.5f}".format(scores['test_r2'].mean() * -1))

R2 test Score using cv: -0.67463


In [36]:
toc = time.perf_counter()
print(f"Finish cross validation in  {(toc - tic)/60:0.4f} minutes")

Finish cross validation in  305.4617 minutes
