## Read the data where the description has been processed (lemmatized, stop words not removed, no stemming)

In [None]:
import pandas as pd
import os

# Root of the thesis workspace; every artefact is read from / written to the
# "output" sub-folder below it.
project_folder = "C:/Users/muhammadkashifkhan/Documents/ASDS_2nd/Thesis/output_kashif/"
output_folder = f"{project_folder}output"

# Load the listings whose descriptions were already lemmatized upstream.
preprocessed_csv = f"{output_folder}/all_after_preprocessingLem.csv"
df = pd.read_csv(preprocessed_csv)

In [None]:
len(df)

In [None]:
df.head()

In [None]:
# Empty CSV cells come back from read_csv as NaN. Replace them with an empty
# string before casting, otherwise astype(str) turns them into the literal
# text "nan", which is later tokenised and embedded like a real word.
df['description'] = df['description'].fillna("").astype(str)


In [None]:
df['description'].dtype

In [None]:
df['description']

## tokenization again

In [None]:
from nltk.tokenize import RegexpTokenizer

# Split each description into runs of word characters (\w+); punctuation and
# whitespace are discarded. The column holds lists of tokens afterwards.
tokeniser = RegexpTokenizer(r'\w+')
df["description"] = df["description"].apply(tokeniser.tokenize)

## Remove Stop Words

from nltk.corpus import stopwords
stopwords_nltk=stopwords.words('english')

# Domain-specific stop words added on top of the NLTK English list.
# Commented-out entries are candidates that are currently kept in the text.
real_estate_stopwords = [
    #"area",
    "province",
    #"location",
    #"plot",
    ## common measurement
    #"hectare",
    #"acre",
    #"m2",
    #"sq",
    #"sale",
    #"square",
    #"meter",
    #"metre",
    #"feet",
    #"foot",
    ## common rooms
    #"room",
    #"bedroom",
    #"bathroom",
    #"bath",
    #"washroom",
    #"dining",
    #"living",
    #"kitchen",
    
    #"hallway",
    #"corridor",
    
    ## common occurrence
    # Trailing comma so that enabling an entry below cannot silently
    # concatenate two adjacent string literals.
    "extra",
    
    ## type of the building
    #"apartment",
    #"condo",
    #"condominium",
    #"home",
    #"house",
    #"unit",
    ## describe the appliances, too common
    #"stainless",
    #"steel",
    ## common appliances
    #"washer",
    #"dryer",
    #"stove",
    #"fridge"
    ]

all_stop_words=stopwords_nltk+real_estate_stopwords

# Filter against the COMBINED list. The previous filter tested only
# stopwords_nltk, so the real-estate words above were never removed.
# A set makes each membership test O(1) instead of scanning the list.
stop_word_lookup = set(all_stop_words)
df["description"]=df["description"].apply(lambda x: [word for word in x if word not in stop_word_lookup] )

## check unique words

In [None]:
df["description"]

In [None]:
# Collapse each token list back into one space-separated string.
df["description"] = df["description"].apply(" ".join)

In [None]:
# Vocabulary size: join every description, split on single spaces, de-duplicate.
joined_descriptions = " ".join(df['description'])
uniqueWords = list({*joined_descriptions.split(" ")})
count = len(uniqueWords)
print(f"Number of unique words is: {count}")

## check n-gram, and word cloud

## check 1-gram

In [None]:
import nltk

# Flat, corpus-wide token list (descriptions joined, then split on single
# spaces). Because the corpus is flattened, n-grams built from it can span the
# boundary between two consecutive descriptions.
totalWords = " ".join(df['description']).split(" ")

# 20 most frequent 1-grams
pd.Series(nltk.ngrams(totalWords, 1)).value_counts().head(20)

## check 2-gram

In [None]:
import nltk

# 20 most frequent 2-grams over the flattened token list
pd.Series(nltk.ngrams(totalWords, 2)).value_counts().head(20)

## check 3-gram

In [None]:
# 20 most frequent 3-grams over the flattened token list
pd.Series(nltk.ngrams(totalWords, 3)).value_counts().head(20)

## check 4-gram

In [None]:
# 20 most frequent 4-grams over the flattened token list
pd.Series(nltk.ngrams(totalWords, 4)).value_counts().head(20)

## word cloud for all description data

In [None]:
# Concatenate every description into one string for the word cloud.
text = " ".join(df.description)
# len(text) counts characters; split on whitespace so the number printed is
# really the word count that the message announces.
print ("There are {} words in the combination of all description.".format(len(text.split())))


In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Build the cloud from the concatenated descriptions; max_font_size and
# max_words are left at the library defaults.
wordcloud = WordCloud(background_color="white", width=800, height=400)
wordcloud.generate(text)

# Render the image with matplotlib, without axes.
plt.figure(figsize=(50, 50))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


In [None]:
df.price.describe()

## word cloud for cheap 25% listings

In [None]:
# The 516 lowest-priced listings.
# NOTE(review): the section title says "25%", the variable name says "5" and the
# cut-off is a hard-coded row count - confirm which share of len(df) is meant.
df_5cheapest = df.sort_values(by=["price"]).head(516)

In [None]:
# Concatenate the descriptions of the cheapest listings into one string.
text = " ".join(df_5cheapest.description)
# len(text) counts characters; split on whitespace so the number printed is
# really the word count that the message announces.
print ("There are {} words in the combination of all description.".format(len(text.split())))

# Generate a word cloud image
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)#max_font_size=100, max_words=500,

# Display the generated image:
# the matplotlib way:
plt.figure(figsize = (50, 50))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


## word cloud for middle price listings

In [None]:
# Rows at positions 5000..6000 (1001 rows) of the price-sorted listings.
# NOTE(review): the name says "100" and the bounds are hard-coded - confirm they
# really bracket the middle of the price distribution for this dataset.
df_100middle = df.sort_values(by=["price"]).iloc[5000:6001]

In [None]:
# Concatenate the descriptions of the mid-priced listings into one string.
text = " ".join(df_100middle.description)
# len(text) counts characters; split on whitespace so the number printed is
# really the word count that the message announces.
print ("There are {} words in the combination of all description.".format(len(text.split())))

# Generate a word cloud image
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)#max_font_size=100, max_words=500,

# Display the generated image:
# the matplotlib way:
# NOTE(review): this cloud uses a 100x100 figure and interpolation 'None',
# whereas the first two clouds use 50x50 and 'bilinear' - confirm it is intended.
plt.figure(figsize = (100, 100))
plt.imshow(wordcloud, interpolation='None')
plt.axis("off")
plt.show()


## word cloud for most expensive 25%

In [None]:
# The 516 highest-priced listings.
# NOTE(review): the section title says "25%", the variable name says "5" and the
# cut-off is a hard-coded row count - confirm which share of len(df) is meant.
df_5highest = df.sort_values(by=["price"], ascending=False).head(516)

In [None]:
# Concatenate the descriptions of the most expensive listings into one string.
text = " ".join(df_5highest.description)
# len(text) counts characters; split on whitespace so the number printed is
# really the word count that the message announces.
print ("There are {} words in the combination of all description.".format(len(text.split())))

# Generate a word cloud image
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)#max_font_size=100, max_words=500,

# Display the generated image:
# the matplotlib way:
# NOTE(review): this cloud uses a 100x100 figure and interpolation 'None',
# whereas the first two clouds use 50x50 and 'bilinear' - confirm it is intended.
plt.figure(figsize = (100, 100))
plt.imshow(wordcloud, interpolation='None')
plt.axis("off")
plt.show()


## Word2Vec

In [None]:
from gensim.models import Word2Vec

## Convert description words to vectors using a pre-trained embedding model (the code below loads GloVe Twitter, 200 dimensions)

## check the list of available pre-trained word vector models

import gensim.downloader as api

# Print the names of the pre-trained models offered by the gensim downloader.
print(list(api.info()['models'].keys()))

## load pre-trained model

# Fetches the model by name; presumably downloads it on first use - verify.
model = api.load("glove-twitter-200")
# Map every vocabulary word to its embedding vector.
# NOTE(review): `.wv`, `index2word` and `syn0` are gensim 3.x names; gensim 4
# renamed them (index_to_key / vectors) - confirm the installed version.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

## Or, train the word2vec using description data

## create list of sentences(sentence contain list of words)

import gensim

corpus = df["description"]

## create list of lists of unigrams
# Each description string becomes its list of whitespace-separated tokens.
lst_corpus = [sentence.split() for sentence in corpus]

## detect bigrams and trigrams
# The detectors are fitted here; the training code visible in this notebook
# feeds lst_corpus to Word2Vec directly and does not apply them.
space_delimiter = " ".encode()
bigrams_detector = gensim.models.phrases.Phraser(
    gensim.models.phrases.Phrases(
        lst_corpus, delimiter=space_delimiter, min_count=5, threshold=10))
trigrams_detector = gensim.models.phrases.Phraser(
    gensim.models.phrases.Phrases(
        bigrams_detector[lst_corpus], delimiter=space_delimiter, min_count=5, threshold=10))


len(lst_corpus)

## train a Word2Vec model from scratch

import gensim
# lst_corpus is a list of tokenized texts (i.e. a list of lists of tokens).
# Train embeddings on the listing descriptions themselves; this rebinds
# `model` and `w2v`, replacing the pre-trained vectors loaded earlier.
# NOTE(review): `size` / `iter` / `index2word` / `syn0` are gensim 3.x names;
# gensim 4 renamed them (vector_size / epochs / index_to_key / vectors).
model = gensim.models.Word2Vec(lst_corpus, size=300, window=8, min_count=1, sg=1, iter=30)
# Map every vocabulary word to its embedding vector.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

# Persist the trained model so later sessions can reload it without retraining.
model.save(output_folder+"/w2v.model")

In [None]:
from gensim.models import Word2Vec
# Reload the model saved under output_folder and rebuild the word -> vector map.
model=Word2Vec.load(output_folder+"/w2v.model")
# NOTE(review): index2word / syn0 are gensim 3.x attribute names.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

## define the word2vec vectorizer

In [None]:
import numpy as np  # imported here so the cell does not depend on a later cell


class MeanEmbeddingVectorizer(object):
    """Embed tokenised documents as the mean of their word vectors.

    Parameters
    ----------
    word2vec : mapping
        Maps each known word to a 1-D vector; all vectors must share one
        length. Must contain at least one entry.

    Attributes
    ----------
    dim : int
        Length of the embedding vectors, read from the first entry.
    """

    def __init__(self, word2vec):
        if not word2vec:
            # Fail with a clear message instead of a bare StopIteration.
            raise ValueError("word2vec must contain at least one word vector")
        self.word2vec = word2vec
        # Used to build a zero vector for documents without any known word.
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is optional so the object can be fitted without a target.
        """
        return self

    def transform(self, X):
        """Return an array of shape ``(len(X), dim)``, one row per document.

        Each row is the element-wise mean of the vectors of the document's
        words that are present in ``word2vec``. Unknown words are ignored; a
        document with no known word yields a zero vector.
        """
        zero_vector = np.zeros(self.dim)
        rows = []
        for words in X:
            known = [self.word2vec[w] for w in words if w in self.word2vec]
            rows.append(np.mean(known, axis=0) if known else zero_vector)
        # reshape keeps the column dimension even when X is empty.
        return np.array(rows).reshape(-1, self.dim)

In [None]:
len(next(iter(w2v.values())))

## tokenize

In [None]:
from nltk.tokenize import RegexpTokenizer

# The descriptions were joined back into strings for the n-gram / word-cloud
# cells; split them into word tokens again for the embedding vectorizer.
tokeniser = RegexpTokenizer(r'\w+')
df["description"] = df["description"].apply(tokeniser.tokenize)

In [None]:
# Spot-check vocabulary coverage: for the first few descriptions, print every
# token and whether the embedding dictionary knows it.
sample_descriptions = df.description.head()
for words in sample_descriptions:
    print(words)
    for w in words:
        print(w)
        print("yes it is in word2vec" if w in w2v else "no it is not")
        print("----")

## Get price and normalize price

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the target to the [0, 1] range; the result is a one-column frame.
# NOTE: `scaler` is re-bound by later cells, so keep a separate reference if the
# price scaling ever needs to be inverted.
scaler = MinMaxScaler()
scaled_price = scaler.fit_transform(df[["price"]])
labels_price = pd.DataFrame(scaled_price)

In [None]:
labels_price.shape

## Try use only description data to predict

## divide dataset into training data and testing data

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df["description"], labels_price, test_size=0.1, random_state=46) #random state=13 originally

## build the pipeline model with gradient boosting

In [None]:
import numpy as np
from sklearn import ensemble
from sklearn.pipeline import Pipeline

# Hyper-parameters of the baseline gradient-boosting regressor.
# NOTE(review): 'ls' is the legacy name of the squared-error loss; newer
# scikit-learn releases expect 'squared_error' - confirm the installed version.
params = dict(
    n_estimators=1000,
    max_depth=4,
    min_samples_split=5,
    learning_rate=0.01,
    loss='ls',
)
reg = ensemble.GradientBoostingRegressor(**params)

# Flatten the target to the 1-D shape the regressor is fitted on.
y_train_flat = np.ravel(y_train)

# Token lists -> mean word vectors -> gradient boosting.
graboo_w2v = Pipeline(steps=[
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("gradient boosting", reg),
])

In [None]:
## Convert X_train from a pandas Series to a NumPy array (may not be necessary).
## NOTE: not re-runnable - a second run fails because an ndarray has no `.values`.
X_train=X_train.values

## check what the transformed vectors look like

In [None]:
X_train.shape

In [None]:
# Apply the vectorizer on its own (outside the pipeline) so the resulting
# feature matrix can be inspected and reused by the grid searches below.
MEV=MeanEmbeddingVectorizer(w2v)
transformed_X_train=MEV.transform(X_train)

In [None]:
transformed_X_train

In [None]:
transformed_X_train.shape

## Grid Search
### Neural network (dense feed-forward layers; the LSTM layer is commented out), when only description data is used

#### Function to create model, required for KerasRegressor

In [None]:
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD

def create_model(learn_rate=0.001, amsgrad=False, activation='relu', dropout_rate=0.0, neurons=50):
    """Build and compile the dense regression network used by the grid search.

    Layers: Dense(neurons) -> Dropout(dropout_rate) -> Dense(2 * neurons)
    -> Dropout(dropout_rate / 2) -> Dense(neurons) -> Dense(1, sigmoid).
    No input shape is declared on the first layer. The sigmoid output matches
    a target that was min-max scaled to [0, 1].

    Parameters
    ----------
    learn_rate : float
        Learning rate passed to the Adam optimizer.
    amsgrad : bool
        Passed to Adam's ``amsgrad`` option.
    activation : str
        Activation of the three hidden Dense layers.
    dropout_rate : float
        Rate of the first Dropout layer; the second uses half of it.
    neurons : int
        Width of the first and third hidden layers; the second is twice as wide.

    Returns
    -------
    The compiled Sequential model (loss and metric: mean squared error).
    """
    layers = [
        Dense(neurons, activation=activation),
        Dropout(dropout_rate),
        Dense(neurons * 2, activation=activation),
        Dropout(dropout_rate / 2),
        Dense(neurons, activation=activation),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(
        loss='mse',
        optimizer=Adam(learning_rate=learn_rate, amsgrad=amsgrad),
        metrics=['mse'],
    )
    return model

# fix random seed for reproducibility (only NumPy's global generator is seeded)
seed = 7
numpy.random.seed(seed)

# Wrap the Keras builder so scikit-learn can drive it.
# NOTE: this rebinds the global name `model`, which earlier held the Word2Vec model.
model = KerasRegressor(build_fn=create_model, verbose=10)

# Candidate values explored by the grid search.
batch_size = [10, 20]
epochs = [50, 75, 100]
learn_rate = [0.0001, 0.001, 0.01]
amsgrad = [False]
activation = ['relu', 'sigmoid']
dropout_rate = [0.1, 0.2]
neurons = [50, 100]

param_grid = {
    'batch_size': batch_size,
    'epochs': epochs,
    'learn_rate': learn_rate,
    'amsgrad': amsgrad,
    'activation': activation,
    'dropout_rate': dropout_rate,
    'neurons': neurons,
}

# 5-fold search scored on R2 and RMSE; the best-R2 configuration is refitted.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=('r2', 'neg_root_mean_squared_error'),
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid_result = grid.fit(transformed_X_train, y_train)

# summarize results
print("Best: {:f} using {}".format(grid_result.best_score_, grid_result.best_params_))

In [None]:
print("-------------------------------------------------------------------")
print("Neural Network Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid.best_score_)
print("\n The best parameters across ALL searched params:\n",grid.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

# Per-combination mean test scores, in the same order as the parameter list.
param_com = grid.cv_results_['params']
r2_scores = grid.cv_results_['mean_test_r2']
rmse_scores = grid.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
for item, mean_r2, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(mean_r2))
    print("\n")
    # The scorer reports negated RMSE; flip the sign for display.
    print("test RMSE score:"+str(-neg_rmse))
    print("-------------------------------------------------------------------")


In [None]:
# Keep the searched parameters and the two mean test scores, best R2 first.
result_columns = ['param_batch_size','param_epochs', 'param_neurons','param_activation','param_learn_rate', 'param_dropout_rate', 'mean_test_r2','mean_test_neg_root_mean_squared_error'  ]
score_columns = ['mean_test_r2', 'mean_test_neg_root_mean_squared_error']
gd_result = pd.DataFrame(grid.cv_results_)[result_columns]
gd_result = gd_result.sort_values(by=score_columns, ascending=False)
gd_result.head(50)

In [None]:
gd_result.to_csv(output_folder+"/"+"gridsearch_NN_w2v_descriptiononly_5fold_corrected.csv", index=False)

## Grid Search
### Random Forest, when only description data is used

In [None]:
import numpy as np
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

# NOTE: the names GBR / grid_GBR are kept as they are, but the estimator tuned
# in this cell is a random forest.
GBR = ensemble.RandomForestRegressor()


# NOTE(review): max_features='auto' is only accepted by older scikit-learn
# releases - confirm the installed version.
parameters = {'bootstrap': [True, False],
              'max_depth': [5, 10, 20, 30, None],
              'max_features': ['auto', 'sqrt'],
              'n_estimators': [32, 64, 100, 500, 1000]}

# Flatten the target to the 1-D shape the regressor is fitted on.
y_train_flat=np.ravel(y_train)

# 5-fold search scored on R2 and RMSE; the best-R2 configuration is refitted.
grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters,scoring=('r2', 'neg_root_mean_squared_error'),refit='r2', verbose=10, cv = 5, n_jobs=-1)
grid_GBR.fit(transformed_X_train, y_train_flat)

print("-------------------------------------------------------------------")
# The banner previously said "Gradient Boosting" although this is the
# random-forest search.
print("Random Forest Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

# Per-combination mean test scores, in the same order as the parameter list.
param_com=grid_GBR.cv_results_['params']
r2_scores=grid_GBR.cv_results_['mean_test_r2']
rmse_scores=grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
for item, mean_r2, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(mean_r2))
    print("\n")
    # The scorer reports negated RMSE; flip the sign for display.
    print("test RMSE score:"+str(neg_rmse*-1))
    print("-------------------------------------------------------------------")


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)


gd_result=df_gridsearch_result[['param_n_estimators','param_max_depth','param_max_features','param_bootstrap','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]

In [None]:
gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

In [None]:
gd_result.to_csv(output_folder+"/"+"gridsearch_randomforest_w2v_selftrained_descriptiononly_5fold.csv", index=False)

## Grid Search
### Gradient Boosting, when only description data is used

In [None]:
import numpy as np
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

GBR = ensemble.GradientBoostingRegressor()

# Reduced grid; min_samples_split and loss stay at the estimator defaults.
parameters = {
    'n_estimators': [500, 1000],
    'max_depth': [4, 6],
    'learning_rate': [0.01, 0.02],
    'subsample': [1, 0.8],
}

# Flatten the target to the 1-D shape the regressor is fitted on.
y_train_flat = np.ravel(y_train)

# NOTE(review): this search uses cv=10 while the CSV written for it further
# down is named "..._5fold.csv" and the other searches use cv=5 - confirm.
grid_GBR = GridSearchCV(
    estimator=GBR,
    param_grid=parameters,
    scoring=('r2', 'neg_root_mean_squared_error'),
    refit='r2',
    cv=10,
    n_jobs=-1,
    verbose=10,
)
grid_GBR.fit(transformed_X_train, y_train_flat)

print("-------------------------------------------------------------------")
print("Gradient Boosting Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

# Per-combination mean test scores, in the same order as the parameter list.
param_com = grid_GBR.cv_results_['params']
r2_scores = grid_GBR.cv_results_['mean_test_r2']
rmse_scores = grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
for item, mean_r2, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(mean_r2))
    print("\n")
    # The scorer reports negated RMSE; flip the sign for display.
    print("test RMSE score:"+str(-neg_rmse))
    print("-------------------------------------------------------------------")


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)

In [None]:
gd_result=df_gridsearch_result[['param_n_estimators','param_max_depth','param_learning_rate','param_subsample','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]#'param_min_samples_split','param_loss',

In [None]:
gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

In [None]:
gd_result.to_csv(output_folder+"/"+"gridsearch_gradientboosting_W2V_self_trained_descriptiononly_5fold.csv", index=False)

## Cross Validation

In [None]:
## Check how long it takes to finish the cross-validation
import time
tic = time.perf_counter()

In [None]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation of the word2vec + gradient-boosting pipeline,
# keeping the training-fold scores as well as the test-fold scores.
cv_metrics = ('r2', 'neg_root_mean_squared_error')
scores = cross_validate(
    graboo_w2v,
    X_train,
    y_train_flat,
    scoring=cv_metrics,
    cv=10,
    return_train_score=True,
)

In [None]:
print("RMSE training Score using cv: {:0.5f}".format(scores['train_neg_root_mean_squared_error'].mean() * -1))

In [None]:
print("RMSE test Score using cv: {:0.5f}".format(scores['test_neg_root_mean_squared_error'].mean() * -1))

In [None]:
# 'r2' is a plain (non-negated) scorer, unlike 'neg_root_mean_squared_error',
# so the mean must be reported without flipping its sign.
print("R2 training Score using cv: {:0.5f}".format(scores['train_r2'].mean()))

In [None]:
# 'r2' is a plain (non-negated) scorer, unlike 'neg_root_mean_squared_error',
# so the mean must be reported without flipping its sign.
print("R2 test Score using cv: {:0.5f}".format(scores['test_r2'].mean()))

In [None]:
toc = time.perf_counter()
print(f"Finish cross validation in  {(toc - tic)/60:0.2f} minutes")

## Try to use all features to predict

## description word vectors


In [None]:
import numpy as np
# Embed the descriptions of EVERY row of df. Despite its name,
# transformed_X_train now covers the whole dataset, not only a training split;
# the train/test split for the all-features models happens later.
MEV=MeanEmbeddingVectorizer(w2v)
transformed_X_train=MEV.transform(df["description"])

In [None]:
df_desc=pd.DataFrame(transformed_X_train)

In [None]:
df_desc.shape

In [None]:
df_desc.head()

## numerical features

In [None]:
numerical_features=["bedroom","baths", 'Size', 'longitude', "latitude"]

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_num=df[numerical_features]
X_num.head()

## Normalization for numerical data (exclude longitude and latitude) using MinMaxScaler


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every numerical column to [0, 1]. fit_transform returns a bare
# array, so the frame is rebuilt with the original column names.
# NOTE: X_num still contains longitude and latitude here, so they are
# min-max scaled by this step as well.
X_num_columns = X_num.columns
scaler = MinMaxScaler()
X_num = pd.DataFrame(scaler.fit_transform(X_num), columns=X_num_columns)



## Normalization for longitude and latitude separately

In [None]:
# drop() returns a new frame, so the result must be assigned back. Without the
# assignment the min-max scaled longitude/latitude stayed in X_num and the
# concat below produced two columns with each of those names.
X_num = X_num.drop(["longitude", "latitude"], axis=1)

# Coordinates are scaled by a fixed factor instead of min-max scaling.
normed_long = df["longitude"] * 0.01
normed_lat = df["latitude"] * 0.01

# concat aligns on the index; assumes df still has the default 0..n-1 index
# that X_num was rebuilt with - TODO confirm if rows are ever filtered.
X_num = pd.concat([X_num, normed_long, normed_lat], axis=1)

## Boolean features

In [None]:
# Columns treated as boolean features (parking, amenity, exterior-finish,
# pet and nearby-facility columns). The order here becomes the column order
# of X_boo.
boolean_features=['parkingAttachedGarage',
       'parkingUnderground', 'parkingInsideEntry', 'parkingSurfaced',
       'parkingOversize', 'parkingGravel', 'parkingGarage', 'parkingShared',
       'parkingDetachedGarage', 'parkingCarport', 'parkingInterlocked',
       'parkingVisitorParking','amenityClubhouse', 'amenityCarWash', 'amenityMusicRoom',
       'amenityStorageLocker', 'amenitySauna', 'amenityPartyRoom',
       'amenityRecreationCentre', 'amenityGuestSuite', 'amenityFurnished',
       'amenityLaundryFacility', 'amenityExerciseCentre',
       'amenityLaundryInSuite', 'amenitySecurity', 'amenityWhirlpool',
       'efinishWood', 'efinishBrick', 'efinishHardboard', 'efinishWoodsiding',
       'efinishLog', 'efinishMetal', 'efinishSteel', 'efinishStone',
       'efinishWoodshingles', 'efinishStucco', 'efinishSiding',
       'efinishConcrete', 'efinishShingles', 'efinishAluminumsiding',
       'efinishCedarshingles', 'efinishVinyl', 'efinishVinylsiding',
       'featurePetNotAllowed', 'AirportNearby',
       'GolfNearby', 'MarinaNearby', 'ShoppingNearby', 'WaterNearby',
       'WorshipPlaceNearby', 'RecreationNearby', 'PlaygroundNearby',
       'PublicTransitNearby', 'ParkNearby', 'SchoolsNearby', 'HospitalNearby',
       'HighwayNearby', 'SkiAreaNearby']

# Select those columns from df; presumably already 0/1 or True/False - verify.
X_boo=df[boolean_features]

## Convert categorical data with string values into numerical values

In [None]:
# Take an explicit copy: the next cell assigns into X_category, and writing to
# a slice of df triggers pandas' SettingWithCopyWarning.
X_category = df[['location']].copy()

In [None]:
## Replace each categorical string value by its integer category code.
cate_features = ['location']
for col in cate_features:
    X_category[col] = X_category[col].astype('category').cat.codes

In [None]:
X_category.head()

## Normalize the categorical data 

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the category codes to [0, 1]. fit_transform returns a bare array,
# so the frame is rebuilt with the original column names.
X_category_columns = X_category.columns
scaler = MinMaxScaler()
X_category = pd.DataFrame(scaler.fit_transform(X_category), columns=X_category_columns)

X_category.head()

## Use numerical, boolean, categorical, and description data to predict

In [None]:
X_all = pd.concat([X_num, X_boo, X_category, df_desc], axis=1)
X_all.head()

## divide dataset

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_all, labels_price, test_size=0.1, random_state=13) 

In [None]:
# Convert both feature splits to plain float32 arrays.
X_train, X_test = (
    np.asarray(split).astype(np.float32) for split in (X_train, X_test)
)

In [None]:
X_train.shape

## Grid Search
### Neural network (dense feed-forward layers; the LSTM layer is commented out), when all features are used

#### Function to create model, required for KerasRegressor

In [None]:
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD

def create_model(learn_rate=0.001, amsgrad=False, activation='relu', dropout_rate=0.0, neurons=50):
    """Build and compile the dense regression network used by the grid search.

    Layers: Dense(neurons) -> Dropout(dropout_rate) -> Dense(2 * neurons)
    -> Dropout(dropout_rate / 2) -> Dense(neurons) -> Dense(1, sigmoid).
    No input shape is declared on the first layer. The sigmoid output matches
    a target that was min-max scaled to [0, 1].

    Parameters
    ----------
    learn_rate : float
        Learning rate passed to the Adam optimizer.
    amsgrad : bool
        Passed to Adam's ``amsgrad`` option.
    activation : str
        Activation of the three hidden Dense layers.
    dropout_rate : float
        Rate of the first Dropout layer; the second uses half of it.
    neurons : int
        Width of the first and third hidden layers; the second is twice as wide.

    Returns
    -------
    The compiled Sequential model (loss and metric: mean squared error).
    """
    layers = [
        Dense(neurons, activation=activation),
        Dropout(dropout_rate),
        Dense(neurons * 2, activation=activation),
        Dropout(dropout_rate / 2),
        Dense(neurons, activation=activation),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(
        loss='mse',
        optimizer=Adam(learning_rate=learn_rate, amsgrad=amsgrad),
        metrics=['mse'],
    )
    return model

# fix random seed for reproducibility (only NumPy's global generator is seeded)
seed = 7
numpy.random.seed(seed)

# Wrap the Keras builder so scikit-learn can drive it.
# NOTE: this rebinds the global name `model`, which earlier held the Word2Vec model.
model = KerasRegressor(build_fn=create_model, verbose=10)

# Candidate values explored by the grid search.
batch_size = [10, 20]
epochs = [50, 75, 100]
learn_rate = [0.0001, 0.001, 0.01]
amsgrad = [False]
activation = ['relu', 'sigmoid']
dropout_rate = [0.1, 0.2]
neurons = [50, 100]

param_grid = {
    'batch_size': batch_size,
    'epochs': epochs,
    'learn_rate': learn_rate,
    'amsgrad': amsgrad,
    'activation': activation,
    'dropout_rate': dropout_rate,
    'neurons': neurons,
}

# 5-fold search scored on R2 and RMSE; the best-R2 configuration is refitted.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=('r2', 'neg_root_mean_squared_error'),
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best: {:f} using {}".format(grid_result.best_score_, grid_result.best_params_))

In [None]:
print("-------------------------------------------------------------------")
print("Neural Network Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid.best_score_)
print("\n The best parameters across ALL searched params:\n",grid.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

# Per-combination mean test scores, in the same order as the parameter list.
param_com = grid.cv_results_['params']
r2_scores = grid.cv_results_['mean_test_r2']
rmse_scores = grid.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
for item, mean_r2, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(mean_r2))
    print("\n")
    # The scorer reports negated RMSE; flip the sign for display.
    print("test RMSE score:"+str(-neg_rmse))
    print("-------------------------------------------------------------------")


In [None]:
# Keep the searched parameters and the two mean test scores, best R2 first.
result_columns = ['param_batch_size','param_epochs', 'param_neurons','param_activation','param_learn_rate', 'param_dropout_rate', 'mean_test_r2','mean_test_neg_root_mean_squared_error'  ]
score_columns = ['mean_test_r2', 'mean_test_neg_root_mean_squared_error']
gd_result = pd.DataFrame(grid.cv_results_)[result_columns]
gd_result = gd_result.sort_values(by=score_columns, ascending=False)
gd_result.head(50)

In [None]:
gd_result.to_csv(output_folder+"/"+"gridsearch_NN_w2v_all_5fold_corrected.csv", index=False)

## Grid Search
### Random Forest, when all features are used

In [None]:
import numpy as np
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

# NOTE: the names GBR / grid_GBR are kept as they are, but the estimator tuned
# in this cell is a random forest.
GBR = ensemble.RandomForestRegressor()


# NOTE(review): max_features='auto' is only accepted by older scikit-learn
# releases - confirm the installed version.
parameters = {'bootstrap': [True, False],
              'max_depth': [10, 20, 30, None],#5, 
              'max_features': ['auto', 'sqrt'],
              'n_estimators': [32, 64, 100, 500]}#, 1000

# Flatten the target to the 1-D shape the regressor is fitted on.
y_train_flat=np.ravel(y_train)

# 5-fold search scored on R2 and RMSE; the best-R2 configuration is refitted.
grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters,scoring=('r2', 'neg_root_mean_squared_error'),refit='r2', verbose=10, cv = 5, n_jobs=-1)
grid_GBR.fit(X_train, y_train_flat)

print("-------------------------------------------------------------------")
# The banner previously said "Gradient Boosting" although this is the
# random-forest search.
print("Random Forest Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

# Per-combination mean test scores, in the same order as the parameter list.
param_com=grid_GBR.cv_results_['params']
r2_scores=grid_GBR.cv_results_['mean_test_r2']
rmse_scores=grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
for item, mean_r2, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(mean_r2))
    print("\n")
    # The scorer reports negated RMSE; flip the sign for display.
    print("test RMSE score:"+str(neg_rmse*-1))
    print("-------------------------------------------------------------------")


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)


gd_result=df_gridsearch_result[['param_n_estimators','param_max_depth','param_max_features','param_bootstrap','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]]

In [None]:
gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

In [None]:
gd_result.to_csv(output_folder+"/"+"gridsearch_randomforest_w2v_selftrained_all_5fold.csv", index=False)

## Grid Search
### Gradient Boosting, when all features are used

In [None]:
import numpy as np
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

GBR = ensemble.GradientBoostingRegressor()

# Reduced grid; min_samples_split and loss stay at the estimator defaults.
parameters = {
    'n_estimators': [500, 1000],
    'max_depth': [4, 6],
    'learning_rate': [0.01, 0.02],
    'subsample': [1, 0.8],
}

# Flatten the target to the 1-D shape the regressor is fitted on.
y_train_flat = np.ravel(y_train)

# 5-fold search scored on R2 and RMSE; the best-R2 configuration is refitted.
grid_GBR = GridSearchCV(
    estimator=GBR,
    param_grid=parameters,
    scoring=('r2', 'neg_root_mean_squared_error'),
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid_GBR.fit(X_train, y_train_flat)

print("-------------------------------------------------------------------")
print("Gradient Boosting Grid Search Results")
print("-------------------------------------------------------------------")
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best r2 score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n")
print("-------------------------------------------------------------------")
print("All Results:") 

# Per-combination mean test scores, in the same order as the parameter list.
param_com = grid_GBR.cv_results_['params']
r2_scores = grid_GBR.cv_results_['mean_test_r2']
rmse_scores = grid_GBR.cv_results_['mean_test_neg_root_mean_squared_error']
print("-------------------------------------------------------------------")
for item, mean_r2, neg_rmse in zip(param_com, r2_scores, rmse_scores):
    print("parameter combinations:"+str(item))
    print("\n")
    print("test r2 score:"+str(mean_r2))
    print("\n")
    # The scorer reports negated RMSE; flip the sign for display.
    print("test RMSE score:"+str(-neg_rmse))
    print("-------------------------------------------------------------------")


df_gridsearch_result = pd.DataFrame(grid_GBR.cv_results_)

result_columns = ['param_n_estimators','param_max_depth','param_learning_rate','param_subsample','mean_test_r2','mean_test_neg_root_mean_squared_error'  ]
gd_result = df_gridsearch_result[result_columns]

In [None]:
gd_result=gd_result.sort_values(by=['mean_test_r2', 'mean_test_neg_root_mean_squared_error'], ascending=False)
gd_result.head(50)

In [None]:
gd_result.to_csv(output_folder+"/"+"gridsearch_gradientboosting_W2V_self_trained_all_5fold.csv", index=False)

## create gradient boosting model

In [None]:
import numpy as np

# Hyper-parameters of the gradient-boosting regressor fitted on all features.
# NOTE(review): 'ls' is the legacy name of the squared-error loss; newer
# scikit-learn releases expect 'squared_error' - confirm the installed version.
params = dict(
    n_estimators=1000,
    max_depth=4,
    min_samples_split=5,
    learning_rate=0.01,
    loss='ls',
)
reg = ensemble.GradientBoostingRegressor(**params)

# Flatten the target to the 1-D shape the regressor is fitted on, then fit.
y_train_flat = np.ravel(y_train)
reg.fit(X_train, y_train_flat)

## do cross validation

In [None]:
import time

tic = time.perf_counter()

In [None]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation of the gradient-boosting regressor on all features,
# keeping the training-fold scores as well as the test-fold scores.
cv_metrics = ('r2', 'neg_root_mean_squared_error')
scores = cross_validate(
    reg,
    X_train,
    y_train_flat,
    scoring=cv_metrics,
    cv=10,
    return_train_score=True,
)

In [None]:
print("RMSE training Score using cv: {:0.5f}".format(scores['train_neg_root_mean_squared_error'].mean() * -1))

In [None]:
print("RMSE test Score using cv: {:0.5f}".format(scores['test_neg_root_mean_squared_error'].mean() * -1))

In [None]:
# 'r2' is a plain (non-negated) scorer, unlike 'neg_root_mean_squared_error',
# so the mean must be reported without flipping its sign.
print("R2 training Score using cv: {:0.5f}".format(scores['train_r2'].mean()))

In [None]:
# 'r2' is a plain (non-negated) scorer, unlike 'neg_root_mean_squared_error',
# so the mean must be reported without flipping its sign.
print("R2 test Score using cv: {:0.5f}".format(scores['test_r2'].mean()))

In [None]:
toc = time.perf_counter()
print(f"Finish cross validation in  {(toc - tic)/60:0.4f} minutes")

## Grid Search using all the features