<a href="https://colab.research.google.com/github/Huertas97/Sentiment_Analysis/blob/main/sst2_models/ML/SST2_Gensim_Pretrainded_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

In this notebook we will use a pre-trained Word2Vec model from Gensim to extract the word embeddings that ML algorithms will use as features to learn how to predict sentiment polarity in English tweets. 

# Import packages

In [None]:
!pip install -U -q Unidecode
import sys
import unidecode
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer


[?25l[K     |█▍                              | 10kB 22.2MB/s eta 0:00:01[K     |██▊                             | 20kB 11.9MB/s eta 0:00:01[K     |████                            | 30kB 8.3MB/s eta 0:00:01[K     |█████▌                          | 40kB 6.8MB/s eta 0:00:01[K     |██████▉                         | 51kB 4.4MB/s eta 0:00:01[K     |████████▏                       | 61kB 4.9MB/s eta 0:00:01[K     |█████████▋                      | 71kB 5.1MB/s eta 0:00:01[K     |███████████                     | 81kB 5.2MB/s eta 0:00:01[K     |████████████▎                   | 92kB 5.3MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 4.3MB/s eta 0:00:01[K     |███████████████                 | 112kB 4.3MB/s eta 0:00:01[K     |████████████████▍               | 122kB 4.3MB/s eta 0:00:01[K     |█████████████████▊              | 133kB 4.3MB/s eta 0:00:01[K     |███████████████████▏            | 143kB 4.3MB/s eta 0:00:01[K     |████████████████████▌    

# Load the SST2 data 

Clone the GitHub repository

In [None]:
# Clone the repository and all the dependencies
!git clone https://github.com/Huertas97/Sentiment_Analysis.git

Cloning into 'Sentiment_Analysis'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 30 (delta 8), reused 23 (delta 4), pack-reused 0[K
Unpacking objects: 100% (30/30), done.


Extract the SST2 train set

In [None]:
import io
import pandas as pd
# Load the data from SST2
def loadFile(fpath):
    """Read one SST-2 split into parallel lists of sentences and labels.

    Every non-blank line is stripped and split on TAB characters; the first
    field is taken as the sentence and the second as its integer label.
    Blank lines (for instance a trailing empty line) are skipped instead of
    aborting the whole load.

    Parameters
    ----------
    fpath : str
        Path of the UTF-8 encoded, tab-separated file.

    Returns
    -------
    dict
        ``{'X': [sentence, ...], 'y': [label, ...]}`` with both lists in
        file order.

    Raises
    ------
    IndexError
        If a non-blank line has no second (label) field.
    ValueError
        If a label is not an integer, or the file contains no samples.
    AssertionError
        If the largest label found is not 1 (binary task sanity check).
    """
    sst_data = {'X': [], 'y': []}
    with io.open(fpath, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Nothing to parse on an empty / whitespace-only line.
                continue
            sample = stripped.split('\t')
            sst_data['y'].append(int(sample[1]))
            sst_data['X'].append(sample[0])
    # Sanity check kept from the original loader: labels must top out at 1.
    assert max(sst_data['y']) == 2 - 1
    return sst_data

In [None]:
# Training split: raw sentences/labels, then the same data as a DataFrame
sst2_train = loadFile("/content/Sentiment_Analysis/sst_2_data/sentiment-train")
sst2_df_train = pd.DataFrame(dict(text=sst2_train["X"], labels=sst2_train["y"]))

# Development split
sst2_dev = loadFile("/content/Sentiment_Analysis/sst_2_data/sentiment-dev")
sst2_df_dev = pd.DataFrame(dict(text=sst2_dev["X"], labels=sst2_dev["y"]))

# Test split
sst2_test = loadFile("/content/Sentiment_Analysis/sst_2_data/sentiment-test")
sst2_df_test = pd.DataFrame(dict(text=sst2_test["X"], labels=sst2_test["y"]))

# Preprocess the text data

For both TF-IDF and Word2Vec it is important to preprocess the text, because the quality of a sentence embedding depends on the words that make up the sentence. If stopwords (e.g., but, and, so) are not removed, they add noise to the embedding, since these words carry little information for our task. 

In [None]:
def preprocessor(text, stoptext = "nltk", lemmatizer = "nltk"):
    """Clean one raw text and return it as a space-joined string of lemmas.

    Steps, in order: unidecode transliteration, removal of upper-case
    section headers, lower-casing, "e-mail" normalisation, p-value
    placeholders ("hppv" for significant, "lppv" for non-significant),
    padding of punctuation with spaces, replacement of isolated numbers by
    "num", NLTK tokenisation, removal of punctuation and stop-word tokens,
    and lemmatisation.

    Parameters
    ----------
    text : str
        Raw input text.
    stoptext : {"nltk", "spacy", "clinical", "long"}
        Which stop-word list to remove. Only "nltk" is self-contained; the
        other options read the globals ``spacy_stopwords``,
        ``clinical_stopwords`` and ``long_stopwords`` respectively.
        NOTE(review): those globals are not defined in the visible part of
        this notebook - define them before selecting these options.
    lemmatizer : {"nltk", "spacy"}
        Lemmatiser backend. NOTE(review): the "spacy" option uses the
        global name ``spacy``, which is not imported in the visible part of
        this notebook.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.

    Raises
    ------
    ValueError
        If ``stoptext`` or ``lemmatizer`` is not one of the supported
        options (previously this surfaced as an UnboundLocalError).
    """
    from string import punctuation

    # Fail fast on unsupported options before doing any text processing.
    stop_words = _preprocessor_stop_words(stoptext)
    if lemmatizer not in ("nltk", "spacy"):
        raise ValueError(
            "Unsupported lemmatizer option: %r (expected 'nltk' or 'spacy')" % (lemmatizer,)
        )

    # Transliterate with unidecode
    text = unidecode.unidecode(text)

    # Remove introduction words for sections (upper-case words followed by ":")
    text = re.sub(r"[A-Z]{0,}\s[A-Z]+:", "", text)

    # Lowercase and remove surrounding spaces
    text = text.strip().lower()

    # "e-mail" / "e - mail" -> "email"
    text = re.sub(r"e\s?-\s?mail", "email", text)

    # Substitute p-values by placeholder tokens
    text = re.sub(r'p\s?[<=]\s?0?[.,]0[0-5]+', 'hppv', text)   # significant
    text = re.sub(r'p\s?[>=]\s?[\d]+[.,]?\d*', 'lppv', text)   # non-significant

    # Surround every punctuation mark with spaces (e.g. "=" -> " = ") so the
    # number patterns below see numbers as isolated tokens. A single pass
    # over a character class gives the same result as one substitution per
    # mark, because padding one mark never creates or removes another.
    punctuation_marks = set(punctuation)
    punctuation_marks.update(chr(177))  # plus-minus sign
    punctuation_class = "[" + re.escape("".join(sorted(punctuation_marks))) + "]"
    text = re.sub(punctuation_class, r" \g<0> ", text)

    # Substitute irrelevant (isolated) numbers by "num"
    text = re.sub(r'[^A-Za-z][\-~]?[0-9][0-9]*\s?[.,]?\s?[0-9]+[^A-Za-z]', " num ", text)
    text = re.sub(r"\s[0-9]+\s", " num ", text)

    # Tokenize, then drop punctuation-only tokens and stop words
    tokenized_text = [
        token
        for token in nltk.word_tokenize(text)
        if token not in punctuation_marks and token not in stop_words
    ]

    # Lemmatization
    if lemmatizer == "nltk":
        lemmatize = WordNetLemmatizer().lemmatize
        lemmatized_text = [lemmatize(word) for word in tokenized_text]
    else:  # "spacy" (validated above)
        doc = _preprocessor_spacy_pipeline()(" ".join(tokenized_text))
        lemmatized_text = [token.lemma_ for token in doc]

    # Join all the text
    return " ".join(lemmatized_text)


# Per-kernel cache of expensive resources, so they are loaded once instead
# of once per processed text.
_PREPROCESSOR_CACHE = {}


def _preprocessor_stop_words(stoptext):
    """Return the stop-word collection selected by ``stoptext`` as a set."""
    if stoptext == "nltk":
        if "nltk_stopwords" not in _PREPROCESSOR_CACHE:
            _PREPROCESSOR_CACHE["nltk_stopwords"] = frozenset(
                nltk.corpus.stopwords.words('english')
            )
        return _PREPROCESSOR_CACHE["nltk_stopwords"]
    if stoptext == "spacy":
        return frozenset(spacy_stopwords)
    if stoptext == "clinical":
        return frozenset(clinical_stopwords)
    if stoptext == "long":
        return frozenset(long_stopwords)
    raise ValueError(
        "Unsupported stoptext option: %r "
        "(expected 'nltk', 'spacy', 'clinical' or 'long')" % (stoptext,)
    )


def _preprocessor_spacy_pipeline():
    """Return the spaCy pipeline used for lemmatisation, loading it once."""
    if "spacy_nlp" not in _PREPROCESSOR_CACHE:
        _PREPROCESSOR_CACHE["spacy_nlp"] = spacy.load('en', disable=['parser', 'ner'])
    return _PREPROCESSOR_CACHE["spacy_nlp"]

In [None]:
from tqdm.auto import tqdm

In [None]:
# Run the NLTK-based cleaning pipeline over every sentence of each split,
# showing one progress bar per split.
clean_sst2_train = [
    preprocessor(raw_text, stoptext="nltk", lemmatizer="nltk")
    for raw_text in tqdm(sst2_df_train.text.to_list(), desc="Train cleaning")
]

clean_sst2_dev = [
    preprocessor(raw_text, stoptext="nltk", lemmatizer="nltk")
    for raw_text in tqdm(sst2_df_dev.text.to_list(), desc="Dev cleaning")
]

clean_sst2_test = [
    preprocessor(raw_text, stoptext="nltk", lemmatizer="nltk")
    for raw_text in tqdm(sst2_df_test.text.to_list(), desc="Test cleaning")
]

HBox(children=(FloatProgress(value=0.0, description='Train cleaning', max=67349.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Dev cleaning', max=872.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Test cleaning', max=1821.0, style=ProgressStyle(descripti…




## Create Word2Vec model

In [None]:
import gensim # pip install gensim
from gensim.models.word2vec import Word2Vec # word2vec model gensim class
TaggedDocument = gensim.models.doc2vec.TaggedDocument

In [None]:
from sklearn.model_selection import train_test_split

# Split every cleaned sentence back into a list of tokens
tokenized_tr = [
    nltk.word_tokenize(sentence)
    for sentence in tqdm(clean_sst2_train, desc="Tokenize train")
]
print(tokenized_tr[0])  # quick look at the first tokenised training sentence
tokenized_dev = [
    nltk.word_tokenize(sentence)
    for sentence in tqdm(clean_sst2_dev, desc="Tokenize dev")
]
tokenized_te = [
    nltk.word_tokenize(sentence)
    for sentence in tqdm(clean_sst2_test, desc="Tokenize test")
]

HBox(children=(FloatProgress(value=0.0, description='Tokenize train', max=67349.0, style=ProgressStyle(descrip…


['hide', 'new', 'secretion', 'parental', 'unit']


HBox(children=(FloatProgress(value=0.0, description='Tokenize dev', max=872.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Tokenize test', max=1821.0, style=ProgressStyle(descripti…




# TF-IDF embeddings

TF-IDF has several parameters: `max_features` (the number of terms kept as features, i.e. the size of the vocabulary used to compute TF-IDF), `min_df` and `max_df` (terms whose document frequency is below or above these thresholds are left out of the vocabulary), and `ngram_range` (whether to consider unigrams, bigrams, trigrams...). 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

print('building tf-idf matrix ...')
max_features = 5000
# min_df=1 keeps every term, exactly like the previous integer 0 did, but is
# also accepted by scikit-learn releases that validate the parameter range.
vectorizer = TfidfVectorizer(max_features=max_features, min_df=1, max_df=0.8,
                             strip_accents='unicode', ngram_range=(1, 3))

vectorizer.fit(clean_sst2_train)

# get_feature_names() was removed from recent scikit-learn versions in favour
# of get_feature_names_out(); support both so the cell runs on either.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Term -> inverse document frequency. With ngram_range=(1, 3) the keys include
# bigrams and trigrams as well as single words, all competing for the
# `max_features` slots; Text2Vec looks up single tokens only and skips any
# token that has no entry here.
IDFs = dict(zip(feature_names, vectorizer.idf_))
print('size of vocabulary obtained with TfidfVectorizer:', len(IDFs))
print("Some idfs:")
aux = list(IDFs.items())
for i in list(range(3))+list(range(1000,1005)):
    print("  ", aux[i])

building tf-idf matrix ...
size of vocabulary obtained with TfidfVectorizer: 5000
Some idfs:
   ('19th', 9.284444837782294)
   ('19th century', 9.284444837782294)
   ('20th', 9.026615728480195)
   ('debt', 9.227286423942346)
   ('debut', 7.183184248707819)
   ('decade', 7.760949355148918)
   ('decent', 7.313637137105253)
   ('decent performance', 9.227286423942346)


# Combine TF-IDF and Word2Vec

In [None]:


def Text2Vec(tokens, size, vectors=None, idfs=None):
    """Build one sentence vector as the mean of IDF-weighted word vectors.

    Each token that has both a word vector and an IDF weight contributes
    ``vector * idf``; the sum is divided by the number of contributing
    tokens. Tokens missing from either lookup are skipped.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one sentence.
    size : int
        Dimensionality of the word vectors.
    vectors : mapping-like, optional
        Word -> vector lookup that raises ``KeyError`` for unknown words.
        May also be an object exposing the lookup through a ``.wv``
        attribute. Defaults to the global ``glove_vectors``.
    idfs : mapping, optional
        Word -> IDF weight. Defaults to the global ``IDFs``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``; all zeros when no token contributed.
    """
    if vectors is None:
        vectors = glove_vectors
    if idfs is None:
        idfs = IDFs

    # Older gensim objects expose the lookup as `.wv`; newer KeyedVectors are
    # indexed directly and have no such attribute, so fall back to the object
    # itself instead of letting an AttributeError escape the KeyError handler.
    keyed_vectors = getattr(vectors, "wv", vectors)

    vec = np.zeros((1, size))
    count = 0
    for word in tokens:
        try:
            # The word embedding is multiplied by the word's IDF weight.
            weighted = keyed_vectors[word].reshape((1, size)) * idfs[word]
        except KeyError:
            # Token unknown to the embedding model or to the TF-IDF vocabulary.
            continue
        vec += weighted
        count += 1
    if count:
        vec /= count
    return vec

In [None]:
# Download the pre-trained model from Gensim
# NOTE: despite the variable name, the model requested here is the Google News
# Word2Vec one ('word2vec-google-news-300'), not GloVe. The name is left as-is
# because Text2Vec reads `glove_vectors` as a global.
import gensim.downloader
glove_vectors = gensim.downloader.load('word2vec-google-news-300')



In [None]:
vec_dim = 300  # matches the 300 in 'word2vec-google-news-300'


def _embed_corpus(tokenized_texts, desc):
    """Return a (n_texts, vec_dim) array with one Text2Vec row per text.

    `desc` is the label shown on the tqdm progress bar.
    """
    vecs = np.zeros((len(tokenized_texts), vec_dim))
    for row, tokens in tqdm(enumerate(tokenized_texts), total=len(tokenized_texts), desc=desc):
        # Text2Vec returns shape (1, vec_dim); it is broadcast into the row.
        vecs[row] = Text2Vec(tokens, vec_dim)
    return vecs


vecs_train = _embed_corpus(tokenized_tr, "Train vecs")
vecs_dev = _embed_corpus(tokenized_dev, "Dev vecs")
vecs_test = _embed_corpus(tokenized_te, "Test vecs")

HBox(children=(FloatProgress(value=0.0, description='Train vecs', max=67349.0, style=ProgressStyle(description…

  





HBox(children=(FloatProgress(value=0.0, description='Dev vecs', max=872.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Test vecs', max=1821.0, style=ProgressStyle(description_w…




# ML models

In [None]:

from sklearn import model_selection, naive_bayes, svm
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
!pip install wandb -qq

## Naive Bayes

In [None]:
# Gold labels of the training and test splits as plain Python lists
y_tr = sst2_df_train["labels"].tolist()
y_te = sst2_df_test["labels"].tolist()

In [None]:
from sklearn.naive_bayes import GaussianNB
import wandb
# Start a Weights & Biases run; everything logged below is attached to it
# until wandb.finish() is called at the end of the cell.
wandb.init(project="sklearn-sst2-gensim")
# Gold labels of the training and test splits
y_tr = sst2_df_train.labels.to_list()
y_te = sst2_df_test.labels.to_list()


# Gaussian Naive Bayes fitted on the sentence vectors of the training split
Naive = naive_bayes.GaussianNB()
Naive.fit(vecs_train, y_tr)

# Predict the labels of the test split (vecs_test)
predictions_NB = Naive.predict(vecs_test)

# Use accuracy_score function to get the accuracy (printed as a percentage)
print("Naive Bayes Accuracy Score -> ",accuracy_score(y_te, predictions_NB)*100)
# Print the precision and recall, among other metrics
print(metrics.classification_report(y_te, predictions_NB, digits=3))
# Print the confusion matrix
print(metrics.confusion_matrix(y_te, predictions_NB))
# Matthews correlation coefficient between gold and predicted labels
print("MCC", matthews_corrcoef(y_te, predictions_NB))

# Visualize all classifier plots in the wandb run; class probabilities for the
# test split are passed via y_probas.
wandb.sklearn.plot_classifier(Naive, vecs_train, vecs_test, y_tr, y_te, predictions_NB, y_probas=Naive.predict_proba(vecs_test),
                              labels= ["Negative", "Positive"],
                              model_name='Naive Bayes', 
                              feature_names= None)
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting Naive Bayes.
[34m[1mwandb[0m: Logged feature importances.


Naive Bayes Accuracy Score ->  66.61175178473367
              precision    recall  f1-score   support

           0      0.857     0.400     0.546       912
           1      0.608     0.933     0.736       909

    accuracy                          0.666      1821
   macro avg      0.732     0.667     0.641      1821
weighted avg      0.733     0.666     0.641      1821

[[365 547]
 [ 61 848]]
MCC 0.39343999097342797


[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision recall curve.


VBox(children=(Label(value=' 0.04MB of 0.04MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
_step,6
_runtime,26
_timestamp,1610043027


0,1
_step,▁▂▃▅▆▇█
_runtime,▁▁▅▅███
_timestamp,▁▁▅▅███


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# L1-regularised logistic regression baseline (liblinear solver)
model = LogisticRegression(
    penalty='l1',
    solver="liblinear",
    fit_intercept=True,
    max_iter=1000,
    random_state=0,
)
model.fit(vecs_train, y_tr)

# Predict the labels of the test split
predictions_LR = model.predict(vecs_test)

# Accuracy as a percentage, followed by the per-class report
print("Logistic Regression Accuracy Score -> ", accuracy_score(y_te, predictions_LR) * 100)
print(metrics.classification_report(y_te, predictions_LR))

# L1

In [None]:
from sklearn.metrics import  make_scorer
# Hyper-parameter grid explored by the cross-validated search
tuned_parameters = [{'C': np.logspace(-3, 1, 6), "max_iter": [1000]}]

# Metrics to tune for; each entry is handed to GridSearchCV as `scoring`.
scores = ["accuracy"]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # `scoring=score` ties the search to the metric announced above (the
    # scorer used to be hard-coded to accuracy regardless of `score`).
    # NOTE(review): `knn` is a misnomer here - this is a logistic-regression
    # search - but the name is kept in case other cells read it.
    knn = GridSearchCV(
        LogisticRegression(penalty='l1', solver="liblinear"),
        param_grid=tuned_parameters,
        scoring=score,
        cv=5,
        n_jobs=2,
    )

    # 5-fold cross-validation on the training vectors
    knn.fit(vecs_train, y_tr)

    print("Best parameters set found on development set:")
    print()
    print(knn.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = knn.cv_results_['mean_test_score']
    stds = knn.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, knn.cv_results_['params']):
        # Mean CV score with a +/- 2 standard deviation band
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Evaluate the refitted best estimator on the test split
    y_true, y_pred = y_te, knn.predict(vecs_test)
    print("Accuracy Score -> ", accuracy_score(y_true, y_pred)*100)
    print("MCC", matthews_corrcoef(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'C': 0.039810717055349734, 'max_iter': 1000}

Grid scores on development set:

0.763 (+/-0.003) for {'C': 0.001, 'max_iter': 1000}
0.790 (+/-0.002) for {'C': 0.00630957344480193, 'max_iter': 1000}
0.796 (+/-0.003) for {'C': 0.039810717055349734, 'max_iter': 1000}
0.796 (+/-0.003) for {'C': 0.25118864315095824, 'max_iter': 1000}
0.796 (+/-0.003) for {'C': 1.584893192461114, 'max_iter': 1000}
0.796 (+/-0.003) for {'C': 10.0, 'max_iter': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

Accuracy Score ->  77.64964305326744
MCC 0.5613056067357155
              precision    recall  f1-score   support

           0       0.83      0.69      0.76       912
           1       0.74      0.86      0.79       909

    accuracy                           0.78      1821
   macro avg       0.78      0.78      0.77      182

In [None]:
import wandb
# Start a Weights & Biases run for the final L1 logistic-regression model
wandb.init(project="sklearn-sst2-gensim")
# C is hard-coded to the best value reported by the grid search above
log_l1 = LogisticRegression(penalty='l1', solver = "liblinear", max_iter = 1000,
                         C=0.039810717055349734)
log_l1.fit(vecs_train, y_tr)
# Accuracy on the test split, printed as a percentage
y_true, y_pred = y_te, log_l1.predict(vecs_test)
print("Accuracy Score -> ",accuracy_score(y_true, y_pred)*100)

# Predict the labels of the test split (same call as y_pred above)
predictions_LR = log_l1.predict(vecs_test)

# Visualize all classifier plots in the wandb run; class probabilities for the
# test split are passed via y_probas.
wandb.sklearn.plot_classifier(log_l1, vecs_train, vecs_test, y_tr, y_te, predictions_LR, y_probas=log_l1.predict_proba(vecs_test),
                              labels= ["Negative", "Positive"],
                              model_name='Log Reg L1', 
                              feature_names= None)
wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting Log Reg L1.
[34m[1mwandb[0m: Logged feature importances.


Accuracy Score ->  77.64964305326744


[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision recall curve.


VBox(children=(Label(value=' 0.04MB of 0.04MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
_step,7
_runtime,58
_timestamp,1610043565


0,1
_step,▁▂▃▄▅▆▇█
_runtime,▁▆▆▇▇███
_timestamp,▁▆▆▇▇███


# KNN

In [None]:
from sklearn.metrics import  make_scorer
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes explored by the cross-validated search
tuned_parameters = [{'n_neighbors':[1, 3, 5, 7, 11, 15, 20, 25, 30, 50, 100, 150, 200]}]

# Metrics to tune for; each entry is handed to GridSearchCV as `scoring`.
scores = ["accuracy"]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # `scoring=score` ties the search to the metric announced above (the
    # scorer used to be hard-coded to accuracy regardless of `score`).
    knn = GridSearchCV(
        KNeighborsClassifier(),
        tuned_parameters,
        scoring=score,
        cv=5,
    )

    # 5-fold cross-validation on the training vectors.
    # NOTE(review): the recorded run selected n_neighbors=1 with ~0.88 CV
    # accuracy but only ~65% on the test split; a gap that large suggests
    # near-duplicate samples shared between CV folds - worth checking.
    knn.fit(vecs_train, y_tr)

    print("Best parameters set found on development set:")
    print()
    print(knn.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = knn.cv_results_['mean_test_score']
    stds = knn.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, knn.cv_results_['params']):
        # Mean CV score with a +/- 2 standard deviation band
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Evaluate the refitted best estimator on the test split
    y_true, y_pred = y_te, knn.predict(vecs_test)
    print("Accuracy Score -> ", accuracy_score(y_true, y_pred)*100)
    print("MCC", matthews_corrcoef(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'n_neighbors': 1}

Grid scores on development set:

0.879 (+/-0.010) for {'n_neighbors': 1}
0.864 (+/-0.005) for {'n_neighbors': 3}
0.850 (+/-0.005) for {'n_neighbors': 5}
0.839 (+/-0.007) for {'n_neighbors': 7}
0.823 (+/-0.009) for {'n_neighbors': 11}
0.814 (+/-0.008) for {'n_neighbors': 15}
0.810 (+/-0.007) for {'n_neighbors': 20}
0.800 (+/-0.005) for {'n_neighbors': 25}
0.800 (+/-0.004) for {'n_neighbors': 30}
0.790 (+/-0.004) for {'n_neighbors': 50}
0.778 (+/-0.003) for {'n_neighbors': 100}
0.773 (+/-0.004) for {'n_neighbors': 150}
0.771 (+/-0.004) for {'n_neighbors': 200}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

Accuracy Score ->  65.40362438220758
MCC 0.3166991832162829
              precision    recall  f1-score   support

           0       0.70      0.54      0.61       912
           1       0.6

# RF

In [None]:
from sklearn.metrics import  make_scorer
from sklearn.ensemble import RandomForestClassifier

# Forest sizes and tree depths explored by the cross-validated search
tuned_parameters = [{'n_estimators':[50, 100, 150, 200, 300, 400, 500],
                     'max_depth': [10, 20, 30, 40]}]

# Metrics to tune for; each entry is handed to GridSearchCV as `scoring`.
scores = ["accuracy"]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # `scoring=score` ties the search to the metric announced above (the
    # scorer used to be hard-coded to accuracy regardless of `score`).
    # The base estimator no longer sets max_depth: every grid candidate
    # overrides it, so the former max_depth=3 never took effect.
    # NOTE(review): `knn` is a misnomer here - this is a random-forest
    # search - but the name is kept in case other cells read it.
    knn = GridSearchCV(
        RandomForestClassifier(random_state=0),
        tuned_parameters,
        scoring=score,
        cv=2,
    )

    # 2-fold cross-validation on the training vectors
    knn.fit(vecs_train, y_tr)

    print("Best parameters set found on development set:")
    print()
    print(knn.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = knn.cv_results_['mean_test_score']
    stds = knn.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, knn.cv_results_['params']):
        # Mean CV score with a +/- 2 standard deviation band
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Evaluate the refitted best estimator on the test split
    y_true, y_pred = y_te, knn.predict(vecs_test)
    print("Accuracy Score -> ", accuracy_score(y_true, y_pred)*100)
    print("MCC", matthews_corrcoef(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'max_depth': 30, 'n_estimators': 400}

Grid scores on development set:

0.823 (+/-0.002) for {'max_depth': 10, 'n_estimators': 50}
0.828 (+/-0.005) for {'max_depth': 10, 'n_estimators': 100}
0.830 (+/-0.006) for {'max_depth': 10, 'n_estimators': 150}
0.831 (+/-0.005) for {'max_depth': 10, 'n_estimators': 200}
0.831 (+/-0.004) for {'max_depth': 10, 'n_estimators': 300}
0.832 (+/-0.004) for {'max_depth': 10, 'n_estimators': 400}
0.833 (+/-0.003) for {'max_depth': 10, 'n_estimators': 500}
0.849 (+/-0.003) for {'max_depth': 20, 'n_estimators': 50}
0.854 (+/-0.003) for {'max_depth': 20, 'n_estimators': 100}
0.856 (+/-0.004) for {'max_depth': 20, 'n_estimators': 150}
0.857 (+/-0.004) for {'max_depth': 20, 'n_estimators': 200}
0.859 (+/-0.004) for {'max_depth': 20, 'n_estimators': 300}
0.859 (+/-0.003) for {'max_depth': 20, 'n_estimators': 400}
0.860 (+/-0.003) for {'max_depth': 20, 'n_estimators': 500}
0.