### Librairies and Packages

In [3]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import cross_val_score
from IPython.display import Image  
#from pydotplus import *
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
### Import of datas

In [5]:
data = pd.read_csv("/Users/FelicieBizeul/Desktop/ETUDES/ENSAI/Erasmus/SL/project/Try_bow/climate_change_tweets_sample_to_work_on.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,username,user_handle,date,retweets,favorites,text,label
0,0,WWF Climate & Energy,climateWWF,2020-04-28,11,22,Economic recovery and national climate pledges...,0
1,1,WWF Climate & Energy,climateWWF,2020-04-22,6,16,In this difficult time it’s hard to connect w...,0
2,2,WWF Climate & Energy,climateWWF,2020-04-01,43,69,The decision to postpone COP is unavoidab...,0
3,3,WWF Climate & Energy,climateWWF,2020-03-30,24,30,Japan the world’s fifth largest emitter of g...,0
4,4,WWF Climate & Energy,climateWWF,2020-03-30,22,40,How can countries include NatureBasedSolutio...,0


Here we use the file which already have been pre-processed, so our datas are cleans of '#' '@' etc

### Define the train and test samples of X and Y

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(data, data.label, test_size=0.33, random_state=42)

print("X_train.shape is : ", X_train.shape)
print("X_test.shape is : ", X_test.shape)
print("Y_train.shape is : ", Y_train.shape)
print("Y_test.shape is : ", Y_test.shape)

X_train.shape is :  (12066, 8)
X_test.shape is :  (5943, 8)
Y_train.shape is :  (12066,)
Y_test.shape is :  (5943,)


### Instantiation of our bags of words

In [7]:
# Declaration of the object with some conditions
vectorizer = CountVectorizer(stop_words = 'english', binary=True, min_df = 10)

# For create the document term matrix
X_train = vectorizer.fit_transform(X_train["text"])
X_test = vectorizer.transform(X_test["text"])

In [8]:
## Random forest

Firstly, we perform a random forest with random hyperparamaters

In [9]:
rf_random_hp = RandomForestClassifier(n_estimators = 10, random_state=0)
rf_random_hp.fit(X_train,Y_train)
print("Training set score : {:.3f}".format(rf_random_hp.score(X_train, Y_train)))
print("Test set score : {:.3f}".format(rf_random_hp.score(X_test, Y_test)))

Training set score : 0.995
Test set score : 0.907


In [10]:
scores_rf_random_hp = cross_val_score(rf_random_hp, X_train, Y_train, cv=10)
print("CV accuracy : {}".format(scores_rf_random_hp)) 
print("Mean CV accuracy : {:.2f}".format(np.mean(scores_rf_random_hp)))

CV accuracy : [0.89478045 0.90886495 0.90637945 0.90803645 0.91300746 0.91052196
 0.92288557 0.90298507 0.90961857 0.91127695]
Mean CV accuracy : 0.91


In [12]:
y_pred_training = rf_random_hp.predict(X_train)
con_mat_training = confusion_matrix(Y_train,y_pred_training)
print("Matrice de confusion d'entraînement' : {}".format(con_mat_training))

Matrice de confusion d'entraînement' : [[5008   15]
 [  45 6998]]


In [13]:
y_pred_testing = rf_random_hp.predict(X_test)
con_mat_testing = confusion_matrix(Y_test,y_pred_testing)
print("Matrice de confusion de test : {}".format(con_mat_testing))

Matrice de confusion de test : [[2182  233]
 [ 319 3209]]


Then, we tried to find the best random forest model. We need to import some news packages : 

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor

In [18]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 10)]

In [19]:
# Number of features to consider at every split
max_features = ['auto', 'sqrt']

In [20]:
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 110, num = 11)]
max_depth.append(None)

In [21]:
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

In [22]:
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

In [23]:
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [24]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [5, 15, 26, 36, 47, 57, 68, 78, 89, 99, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [10, 231, 452, 673, 894, 1115, 1336, 1557, 1778, 2000]}


In [None]:
# Give the best parameter for a random forest model
search_hp = RandomizedSearchCV(estimator = RandomForestRegressor(), param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=0, n_jobs = -1)
search_hp.fit(X_train,Y_train) # Be careful it take a little bit of time!!!
search_hp.best_params_

In [None]:
I would like to try with n_iter = 100 but it was too long to compute, so I tried with n_iter = 1 just to see
if it was working, and it was. I will make to program turn this night with 100.

In [None]:
# With n_iter = 1, we have :
best_rf_model = RandomForestClassifier(n_estimators = 100 ,min_samples_split= 2,min_samples_leaf= 1,max_features = 'sqrt', max_depth = None ,bootstrap = True,random_state=0)
best_rf_model.fit(X_train,Y_train)

In [None]:
scores_best_rf_model = cross_val_score(best_rf_model, X_train, Y_train, cv=10)
mean_best = np.mean(scores_best_rf_model)