# Random Forest

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## Preprocesamiento

In [2]:
#Importar la base de datos
#os.chdir('/Users/santiagoherreragarcia/OneDrive - Universidad de los Andes/Semestre 2022-1/Machine Learning/Trabajo Final/Data')
os.chdir('/Users/JuanJose/Library/CloudStorage/OneDrive-UniversidaddelosAndes/Uniandes/8 Semestre/Machine Learning/Proyecto Final/Trabajo Final/Data')
marcas=pd.read_pickle('Y Botometer.pkl')
main_characteristics=pd.read_pickle('model_characteristics.pkl')
#Encontrar cuentas marcadas:
main_characteristics=main_characteristics.merge(marcas)
marcas=main_characteristics['bot']
main_characteristics.drop(columns=['bot'], inplace=True)

In [3]:
lisdir=os.listdir()

In [4]:
dfs=[]
for i in os.listdir():
    if 'user_model' in i:
        print(i)
        df=pd.read_pickle(i)
        if i=="user_model_juandi.pkl":
            df=df[['Author ID','ADJ%','ADP%', 'ADV%', 'AUX%', 'CCONJ%', 'DET%', 'INTJ%', 'PART%', 'PRON%','PROPN%', 'PUNCT%', 'SCONJ%', 'SYM%', 'VERB%', 'X%', 'SPACE%']]
            df=df.groupby(['Author ID'], as_index=False).mean()
        dfs.append(df)
for i in range(0, len(dfs)):
    print(i)
    dfs[i]['Author ID']=dfs[i]['Author ID'].astype(int).astype(str)
    main_characteristics=main_characteristics.merge(dfs[i], on='Author ID')
main_characteristics.drop(columns=['Author ID'], inplace=True)

user_model_texto.pkl
user_model_analyze_descriptions.pkl
user_model_juandi.pkl
0
1
2


In [6]:
#Split
x_train, x_test, y_train, y_test = train_test_split(main_characteristics, marcas, 
                                                    test_size=0.3, random_state=123)

In [7]:
#Sólo estandarizamos
standard=('standardize', StandardScaler())
scaler=Pipeline(steps=[standard])

In [8]:
#Aplicar estandarizacion
columns_scale=x_train.drop(columns=['Author Verified']).columns
x_train[columns_scale]=scaler.fit_transform(x_train[columns_scale])
x_test[columns_scale]=scaler.fit_transform(x_test[columns_scale])

## Modelo

In [9]:
clf=RandomForestClassifier(random_state=123, n_jobs=-1)

In [10]:
clf.fit(x_train, y_train)

RandomForestClassifier(n_jobs=-1, random_state=123)

In [11]:
def model_cv(x,y,preprocessor,classifier,search_space):
    pipeline=Pipeline(
    steps=[('preprocesser', preprocessor), ('classifier', classifier)], 
    )
    clf=GridSearchCV(pipeline, search_space, scoring='accuracy', cv=3, verbose=4)
    clf.fit(x,y)
    
    return clf
    

In [12]:
yhat=clf.predict(x_test)

In [13]:
#Organizar los resultados por relevancia
Relevancia_PM=pd.DataFrame(clf.feature_importances_, index=x_train.columns).sort_values(by=0, ascending=False)
Relevancia_PM.to_latex(buf="Relevancia_PM.tex",index=False)

Unnamed: 0,0
TweetFeature Retweet,0.13009
TweetFeature Reply,0.090616
Tweet Retweets,0.044378
ADP%,0.032028
Author Following,0.03192
Author Followers,0.030429
Tweet Favorites,0.029596
TweetFeature Quoted,0.026807
SPACE%,0.024718
TweetFeature Distance Between Tweets (sd),0.02356


In [14]:
print('Accuracy: '+str(accuracy_score(yhat, y_test)))
print('Precision: '+str(precision_score(yhat, y_test)))
print('Recall: '+str(recall_score(yhat, y_test)))
print('F1: '+str(f1_score(yhat, y_test)))

Accuracy: 0.9122448979591836
Precision: 0.6217948717948718
Recall: 0.5808383233532934
F1: 0.6006191950464397


## Hiperparámetros

Primero un random search:

In [15]:
# Number of trees in random forest
n_estimators = np.linspace(100, 3000, int((3000-100)/200) + 1, dtype=int)
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [1, 5, 10, 20, 50, 75, 100, 150, 200]
# Minimum number of samples required to split a node
# min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 10, num = 9)]
min_samples_split = [1, 2, 5, 10, 15, 20, 30]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Criterion
criterion=['gini', 'entropy']
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion}

In [16]:
rf_base = RandomForestClassifier(random_state=123)
rf_random = RandomizedSearchCV(estimator = rf_base,
                               param_distributions = random_grid,
                               n_iter = 30, cv = 5,
                               verbose=2,
                               random_state=42, n_jobs = 4)
rf_random.fit(x_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END bootstrap=False, criterion=entropy, max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=3000; total time=  55.9s
[CV] END bootstrap=False, criterion=entropy, max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=3000; total time=  56.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=3000; total time=  56.1s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=sqrt, min_samples_leaf=3, min_samples_split=10, n_estimators=100; total time=   0.4s
[CV] END bootstrap=False, criterion=gini, max_depth=1, max_features=sqrt, min_samples_leaf=3, min_samples_split=10, n_estimators=100; total time=   0.4s
[CV] END bootstrap=False, criterion=entropy, max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=3000; total time=  56.5s
[CV] END

10 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/anaconda3/lib/p

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=123),
                   n_iter=30, n_jobs=4,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [1, 5, 10, 20, 50, 75, 100,
                                                      150, 200],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4],
                                        'min_samples_split': [1, 2, 5, 10, 15,
                                                              20, 30],
                                        'n_estimators': array([ 100,  307,  514,  721,  928, 1135, 1342, 1550, 1757, 1964, 2171,
       2378, 2585, 2792, 3000])},
                   random_state=42, verbose=2)

In [26]:
random_params=rf_random.best_params_

Luego intentamos con un grid search, una vez tengamos mejores candidatos

In [18]:
param_grid = {
    'n_estimators': np.linspace(random_params['n_estimators']-10, random_params['n_estimators']+10, 2, dtype = int),
    'max_depth': [abs(random_params['max_depth']-2),
                  abs(random_params['max_depth']-1),random_params['max_depth'], random_params['max_depth']+1,
                 random_params['max_depth']+2],
    'min_samples_split': [abs(random_params['min_samples_split']-1),random_params['min_samples_split'],
                         random_params['min_samples_split']+1],
    'min_samples_leaf': [abs(random_params['min_samples_leaf']-1), random_params['min_samples_leaf'],
                        random_params['min_samples_leaf']+1]
}
rf_base = RandomForestClassifier(random_state=123, criterion='gini',bootstrap=True)
rf_grid = GridSearchCV(estimator = rf_base,
                               param_grid = param_grid,
                               cv = 5,
                               verbose=2,
                               n_jobs = 4)
rf_grid.fit(x_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV] END max_depth=148, min_samples_leaf=1, min_samples_split=9, n_estimators=1125; total time=  21.9s
[CV] END max_depth=148, min_samples_leaf=1, min_samples_split=9, n_estimators=1125; total time=  22.7s
[CV] END max_depth=148, min_samples_leaf=1, min_samples_split=9, n_estimators=1125; total time=  22.9s
[CV] END max_depth=148, min_samples_leaf=1, min_samples_split=9, n_estimators=1125; total time=  23.7s
[CV] END max_depth=148, min_samples_leaf=1, min_samples_split=9, n_estimators=1125; total time=  22.1s
[CV] END max_depth=148, min_samples_leaf=1, min_samples_split=9, n_estimators=1145; total time=  21.7s
[CV] END max_depth=148, min_samples_leaf=1, min_samples_split=9, n_estimators=1145; total time=  22.4s
[CV] END max_depth=148, min_samples_leaf=1, min_samples_split=9, n_estimators=1145; total time=  23.5s
[CV] END max_depth=148, min_samples_leaf=1, min_samples_split=9, n_estimators=1145; total time=  22.9s
[CV] END ma

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=123), n_jobs=4,
             param_grid={'max_depth': [148, 149, 150, 151, 152],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [9, 10, 11],
                         'n_estimators': array([1125, 1145])},
             verbose=2)

In [25]:
rf_grid.best_params_

{'max_depth': 148,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 1125}

In [27]:
clf_hyperparams=RandomForestClassifier(random_state=123, n_jobs=-1,n_estimators=1125,min_samples_split=10,min_samples_leaf=1,max_depth=148)

In [28]:
clf_hyperparams.fit(x_train, y_train)

RandomForestClassifier(max_depth=148, min_samples_split=10, n_estimators=1125,
                       n_jobs=-1, random_state=123)

In [29]:
yhat_hiper=clf_hyperparams.predict(x_test)

In [30]:
pd.DataFrame(clf_hyperparams.feature_importances_, index=x_train.columns).sort_values(by=0, ascending=False)


Unnamed: 0,0
TweetFeature Retweet,0.130253
TweetFeature Reply,0.107741
Tweet Retweets,0.047283
Tweet Favorites,0.045829
TweetFeature Quoted,0.035757
ADP%,0.033054
Author Following,0.031568
Author Followers,0.030153
SPACE%,0.026081
Author Tweets,0.024502


In [31]:
print('Accuracy: '+str(accuracy_score(yhat_hiper, y_test)))
print('Precision: '+str(precision_score(yhat_hiper, y_test)))
print('Recall: '+str(recall_score(yhat_hiper, y_test)))
print('F1: '+str(f1_score(yhat_hiper, y_test)))

Accuracy: 0.9163265306122449
Precision: 0.6153846153846154
Recall: 0.6037735849056604
F1: 0.6095238095238096
