# Usando nuevo dataset

# Importando dataset

In [54]:
import jsonlines
import pandas as pd

with jsonlines.open('Dataset/SMHD/SMHD_eating_train.jsonl') as reader:
    count=0
    df_train = pd.DataFrame()
    for obj in reader:
        count=count+1
        lines = reader.read()
        df_inter=pd.json_normalize(lines, record_path=['posts'], meta=['id', 'label'])
        df_train=pd.concat([df_train,df_inter], axis=0)
        
with jsonlines.open('Dataset/SMHD/SMHD_eating_test.jsonl') as reader:
    count=0
    df_test = pd.DataFrame()
    for obj in reader:
        count=count+1
        lines = reader.read()
        df_inter=pd.json_normalize(lines, record_path=['posts'], meta=['id', 'label'])
        df_test=pd.concat([df_test,df_inter], axis=0)

with jsonlines.open('Dataset/SMHD/SMHD_eating_dev.jsonl') as reader:
    count=0
    df_dev = pd.DataFrame()
    for obj in reader:
        count=count+1
        lines = reader.read()
        df_inter=pd.json_normalize(lines, record_path=['posts'], meta=['id', 'label'])
        df_dev=pd.concat([df_dev,df_inter], axis=0)

### Borramos columnas innecesarias

In [55]:
data = pd.concat([df_train, df_test, df_dev])
data.drop(['selftext', 'body', 'created_utc', 'title'], axis=1, inplace=True)

In [56]:
data

Unnamed: 0,text,id,label
0,Added :),2003202,1
1,Added :),2003202,1
2,I lost 80lbs after moving out of my parents ho...,2003202,1
3,I tried adding you again. Let me know if it wo...,2003202,1
4,Added!,2003202,1
...,...,...,...
497,Thanks,1455910,0
498,[removed],1455910,0
499,Woooosh,1455910,0
500,Ipad air 2 - 9.0.1 jailbreak. Should I update?...,1455910,0


# Hacer join de los text de los mismos ids

In [57]:
data_grouped=data.groupby(['id', 'label']).text.apply(list).transform(lambda x : ' '.join(x)).reset_index()

# Tokenizado y eliminacion de stopwords

In [58]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

def preprocess(words, type='doc'):
    tokens = nltk.word_tokenize(words)
    porter = nltk.PorterStemmer()
    lemmas = [porter.stem(t) for t in tokens]
    stoplist = stopwords.words('english')
    lemmas_clean = [w for w in tokens if w not in stoplist]
    punctuation = set(string.punctuation)
    words = [w for w in lemmas_clean if  w not in punctuation]
    return words

def preprocesssimple(words, type='doc'):
    stoplist = stopwords.words('english')
    lemmas_clean = [w for w in words if w not in stoplist]
    return words

# Modelizacion

In [60]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

x = data_grouped['text']
y = data_grouped['label']

vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')

x = vectorizer.fit_transform(x.apply(lambda x: ' '.join(x)))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [61]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
y_train=y_train.astype('int')
classifier.fit(x_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [62]:
y_pred = classifier.predict(x_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[42  5]
 [ 9 44]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.86        47
           1       0.90      0.83      0.86        53

    accuracy                           0.86       100
   macro avg       0.86      0.86      0.86       100
weighted avg       0.86      0.86      0.86       100

0.86


# Kfold CrossValScore

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

In [69]:
kf = KFold(n_splits=5, shuffle=False)
print('KFold CrossValScore Using Random Forest %s with 5 splits' % cross_val_score(classifier,x, y, cv=5).mean())

kf = KFold(n_splits=10, shuffle=False)
print('KFold CrossValScore Using Random Forest %s with 10 splits' % cross_val_score(classifier,x, y, cv=10).mean())

kf = KFold(n_splits=15, shuffle=False)
print('KFold CrossValScore Using Random Forest %s with 15 splits' % cross_val_score(classifier,x, y, cv=15).mean())

kf = KFold(n_splits=20, shuffle=False)
print('KFold CrossValScore Using Random Forest %s with 20 splits' % cross_val_score(classifier,x, y, cv=20).mean())

KFold CrossValScore Using Random Forest 0.8368611488014472 with 5 splits
KFold CrossValScore Using Random Forest 0.8277183600713013 with 10 splits
KFold CrossValScore Using Random Forest 0.833596837944664 with 15 splits
KFold CrossValScore Using Random Forest 0.8277573529411765 with 20 splits


In [73]:
import pickle
pickle.dump(classifier, open("RandomForest.sav", 'wb'))
pickle.dump(vectorizer, open("TfidfVectorizer.pickle", 'wb'))

In [76]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [79]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator =classifier, param_grid = random_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 4320 candidates, totalling 12960 fits
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   3.1s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1200; total time=   8.1s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=  13.8s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=1600; total time=  11.9s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   5.3s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=1400; total time=   9.4s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=

In [85]:
# print best parameter after tuning
print(grid_search.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid_search.best_estimator_)

{'bootstrap': True, 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 400}
RandomForestClassifier(max_depth=10, min_samples_split=10, n_estimators=400,
                       random_state=0)


In [92]:
# import the builtin time module
import time

# Grab Currrent Time Before Running the Code
start = time.time()

classifier = RandomForestClassifier(max_depth=10, min_samples_split=10, n_estimators=400,random_state=0)
y_train=y_train.astype('int')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)


# Grab Currrent Time After Running the Code
end = time.time()

#Subtract Start Time from The End Time
total_time = end - start
print("\n"+ str(total_time))

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))


1.7763688564300537
[[43  4]
 [10 43]]
              precision    recall  f1-score   support

           0       0.81      0.91      0.86        47
           1       0.91      0.81      0.86        53

    accuracy                           0.86       100
   macro avg       0.86      0.86      0.86       100
weighted avg       0.87      0.86      0.86       100

0.86
