# Usando nuevo dataset

# Importando dataset

In [13]:
import jsonlines
import pandas as pd

with jsonlines.open('Dataset/SMHD/SMHD_eating_train.jsonl') as reader:
    count=0
    df_train = pd.DataFrame()
    for obj in reader:
        count=count+1
        lines = reader.read()
        df_inter=pd.json_normalize(lines, record_path=['posts'], meta=['id', 'label'])
        df_train=pd.concat([df_train,df_inter], axis=0)
        
with jsonlines.open('Dataset/SMHD/SMHD_eating_test.jsonl') as reader:
    count=0
    df_test = pd.DataFrame()
    for obj in reader:
        count=count+1
        lines = reader.read()
        df_inter=pd.json_normalize(lines, record_path=['posts'], meta=['id', 'label'])
        df_test=pd.concat([df_test,df_inter], axis=0)

with jsonlines.open('Dataset/SMHD/SMHD_eating_dev.jsonl') as reader:
    count=0
    df_dev = pd.DataFrame()
    for obj in reader:
        count=count+1
        lines = reader.read()
        df_inter=pd.json_normalize(lines, record_path=['posts'], meta=['id', 'label'])
        df_dev=pd.concat([df_dev,df_inter], axis=0)

### Borramos columnas innecesarias

In [14]:
data = pd.concat([df_train, df_test, df_dev])
data.drop(['selftext', 'body', 'created_utc', 'title'], axis=1, inplace=True)

In [15]:
data

Unnamed: 0,text,id,label
0,Added :),2003202,1
1,Added :),2003202,1
2,I lost 80lbs after moving out of my parents ho...,2003202,1
3,I tried adding you again. Let me know if it wo...,2003202,1
4,Added!,2003202,1
...,...,...,...
497,Thanks,1455910,0
498,[removed],1455910,0
499,Woooosh,1455910,0
500,Ipad air 2 - 9.0.1 jailbreak. Should I update?...,1455910,0


# Hacer join de los text de los mismos ids

In [16]:
data_grouped=data.groupby(['id', 'label']).text.apply(list).transform(lambda x : ' '.join(x)).reset_index()

# Tokenizado y eliminacion de stopwords

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

def preprocess(words, type='doc'):
    tokens = nltk.word_tokenize(words)
    porter = nltk.PorterStemmer()
    lemmas = [porter.stem(t) for t in tokens]
    stoplist = stopwords.words('english')
    lemmas_clean = [w for w in tokens if w not in stoplist]
    punctuation = set(string.punctuation)
    words = [w for w in lemmas_clean if  w not in punctuation]
    return words

def preprocesssimple(words, type='doc'):
    stoplist = stopwords.words('english')
    lemmas_clean = [w for w in words if w not in stoplist]
    return words

In [18]:
data_grouped['text'] = data_grouped.apply(lambda row: preprocess(row['text']), axis=1)

# Modelizacion

In [19]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

x = data_grouped['text']
y = data_grouped['label']

vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')

x = vectorizer.fit_transform(x.apply(lambda x: ' '.join(x)))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [21]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
y_train=y_train.astype('int')
classifier.fit(x_train, y_train)

LogisticRegression()

In [22]:
y_pred = classifier.predict(x_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score
from sklearn.feature_selection import f_regression
import numpy as np

#MAE
#print(mean_absolute_error(y_test, y_pred))
#MSE
#print(mean_squared_error(y_test, y_pred))
#RMSE
#print(np.sqrt(mean_squared_error(y_test,y_pred)))

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[44  3]
 [11 42]]
              precision    recall  f1-score   support

           0       0.80      0.94      0.86        47
           1       0.93      0.79      0.86        53

    accuracy                           0.86       100
   macro avg       0.87      0.86      0.86       100
weighted avg       0.87      0.86      0.86       100

0.86


# KFold CrossValScore

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

In [25]:
kf = KFold(n_splits=5, shuffle=False)
print('KFold CrossValScore Using Random Forest %s with 5 splits' % cross_val_score(classifier,x, y, cv=5).mean())

kf = KFold(n_splits=10, shuffle=False)
print('KFold CrossValScore Using Random Forest %s with 10 splits' % cross_val_score(classifier,x, y, cv=5).mean())

kf = KFold(n_splits=15, shuffle=False)
print('KFold CrossValScore Using Random Forest %s with 15 splits' % cross_val_score(classifier,x, y, cv=5).mean())

kf = KFold(n_splits=20, shuffle=False)
print('KFold CrossValScore Using Random Forest %s with 20 splits' % cross_val_score(classifier,x, y, cv=5).mean())

KFold CrossValScore Using Random Forest 0.8369516056083219 with 5 splits
KFold CrossValScore Using Random Forest 0.8369516056083219 with 10 splits
KFold CrossValScore Using Random Forest 0.8369516056083219 with 15 splits
KFold CrossValScore Using Random Forest 0.8369516056083219 with 20 splits


In [31]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# defining parameter range
param_grid = {'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear']}

print(param_grid)

{'penalty': ['l1', 'l2'], 'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]), 'solver': ['liblinear']}


In [32]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator =classifier, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [33]:
# fitting the model for grid search
grid.fit(x_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
             verbose=2)

In [34]:
# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 4.281332398719396, 'penalty': 'l2', 'solver': 'liblinear'}
LogisticRegression(C=4.281332398719396, solver='liblinear')


In [45]:
# import the builtin time module
import time

# Grab Currrent Time Before Running the Code
start = time.time()

classifier = LogisticRegression(C=4.281332398719396, solver='liblinear')

y_train=y_train.astype('int')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)


# Grab Currrent Time After Running the Code
end = time.time()

#Subtract Start Time from The End Time
total_time = end - start
print("\n"+ str(total_time))


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))


0.0720682144165039
[[43  4]
 [11 42]]
              precision    recall  f1-score   support

           0       0.80      0.91      0.85        47
           1       0.91      0.79      0.85        53

    accuracy                           0.85       100
   macro avg       0.85      0.85      0.85       100
weighted avg       0.86      0.85      0.85       100

0.85
