# Usando nuevo dataset

# Importando dataset

In [1]:
import jsonlines
import pandas as pd

with jsonlines.open('Dataset/SMHD/SMHD_eating_train.jsonl') as reader:
    count=0
    df_train = pd.DataFrame()
    for obj in reader:
        count=count+1
        lines = reader.read()
        df_inter=pd.json_normalize(lines, record_path=['posts'], meta=['id', 'label'])
        df_train=pd.concat([df_train,df_inter], axis=0)
        
with jsonlines.open('Dataset/SMHD/SMHD_eating_test.jsonl') as reader:
    count=0
    df_test = pd.DataFrame()
    for obj in reader:
        count=count+1
        lines = reader.read()
        df_inter=pd.json_normalize(lines, record_path=['posts'], meta=['id', 'label'])
        df_test=pd.concat([df_test,df_inter], axis=0)

with jsonlines.open('Dataset/SMHD/SMHD_eating_dev.jsonl') as reader:
    count=0
    df_dev = pd.DataFrame()
    for obj in reader:
        count=count+1
        lines = reader.read()
        df_inter=pd.json_normalize(lines, record_path=['posts'], meta=['id', 'label'])
        df_dev=pd.concat([df_dev,df_inter], axis=0)

### Borramos columnas innecesarias

In [2]:
data = pd.concat([df_train, df_test, df_dev])
data.drop(['selftext', 'body', 'created_utc', 'title'], axis=1, inplace=True)

In [3]:
data

Unnamed: 0,text,id,label
0,Added :),2003202,1
1,Added :),2003202,1
2,I lost 80lbs after moving out of my parents ho...,2003202,1
3,I tried adding you again. Let me know if it wo...,2003202,1
4,Added!,2003202,1
...,...,...,...
497,Thanks,1455910,0
498,[removed],1455910,0
499,Woooosh,1455910,0
500,Ipad air 2 - 9.0.1 jailbreak. Should I update?...,1455910,0


# Hacer join de los text de los mismos ids

In [4]:
data_grouped=data.groupby(['id', 'label']).text.apply(list).transform(lambda x : ' '.join(x)).reset_index()

# Tokenizado y eliminacion de stopwords

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

def preprocess(words, type='doc'):
    tokens = nltk.word_tokenize(words)
    porter = nltk.PorterStemmer()
    lemmas = [porter.stem(t) for t in tokens]
    stoplist = stopwords.words('english')
    lemmas_clean = [w for w in tokens if w not in stoplist]
    punctuation = set(string.punctuation)
    words = [w for w in lemmas_clean if  w not in punctuation]
    return words

def preprocesssimple(words, type='doc'):
    stoplist = stopwords.words('english')
    lemmas_clean = [w for w in words if w not in stoplist]
    return words

In [6]:
data_grouped['text'] = data_grouped.apply(lambda row: preprocess(row['text']), axis=1)

# Modelizacion

In [7]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

x = data_grouped['text']
y = data_grouped['label']

vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')

x = vectorizer.fit_transform(x.apply(lambda x: ' '.join(x)))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [8]:
from sklearn import svm

classifier = svm.SVC()
y_train=y_train.astype('int')
classifier.fit(x_train, y_train)

SVC()

In [22]:
y_pred = classifier.predict(x_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[42  5]
 [12 41]]
              precision    recall  f1-score   support

           0       0.78      0.89      0.83        47
           1       0.89      0.77      0.83        53

    accuracy                           0.83       100
   macro avg       0.83      0.83      0.83       100
weighted avg       0.84      0.83      0.83       100

0.83
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.8s
[CV] END ....................C=0.1, gamma=0.0001, kernel=rbf; total time=   1.0s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=   1.5s
[CV] END ........................C=10, gamma=0.1, kernel=rbf; total time=   1.4s
[CV] END .........................C=100, gamma=1, kernel=rbf; total time=   2.3s
[CV] END ......................C=1000, gamma=0.1, kernel=rbf; total time=   1.1s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   1.6s
[CV] END ......................C=1, gamma=0.0001, kernel=rbf; total time=   1.6s
[

# KFold CrossValScore

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

In [11]:
kf = KFold(n_splits=10, shuffle=False)
print('KFold CrossValScore Using Random Forest %s' % cross_val_score(classifier,x, y, cv=5).mean())

KFold CrossValScore Using Random Forest 0.8278154681139757


In [13]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

print(param_grid)

{'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}


In [16]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator =classifier, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [19]:
# fitting the model for grid search
grid.fit(x_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


GridSearchCV(cv=3, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=2)

In [20]:
# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
SVC(C=100, gamma=0.01)


In [29]:
# import the builtin time module
import time

# Grab Currrent Time Before Running the Code
start = time.time()


classifier = svm.SVC(C=100, gamma=0.01)

y_train=y_train.astype('int')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Grab Currrent Time After Running the Code
end = time.time()

#Subtract Start Time from The End Time
total_time = end - start
print("\n"+ str(total_time))

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))


0.8552811145782471
[[42  5]
 [12 41]]
              precision    recall  f1-score   support

           0       0.78      0.89      0.83        47
           1       0.89      0.77      0.83        53

    accuracy                           0.83       100
   macro avg       0.83      0.83      0.83       100
weighted avg       0.84      0.83      0.83       100

0.83
