In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

### IMPORTANTE: correr el archivo Data Processing.ipynb que se encuentra en la carpeta Data/ antes de leer los .csv

In [None]:
# Read the processed train/test CSV files and the sample submission file.
train, test, sub_sample = map(
    pd.read_csv,
    (
        "Data/train_processed_tf_idf_bow.csv",
        "Data/test_processed_tf_idf_bow.csv",
        "Data/sample_submission.csv",
    ),
)

# Quick sanity check: (rows, columns) of each loaded frame.
print(train.shape, test.shape, sub_sample.shape)

## SVC

Como parte del preprocesamiento se escalan los datos aplicando estandarización (`StandardScaler`) y como modelo se usa un SVC (`sklearn.svm.SVC`).

In [None]:
# Columns removed before modelling (identifier and raw/cleaned text fields).
# NOTE(review): presumably every remaining column is numeric — confirm against the processed CSVs.
features_to_drop = ['id', 'keyword','location','text','location_clean','text_clean', 'hashtags', 'mentions','links']
scaler = StandardScaler()

# Feature matrices; the training frame additionally carries the label column.
X_train = train.drop(columns=features_to_drop + ['target'])
X_test = test.drop(columns=features_to_drop)
y_train = train['target']

svc = SVC(gamma='auto')

# The scaler is a pipeline step, so it is fit together with the classifier
# on whatever data `fit` receives (including each fold during cross-validation).
pipeline = Pipeline([('scale', scaler), ('svc', svc)])

pipeline.fit(X_train, y_train)
# Despite its name, `y_test` holds the model's predictions for the test set,
# not ground-truth labels.
y_test = pipeline.predict(X_test)

# Fill a copy of the submission template with the predictions.
submit = sub_sample.copy()
# Item assignment always writes the column. Attribute assignment
# (`submit.target = ...`) only does so when a 'target' column already exists;
# otherwise pandas sets a plain attribute and the CSV would lack predictions.
submit['target'] = y_test
# NOTE(review): the file name says "lr" although the model is an SVC — confirm
# this does not overwrite a submission produced by another notebook.
submit.to_csv('submit_lr.csv', index=False)

In [None]:
# Score of the fitted pipeline on the same data it was trained on
# (not a held-out estimate).
train_accuracy = pipeline.score(X_train, y_train)
print('Training accuracy: %.4f' % train_accuracy)

In [None]:
# F1 score of the fitted pipeline's predictions on the training set.
train_pred = pipeline.predict(X_train)
train_f1 = f1_score(y_train, train_pred)
print('Training set f-1 score: %.4f' % train_f1)

In [None]:
# Confusion matrix of the fitted pipeline on the training set.
# scikit-learn convention: rows are true classes, columns are predicted classes.
pd.DataFrame(confusion_matrix(y_train, pipeline.predict(X_train)))

## Evaluación del Modelo

In [None]:
# Cross-validation: 5 random splits holding out 20% each, with a fixed seed, scored by F1.
# NOTE(review): ShuffleSplit does not stratify by class; consider
# StratifiedShuffleSplit if the target is imbalanced.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=123)
cv_score = cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=cv,
    scoring='f1',
)
mean_f1 = np.mean(cv_score)
print('Cross validation F-1 score: %.3f' % mean_f1)