In [1]:
import os

import numpy as np
from sklearn import metrics, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [2]:
def _read_lines(path, encoding):
    """Return the lines of ``path`` with their newline characters stripped."""
    with open(path, 'r', encoding=encoding) as f:
        return [line.strip('\n') for line in f]


def _load_split(split, data_dir, encoding):
    """Load one split ('train', 'val' or 'test') as ``(tweets, labels)``."""
    tweets = _read_lines(
        os.path.join(data_dir, 'mex_{}.txt'.format(split)), encoding)
    labels = [
        int(lab)
        for lab in _read_lines(
            os.path.join(data_dir, 'mex_{}_labels.txt'.format(split)), encoding)
    ]
    # A tweet without a label (or vice versa) would silently misalign every
    # following example, so fail here where the cause is obvious.
    if len(tweets) != len(labels):
        raise ValueError(
            "split '{}': {} tweets but {} labels".format(
                split, len(tweets), len(labels)))
    return tweets, labels


def read_data(data_dir='.', encoding='utf-8'):
    """Load the train, validation and test splits of the corpus.

    Each split is stored as two parallel text files, one example per line:
    ``mex_<split>.txt`` (tweets) and ``mex_<split>_labels.txt`` (integer
    labels).

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the six files. Defaults to the current working
        directory, which is where the files were originally looked up.
    encoding : str, optional
        Text encoding used to decode the files. It is passed explicitly so the
        result does not depend on the platform's locale default.
        NOTE(review): UTF-8 is assumed for the corpus -- confirm against the
        actual files.

    Returns
    -------
    tuple
        ``(tweets_train, labels_train, tweets_val, labels_val, tweets_test,
        labels_test)`` where the tweets are lists of ``str`` and the labels
        are lists of ``int``.

    Raises
    ------
    ValueError
        If a label line is not an integer, or if a split has a different
        number of tweets and labels.
    """
    tweets_train, labels_train = _load_split('train', data_dir, encoding)
    tweets_val, labels_val = _load_split('val', data_dir, encoding)
    tweets_test, labels_test = _load_split('test', data_dir, encoding)

    return tweets_train, labels_train, tweets_val, labels_val, tweets_test, labels_test

In [129]:
# Load the raw tweets (lists of str) and their labels for the three splits.
X_train, y_train, X_val, y_val, X_test, y_test = read_data()
# Report how many examples each split contains.
print(
    "{0} for training\n{1} for validation\n{2} for test".format(
        *map(len, (X_train, X_val, X_test))
    )
)

5544 for training
616 for validation
1540 for test


In [130]:
# Bag of character n-grams (4 to 7 characters long); min_df / max_df bound the
# number of training documents an n-gram may appear in for it to be kept.
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(4, 7),
    min_df=10,
    max_df=1000,
)
# The vocabulary is learned from the training tweets only; validation and test
# are projected onto that same vocabulary.
# NOTE(review): this rebinds X_train / X_val / X_test from raw text to the
# vectorized matrices, so re-run the loading cell before re-running this one.
X_train = vectorizer.fit_transform(X_train)
X_val, X_test = (vectorizer.transform(split) for split in (X_val, X_test))

In [131]:
# Sanity check on the vectorized training matrix: (documents, n-gram features).
X_train.shape

(5544, 27726)

In [132]:
# Search space for the SVC: 4 values of C x 4 values of gamma = 16 candidates.
parameters = {
    'C': [1, 1.5, 2, 2.5],
    'gamma': [0.005, 0.01, 0.05, 0.1],
}
# Base estimator; C and gamma are overridden by the grid for every candidate.
svr = SVC()
# 5-fold cross-validated grid search, ranked by macro-averaged F1, using 8 workers.
grid = GridSearchCV(
    estimator=svr,
    param_grid=parameters,
    scoring="f1_macro",
    cv=5,
    n_jobs=8,
    verbose=3,
)

In [133]:
# Run the full search: 16 candidates x 5 folds = 80 SVC fits
# (the recorded run took about 6 minutes with 8 workers).
grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:  1.2min
[Parallel(n_jobs=8)]: Done  80 out of  80 | elapsed:  6.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=8,
             param_grid={'C': [1, 1.5, 2, 2.5],
                         'gamma': [0.005, 0.01, 0.05, 0.1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=3)

In [134]:
# Score the selected model on the held-out validation split.
y_pred = grid.predict(X_val)
print(metrics.classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.80      0.91      0.85       397
           1       0.78      0.60      0.68       219

    accuracy                           0.80       616
   macro avg       0.79      0.75      0.77       616
weighted avg       0.80      0.80      0.79       616



In [135]:
# Show the estimator selected by the grid search.
# NOTE(review): in the recorded run the chosen C (2.5) is the largest value in
# the grid and the chosen gamma (0.005) the smallest, so the optimum may lie
# outside the searched range -- consider widening the grid.
print(grid.best_estimator_)

SVC(C=2.5, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.005, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [None]:
# Final evaluation of the selected model on the test split.
y_pred_test = grid.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=y_pred_test))

#### Experiments with CountVectorizer - word n-grams

Baseline:

* SVC with C=3, gamma=0.05, kernel='rbf'
* CountVectorizer with ngram by word(2,2), min_df=1, max_df=2000
* F1-score macro: 0.67

Exp1:

* SVC with C=2, gamma=0.05, kernel='rbf'
* CountVectorizer with ngram by word(1,3), min_df=1, max_df=2000
* F1-score macro: 0.72

Exp2:

* SVC with C=2, gamma=0.05, kernel='rbf'
* CountVectorizer with ngram by word(1,3), min_df=5, max_df=2000
* F1-score macro: 0.78

Exp3:

* SVC with C=2, gamma=0.05, kernel='rbf'
* CountVectorizer with ngram by word(1,3), min_df=8, max_df=2000
* F1-score macro: 0.79
* precision macro: 0.81
* recall macro: 0.78

Exp4:

* SVC with C=2, gamma=0.05, kernel='rbf'
* CountVectorizer with ngram by word(1,3), min_df=10, max_df=2000
* F1-score macro: 0.79
* precision macro: 0.81
* recall macro: 0.78

Exp5:

* SVC with C=2, gamma=0.05, kernel='rbf'
* CountVectorizer with ngram by word(1,3), min_df=8, max_df=2500
* F1-score macro: 0.79
* precision macro: 0.82
* recall macro: 0.78

Exp6:

* SVC with C=2.5, gamma=0.05, kernel='rbf'
* CountVectorizer with ngram by word(1,3), min_df=10, max_df=2500
* F1-score macro: 0.81
* precision macro: 0.83
* recall macro: 0.79

#### Experiments with CountVectorizer - character n-grams

Exp1:

* SVC with C=2, gamma=0.05, kernel='rbf'
* CountVectorizer with ngram by char(3,7), min_df=10, max_df=2000
* F1-score macro: 0.79
* precision macro: 0.81
* recall macro: 0.78

#### Experiments with TfidfVectorizer