In [1]:
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    make_scorer
)
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_precision_recall
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report 
from sklearn.model_selection import GridSearchCV, cross_val_predict

# Classificazione con Word Embeddings

In questo notebook è mostrata la classificazione dei testi con delle rappresentazioni basate sulle combinazioni delle word embeddings.
Sono stati testati 3 diversi dataset (descritti nel report e nel file we_dataset_creation.ipynb):
1) Dataset Mean,
2) Dataset Diff,
3) Dataset Min_Dist

## DATASET MEAN

In [2]:
# Load the train/test splits of the "mean" word-embedding dataset.
# The archive is opened once (instead of once per array) and closed as soon as
# the four arrays have been read; each array is materialized when it is indexed.
with np.load('../data/we/TED_we_mean.npz') as npz_mean:
    X_train_mean = npz_mean['X_train']
    y_train_mean = npz_mean['y_train']
    X_test_mean = npz_mean['X_test']
    y_test_mean = npz_mean['y_test']

### Model Selection

In [4]:
# Search for the best C and max_iter with a 5-fold cross-validation that uses
# the training set only.
parameters = {
    'C': [0.01, 0.1, 1, 10, 100],  # regularization parameter (strength is inversely proportional to C)
    'penalty': ['l2'],
    'dual': [True],
    'max_iter': [1000, 2000, 3000],
}

# With dual=True the solver shuffles the data using a pseudo-random generator,
# so the seed is fixed to make the search reproducible after a kernel restart.
clf = LinearSVC(random_state=42)
gs = GridSearchCV(
    clf,
    parameters,
    scoring='accuracy',
    n_jobs=-1,
    refit=True,  # refit the best configuration so `gs` can be used for prediction
    cv=5,
)
gs.fit(X_train_mean, y_train_mean)
print(gs.best_estimator_)
print(gs.best_params_)
print(gs.best_score_)

LinearSVC(C=10, dual=True)
{'C': 10, 'dual': True, 'max_iter': 1000, 'penalty': 'l2'}
0.509375


### Test

In [5]:
# Evaluate the refitted best model on the held-out "mean" test split.
# With refit=True, gs.predict delegates to the best estimator found by the search.
y_pred = gs.predict(X_test_mean)
print(classification_report(y_true=y_test_mean, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.47      0.49      0.48       400
           1       0.47      0.45      0.46       400

    accuracy                           0.47       800
   macro avg       0.47      0.47      0.47       800
weighted avg       0.47      0.47      0.47       800



## DATASET DIFF

In [3]:
# Load the train/test splits of the "diff" word-embedding dataset.
# The archive is opened once (instead of once per array) and closed as soon as
# the four arrays have been read; each array is materialized when it is indexed.
with np.load('../data/we/TED_we_diff.npz') as npz_diff:
    X_train_diff = npz_diff['X_train']
    y_train_diff = npz_diff['y_train']
    X_test_diff = npz_diff['X_test']
    y_test_diff = npz_diff['y_test']

### Model Selection

In [9]:
# cerco gli iperparametri C e max_iter migliori con una 5 fold crossvalidation sul training set.
# Hyper-parameter grid for the linear SVM, explored with a 5-fold
# cross-validation that uses the training set only.
parameters = {
    'C': [0.01, 0.1, 1, 10, 100],  # regularization parameter (strength is inversely proportional to C)
    'penalty': ['l2'],
    'dual': [True],
    'max_iter': [1000, 2000, 3000],
}

# With dual=True the solver shuffles the data using a pseudo-random generator,
# so the seed is fixed to make the search reproducible after a kernel restart.
clf = LinearSVC(random_state=42)
gs = GridSearchCV(
    clf,
    parameters,
    scoring='accuracy',
    n_jobs=-1,
    refit=True,  # refit the best configuration so `gs` can be used for prediction
    cv=5,
)
gs.fit(X_train_diff, y_train_diff)
print(gs.best_estimator_)
print(gs.best_params_)
print(gs.best_score_)

LinearSVC(C=0.01, dual=True)
{'C': 0.01, 'dual': True, 'max_iter': 1000, 'penalty': 'l2'}
0.55975


### Test

In [10]:
# Evaluate the refitted best model on the held-out "diff" test split.
# With refit=True, gs.predict delegates to the best estimator found by the search.
y_pred = gs.predict(X_test_diff)
print(classification_report(y_true=y_test_diff, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.58      0.56      0.57       400
           1       0.58      0.60      0.59       400

    accuracy                           0.58       800
   macro avg       0.58      0.58      0.58       800
weighted avg       0.58      0.58      0.58       800



## DATASET MIN_DIST

In [4]:
# Load the train/test splits of the "min_dist" word-embedding dataset.
# The archive is opened once (instead of once per array) and closed as soon as
# the four arrays have been read; each array is materialized when it is indexed.
with np.load('../data/we/TED_we_dist.npz') as npz_dist:
    X_train_dist = npz_dist['X_train']
    y_train_dist = npz_dist['y_train']
    X_test_dist = npz_dist['X_test']
    y_test_dist = npz_dist['y_test']

### Model Selection

In [12]:
# Search for the best C and max_iter with a 5-fold cross-validation that uses
# the training set only.
parameters = {
    'C': [0.01, 0.1, 1, 10, 100],  # regularization parameter (strength is inversely proportional to C)
    'penalty': ['l2'],
    'dual': [True],
    'max_iter': [1000, 2000, 3000],
}

# With dual=True the solver shuffles the data using a pseudo-random generator,
# so the seed is fixed to make the search reproducible after a kernel restart.
clf = LinearSVC(random_state=42)
gs = GridSearchCV(
    clf,
    parameters,
    scoring='accuracy',
    n_jobs=-1,
    refit=True,  # refit the best configuration so `gs` can be used for prediction
    cv=5,
)
gs.fit(X_train_dist, y_train_dist)
print(gs.best_estimator_)
print(gs.best_params_)
print(gs.best_score_)

LinearSVC(C=0.01, dual=True)
{'C': 0.01, 'dual': True, 'max_iter': 1000, 'penalty': 'l2'}
0.63225


### Test

In [13]:
# Evaluate the refitted best model on the held-out "min_dist" test split.
# With refit=True, gs.predict delegates to the best estimator found by the search.
y_pred = gs.predict(X_test_dist)
print(classification_report(y_true=y_test_dist, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.61      0.73      0.66       400
           1       0.66      0.53      0.59       400

    accuracy                           0.63       800
   macro avg       0.63      0.63      0.62       800
weighted avg       0.63      0.63      0.62       800

