# todo: generate bigrams to evaluate models

In [79]:
import arff
import gensim
import nltk
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score, confusion_matrix, make_scorer

from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from warnings import filterwarnings
filterwarnings(action='ignore')

In [80]:
data = pd.read_excel('data/youtube_text.xlsx')

In [86]:
data = data.dropna()

In [87]:
data

Unnamed: 0.1,Unnamed: 0,comentario,toxico
0,27609,O povo que foi as ruas devem permanecerem unid...,0
1,44934,"Concordo contigo, porém, é sempre bom lembrar ...",0
2,19854,Henry Bugalho maior lixo do YouTube. Mau cará...,1
3,7392,Essa cidadã do PSOL É A MAIOR CRETINA EM SOLO ...,1
4,3547,"Rapa no zero, deixa a barbona de viking, malha...",0
5,16917,Mamãe chupei .... por quê não se revoltam conF...,1
6,44405,TNC mbl,1
7,62222,nego reclamando que nao teve repercussao no ES...,1
8,42319,Pior que é vdd ! Minha mãe até ontem estava ac...,0
9,74672,"Mas o ""hacker"" não era o russo?",0


In [88]:
with open("data/corpus/OffComBR/OffComBR3.arff") as f:
    dataDictionary = arff.load(f)
    f.close()
    
globoesporte_data = pd.DataFrame(dataDictionary['data'], columns=['toxico', 'comentario'])
globoesporte_data['toxico'] = globoesporte_data['toxico'].apply(lambda x: 1 if x == 'yes' else 0)

In [89]:
globoesporte_data.head(10)

Unnamed: 0,toxico,comentario
0,1,Votaram no PEZAO Agora tomem no CZAO
1,0,cuidado com a poupanca pessoal Lembram o que a...
2,0,Sabe o que eu acho engracado os nossos governa...
3,0,Podiam retirar dos lucros dos bancos
4,0,CADE O GALVAO PRA NARRAR AGORA FALIIIIUUUUUU...
5,0,Nao e possivel que deputados tenham tantos car...
6,0,Sejam honestos aprovem o projeto original vamo...
7,0,Nao a pizza anunciada de anistia do caixa Sim...
8,0,quem ja viu bandidos fazendo leis contra antic...
9,0,Olha os caras fazendo algo de util e de intere...


In [90]:
globoesporte_data[globoesporte_data['toxico'] == 1].head(10)

Unnamed: 0,toxico,comentario
0,1,Votaram no PEZAO Agora tomem no CZAO
25,1,Voces sao idiotas ou se fazem Voces sabem que ...
31,1,Alexandre Acioni vai estudar pra saber o que e...
42,1,Mais um pobre metido a besta ja ja fica sem di...
46,1,Martin Sales deixa de ser idiot agora ninguem...
60,1,nem sou mas esse cara e um velho asqueroso
66,1,A FMI nao manda nas economia mundiais Que vao ...
83,1,PAULO VOCE E BURRO
84,1,VAO BATE PANELASSEUS BURROSBEM FEITO
148,1,ninguem se importa com vc nem sua mae seu nada


# Evaluating different classifiers with different features

# Bag of words (baseline approach)

In [91]:
# Defining possible parameters for every classifier
SVM = {'name': 'svm',
       'classifier': SVC(),
       'parameters': {'kernel': ['linear', 'rbf'],
       'C': [1.0, 5, 10, 15],
       'gamma': [0.1, 0.5, 1.0, 5.0, 10]}}

Logistic = {'name': 'logistic_regression',
            'classifier': LogisticRegression(),
            'parameters': {'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1.0, 5.0, 10],
            'max_iter': [100, 500, 1000],
            'solver': ['liblinear']}}

MLP = {'name': 'multilayer_perceptron',
       'classifier': MLPClassifier(),
       'parameters': {'hidden_layer_sizes':[(100,),(1000,),(100,100)],
                      'solver': ['lbfgs']}}

Random_Forest = {'name': 'random_forest',
                 'classifier': RandomForestClassifier(),
                 'parameters': {'n_estimators': [10, 50, 100, 300, 'warn']}}

classifiers = [SVM, Logistic, MLP, Random_Forest]

In [6]:
X = CountVectorizer().fit_transform(globoesporte_data['comentario'])
y = globoesporte_data['toxico']

In [7]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=True,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    9.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   48.4s finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  25 | elapsed:    3.2s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    7.1s finished


In [10]:
bow_results = {}
for name, model in models.items():
    bow_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring=make_scorer(f1_score, average='weighted')))
    
for name, result in bow_results.items():
    print(name+':', result)

svm: 0.7641084696861176
logistic_regression: 0.7824183024958901
multilayer_perceptron: 0.7624698614065534
random_forest: 0.7751969749123615


In [11]:
bow_confusion_matrix = {}
for name, model in models.items():
    y_pred = cross_val_predict(model, X, y, cv=5)
    bow_confusion_matrix[name] = confusion_matrix(y, y_pred)
    print(name+'\n',bow_confusion_matrix[name])

svm
 [[719 112]
 [128  74]]
logistic_regression
 [[775  56]
 [145  57]]
multilayer_perceptron
 [[703 128]
 [121  81]]
random_forest
 [[788  43]
 [156  46]]


## Tf - idf

In [12]:
# Defining possible parameters for every classifier
SVM = {'name': 'svm',
       'classifier': SVC(),
       'parameters': {'kernel': ['linear', 'rbf'],
       'C': [1.0, 5, 10, 15],
       'gamma': [0.1, 0.5, 1.0, 5.0, 10]}}

Logistic = {'name': 'logistic_regression',
            'classifier': LogisticRegression(),
            'parameters': {'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1.0, 5.0, 10],
            'max_iter': [100, 500, 1000],
            'solver': ['liblinear']}}

MLP = {'name': 'multilayer_perceptron',
       'classifier': MLPClassifier(),
       'parameters': {'hidden_layer_sizes':[(100,),(1000,),(100,100)],
                      'solver': ['lbfgs']}}

Random_Forest = {'name': 'random_forest',
                 'classifier': RandomForestClassifier(),
                 'parameters': {'n_estimators': [10, 50, 100, 300, 'warn']}}

classifiers = [SVM, Logistic, MLP, Random_Forest]

In [13]:
vectorizer = TfidfVectorizer().fit(globoesporte_data['comentario'])
X = vectorizer.transform(globoesporte_data['comentario'])
y = globoesporte_data['toxico']

In [14]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=0,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

In [15]:
tf_idf_results = {}
for name, model in models.items():
    tf_idf_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring=make_scorer(f1_score, average='weighted')))
    
for name, result in tf_idf_results.items():
    print(name+':', result)

svm: 0.7821448858374002
logistic_regression: 0.7703214605694468
multilayer_perceptron: 0.7706355273869947
random_forest: 0.7629019537879375


In [16]:
tf_idf_confusion_matrix = {}
for name, model in models.items():
    y_pred = cross_val_predict(model, X, y, cv=5)
    tf_idf_confusion_matrix[name] = confusion_matrix(y, y_pred)
    print(name+'\n',tf_idf_confusion_matrix[name])

svm
 [[783  48]
 [149  53]]
logistic_regression
 [[806  25]
 [167  35]]
multilayer_perceptron
 [[731 100]
 [134  68]]
random_forest
 [[780  51]
 [161  41]]


# word embeddings with size = 100

# twitter cbow

In [92]:
# Defining possible parameters for every classifier
SVM = {'name': 'svm',
       'classifier': SVC(),
       'parameters': {'kernel': ['linear', 'rbf'],
       'C': [1.0, 5, 10, 15],
       'gamma': [0.1, 0.5, 1.0, 5.0, 10]}}

Logistic = {'name': 'logistic_regression',
            'classifier': LogisticRegression(),
            'parameters': {'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1.0, 5.0, 10],
            'max_iter': [100, 500, 1000],
            'solver': ['liblinear']}}

MLP = {'name': 'multilayer_perceptron',
       'classifier': MLPClassifier(),
       'parameters': {'hidden_layer_sizes':[(100,),(1000,),(100,100)],
                      'solver': ['lbfgs']}}

Random_Forest = {'name': 'random_forest',
                 'classifier': RandomForestClassifier(),
                 'parameters': {'n_estimators': [10, 50, 100, 300, 'warn']}}

classifiers = [SVM, Logistic, MLP, Random_Forest]

In [93]:
# Generating array of tokens for every comment
comments = []
STOPWORDS = nltk.corpus.stopwords.words('portuguese')
for i, comment in globoesporte_data['comentario'].iteritems():
    tokenized_comment = nltk.word_tokenize(comment.lower())
    
    clean_comment = [token for token in tokenized_comment if
                     len(token) > 3 and
                     token not in STOPWORDS and
                     token.isalpha()]
    
    comments.append(clean_comment)

In [94]:
twitter_cbow = gensim.models.Word2Vec.load('data/word_embeddings/twitter_cbow_100_5')

In [95]:
# Averaging vectors of tokens into vectors of comments
not_found = []
found = 0
comments_vectors = []
for comment in comments:
    comment_vector = np.zeros((100))
    n_tokens = 1
    for token in comment:
        n_tokens += 1
        try:
            comment_vector += twitter_cbow.wv.get_vector(token)
            found += 1
        except KeyError:
            comment_vector += np.zeros((100))
            not_found.append(token)
            
    comments_vectors.append(comment_vector/n_tokens)

In [96]:
X = comments_vectors
y = globoesporte_data['toxico']

In [97]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=True,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   23.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    5.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   28.3s finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    8.6s finished


In [98]:
twitter_cbow_results = {}
for name, model in models.items():
    twitter_cbow_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring=make_scorer(f1_score, average='weighted')))
    
for name, result in twitter_cbow_results.items():
    print(name+':', result)   

svm: 0.8035218360089497
logistic_regression: 0.7989817911050029
multilayer_perceptron: 0.7427340815701806
random_forest: 0.7875912494377239


In [99]:
twitter_cbow_confusion_matrix = {}
for name, model in models.items():
    y_pred = cross_val_predict(model, X, y, cv=5)
    twitter_cbow_confusion_matrix[name] = confusion_matrix(y, y_pred)
    print(name+'\n',twitter_cbow_confusion_matrix[name])

svm
 [[771  60]
 [126  76]]
logistic_regression
 [[747  84]
 [114  88]]
multilayer_perceptron
 [[692 139]
 [119  83]]
random_forest
 [[809  22]
 [161  41]]


# NILC Cbow

In [25]:
# Defining possible parameters for every classifier
SVM = {'name': 'svm',
       'classifier': SVC(),
       'parameters': {'kernel': ['linear', 'rbf'],
       'C': [1.0, 5, 10, 15],
       'gamma': [0.1, 0.5, 1.0, 5.0, 10]}}

Logistic = {'name': 'logistic_regression',
            'classifier': LogisticRegression(),
            'parameters': {'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1.0, 5.0, 10],
            'max_iter': [100, 500, 1000],
            'solver': ['liblinear']}}

MLP = {'name': 'multilayer_perceptron',
       'classifier': MLPClassifier(),
       'parameters': {'hidden_layer_sizes':[(100,),(1000,),(100,100)],
                      'solver': ['lbfgs']}}

Random_Forest = {'name': 'random_forest',
                 'classifier': RandomForestClassifier(),
                 'parameters': {'n_estimators': [10, 50, 100, 300, 'warn']}}

classifiers = [SVM, Logistic, MLP, Random_Forest]

In [26]:
from gensim.models import KeyedVectors
nilc_cbow = KeyedVectors.load_word2vec_format('data/word_embeddings/cbow_s100.txt')

In [27]:
# Averaging vectors of tokens into vectors of comments
not_found = []
found = 0
comments_vectors = []
for comment in comments:
    comment_vector = np.zeros((100))
    n_tokens = 1
    for token in comment:
        n_tokens += 1
        try:
            comment_vector += nilc_cbow.wv.get_vector(token)
            found += 1
        except KeyError:
            comment_vector += np.zeros((100))
            not_found.append(token)
            
    comments_vectors.append(comment_vector/n_tokens)

In [28]:
X = comments_vectors
y = globoesporte_data['toxico']

In [29]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=0,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

In [30]:
nilc_cbow_results = {}
for name, model in models.items():
    nilc_cbow_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring=make_scorer(f1_score, average='weighted')))
    
for name, result in nilc_cbow_results.items():
    print(name+':', result)

svm: 0.8207724462750393
logistic_regression: 0.8181598602562492
multilayer_perceptron: 0.7648927828816331
random_forest: 0.7876024395669415


In [31]:
nilc_cbow_confusion_matrix = {}
for name, model in models.items():
    y_pred = cross_val_predict(model, X, y, cv=5)
    nilc_cbow_confusion_matrix[name] = confusion_matrix(y, y_pred)
    print(name+'\n',nilc_cbow_confusion_matrix[name])

svm
 [[771  60]
 [114  88]]
logistic_regression
 [[772  59]
 [117  85]]
multilayer_perceptron
 [[707 124]
 [113  89]]
random_forest
 [[818  13]
 [162  40]]


# twitter skipgram

In [100]:
# Defining possible parameters for every classifier
SVM = {'name': 'svm',
       'classifier': SVC(),
       'parameters': {'kernel': ['linear', 'rbf'],
       'C': [1.0, 5, 10, 15],
       'gamma': [0.1, 0.5, 1.0, 5.0, 10]}}

Logistic = {'name': 'logistic_regression',
            'classifier': LogisticRegression(),
            'parameters': {'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1.0, 5.0, 10],
            'max_iter': [100, 500, 1000],
            'solver': ['liblinear']}}

MLP = {'name': 'multilayer_perceptron',
       'classifier': MLPClassifier(),
       'parameters': {'hidden_layer_sizes':[(100,),(1000,),(100,100)],
                      'solver': ['lbfgs']}}

Random_Forest = {'name': 'random_forest',
                 'classifier': RandomForestClassifier(),
                 'parameters': {'n_estimators': [10, 50, 100, 300, 'warn']}}

classifiers = [SVM, Logistic, MLP, Random_Forest]

In [101]:
twitter_skipgram = gensim.models.Word2Vec.load('data/word_embeddings/twitter_skipgram_100_5')

In [102]:
comments_vectors = []
for comment in comments:
    comment_vector = np.zeros((100))
    if not comment:
        comments_vectors.append(comment_vector)
    else:
        n_tokens = 0
        for token in comment:
            n_tokens += 1
            try:
                comment_vector += twitter_skipgram.wv.get_vector(token)
            except KeyError:
                comment_vector += np.zeros((100))
            
        comments_vectors.append(comment_vector/n_tokens)

In [103]:
X = comments_vectors
y = globoesporte_data['toxico']

In [104]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=2,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   22.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    8.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   18.4s finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    9.8s finished


In [37]:
twitter_sg_results = {}
for name, model in models.items():
    twitter_sg_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring=make_scorer(f1_score, average='weighted')))
    
for name, result in twitter_sg_results.items():
    print(name+':', result)

svm: 0.82062913386935
logistic_regression: 0.8301687738470237
multilayer_perceptron: 0.7702263025770355
random_forest: 0.7691131811709627


In [38]:
twitter_sg_confusion_matrix = {}
for name, model in models.items():
    y_pred = cross_val_predict(model, X, y, cv=5)
    twitter_sg_confusion_matrix[name] = confusion_matrix(y, y_pred)
    print(name+'\n',twitter_sg_confusion_matrix[name])

svm
 [[786  45]
 [122  80]]
logistic_regression
 [[775  56]
 [109  93]]
multilayer_perceptron
 [[688 143]
 [108  94]]
random_forest
 [[794  37]
 [154  48]]


# NILC skip-gram

In [39]:
# Defining possible parameters for every classifier
SVM = {'name': 'svm',
       'classifier': SVC(),
       'parameters': {'kernel': ['linear', 'rbf'],
       'C': [1.0, 5, 10, 15],
       'gamma': [0.1, 0.5, 1.0, 5.0, 10]}}

Logistic = {'name': 'logistic_regression',
            'classifier': LogisticRegression(),
            'parameters': {'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1.0, 5.0, 10],
            'max_iter': [100, 500, 1000],
            'solver': ['liblinear']}}

MLP = {'name': 'multilayer_perceptron',
       'classifier': MLPClassifier(),
       'parameters': {'hidden_layer_sizes':[(100,),(1000,),(100,100)],
                      'solver': ['lbfgs']}}

Random_Forest = {'name': 'random_forest',
                 'classifier': RandomForestClassifier(),
                 'parameters': {'n_estimators': [10, 50, 100, 300, 'warn']}}

classifiers = [SVM, Logistic, MLP, Random_Forest]

In [40]:
nilc_skipgram = KeyedVectors.load_word2vec_format('data/word_embeddings/skip_s100.txt')
twitter_skipgram = gensim.models.Word2Vec.load('data/word_embeddings/twitter_skipgram_100_5')

In [41]:
comments_vectors = []
for comment in comments:
    comment_vector = np.zeros((100))
    if not comment:
        comments_vectors.append(comment_vector)
    else:
        n_tokens = 0
        for token in comment:
            n_tokens += 1
            try:
                comment_vector += nilc_skipgram.wv.get_vector(token)
            except KeyError:
                comment_vector += np.zeros((100))
            
        comments_vectors.append(comment_vector/n_tokens)

In [42]:
X = comments_vectors
y = globoesporte_data['toxico']

In [43]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=2,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    8.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    3.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   11.7s finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  25 | elapsed:    2.9s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    5.7s finished


In [44]:
nilc_sg_results = {}
for name, model in models.items():
    nilc_sg_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring=make_scorer(f1_score, average='weighted')))
    
for name, result in nilc_sg_results.items():
    print(name+':', result)

svm: 0.8267288446548646
logistic_regression: 0.823001397999788
multilayer_perceptron: 0.7950640926230416
random_forest: 0.8031388320447483


In [45]:
nilc_sg_confusion_matrix = {}
for name, model in models.items():
    y_pred = cross_val_predict(model, X, y, cv=5)
    nilc_sg_confusion_matrix[name] = confusion_matrix(y, y_pred)
    print(name+'\n',nilc_sg_confusion_matrix[name])

svm
 [[799  32]
 [126  76]]
logistic_regression
 [[771  60]
 [112  90]]
multilayer_perceptron
 [[723 108]
 [112  90]]
random_forest
 [[806  25]
 [144  58]]


In [46]:
final_results = [bow_results, tf_idf_results, twitter_cbow_results,
                 twitter_sg_results, nilc_cbow_results, nilc_sg_results]

final_matrices = {'bow baseline': bow_confusion_matrix, 
                  'tf idf':tf_idf_confusion_matrix,
                  'twitter cbow':twitter_cbow_confusion_matrix,
                  'twitter skipgram':twitter_sg_confusion_matrix,
                  'nilc cbow':nilc_cbow_confusion_matrix,
                  'nilc skipgram':nilc_sg_confusion_matrix}

In [47]:
df = {}
for result in final_results:
    for name, score in result.items():
        if not df.get(name):
            df[name] = [score]
        else:
            df[name].append(score)
            
pd.DataFrame(df, index=['bow baseline', 'tf idf', 'twitter cbow',
                        'twitter skipgram', 'nilc cbow', 'nilc skipgram'])

Unnamed: 0,svm,logistic_regression,multilayer_perceptron,random_forest
bow baseline,0.764108,0.782418,0.76247,0.775197
tf idf,0.782145,0.770321,0.770636,0.762902
twitter cbow,0.80694,0.813692,0.747183,0.780117
twitter skipgram,0.820629,0.830169,0.770226,0.769113
nilc cbow,0.820772,0.81816,0.764893,0.787602
nilc skipgram,0.826729,0.823001,0.795064,0.803139


In [48]:
for feature_name, feature in final_matrices.items():
    print(feature_name)
    for name, matrix in feature.items():
        print(name)
        print(pd.DataFrame(matrix, columns=['0 pred', '1 pred'], index=['0 true', '1 true']))
        print()

bow baseline
svm
        0 pred  1 pred
0 true     719     112
1 true     128      74

logistic_regression
        0 pred  1 pred
0 true     775      56
1 true     145      57

multilayer_perceptron
        0 pred  1 pred
0 true     703     128
1 true     121      81

random_forest
        0 pred  1 pred
0 true     788      43
1 true     156      46

tf idf
svm
        0 pred  1 pred
0 true     783      48
1 true     149      53

logistic_regression
        0 pred  1 pred
0 true     806      25
1 true     167      35

multilayer_perceptron
        0 pred  1 pred
0 true     731     100
1 true     134      68

random_forest
        0 pred  1 pred
0 true     780      51
1 true     161      41

twitter cbow
svm
        0 pred  1 pred
0 true     765      66
1 true     120      82

logistic_regression
        0 pred  1 pred
0 true     765      66
1 true     115      87

multilayer_perceptron
        0 pred  1 pred
0 true     691     140
1 true     125      77

random_forest
        0 pred  1