In [1]:
import arff
import gensim
import nltk
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from warnings import filterwarnings
filterwarnings(action='ignore')

In [2]:
with open("data/corpus/OffComBR/OffComBR3.arff") as f:
    dataDictionary = arff.load(f)
    f.close()
    
globoesporte_data = pd.DataFrame(dataDictionary['data'], columns=['toxico', 'comentario'])
globoesporte_data['toxico'] = globoesporte_data['toxico'].apply(lambda x: 1 if x == 'yes' else 0)

In [3]:
globoesporte_data.head(10)

Unnamed: 0,toxico,comentario
0,1,Votaram no PEZAO Agora tomem no CZAO
1,0,cuidado com a poupanca pessoal Lembram o que a...
2,0,Sabe o que eu acho engracado os nossos governa...
3,0,Podiam retirar dos lucros dos bancos
4,0,CADE O GALVAO PRA NARRAR AGORA FALIIIIUUUUUU...
5,0,Nao e possivel que deputados tenham tantos car...
6,0,Sejam honestos aprovem o projeto original vamo...
7,0,Nao a pizza anunciada de anistia do caixa Sim...
8,0,quem ja viu bandidos fazendo leis contra antic...
9,0,Olha os caras fazendo algo de util e de intere...


In [4]:
globoesporte_data[globoesporte_data['toxico'] == 1].head(10)

Unnamed: 0,toxico,comentario
0,1,Votaram no PEZAO Agora tomem no CZAO
25,1,Voces sao idiotas ou se fazem Voces sabem que ...
31,1,Alexandre Acioni vai estudar pra saber o que e...
42,1,Mais um pobre metido a besta ja ja fica sem di...
46,1,Martin Sales deixa de ser idiot agora ninguem...
60,1,nem sou mas esse cara e um velho asqueroso
66,1,A FMI nao manda nas economia mundiais Que vao ...
83,1,PAULO VOCE E BURRO
84,1,VAO BATE PANELASSEUS BURROSBEM FEITO
148,1,ninguem se importa com vc nem sua mae seu nada


# Evaluating different classifiers with different features

In [5]:
# Defining possible parameters for every classifier
SVM = {'name': 'svm',
       'classifier': SVC(),
       'parameters': {'kernel': ['linear'],
       'C': [0.1, 1.0, 5, 10, 15],
       'gamma': [0.01, 0.1, 1.0, 5.0, 10],}}

Logistic = {'name': 'logistic_regression',
            'classifier': LogisticRegression(),
            'parameters': {'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1.0, 5.0, 10],
            'solver': ['liblinear']}}

MLP = {'name': 'multilayer_perceptron',
       'classifier': MLPClassifier(),
       'parameters': {'hidden_layer_sizes':[(100,),(100,100,),(1000,)],
                      'solver': ['lbfgs']}}

Random_Forest = {'name': 'random_forest',
                 'classifier': RandomForestClassifier(),
                 'parameters': {'n_estimators': [10, 50, 100, 300]}}

classifiers = [SVM, Logistic, MLP, Random_Forest]

## Tf - idf

In [6]:
vectorizer = TfidfVectorizer().fit(globoesporte_data['comentario'])
X = vectorizer.transform(globoesporte_data['comentario'])
y = globoesporte_data['toxico']

In [7]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=0,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

In [8]:
tf_idf_results = {}
for name, model in models.items():
    tf_idf_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring='f1'))
    
for name, result in tf_idf_results.items():
    print(name+':', result)

svm: 0.3454398708635997
logistic_regression: 0.2633626027243049
multilayer_perceptron: 0.4448896426500644
random_forest: 0.29695653366241603


# word embeddings with size = 100

# twitter cbow

In [9]:
# Generating array of tokens for every comment
comments = []
STOPWORDS = nltk.corpus.stopwords.words('portuguese')
for i, comment in globoesporte_data['comentario'].iteritems():
    tokenized_comment = nltk.word_tokenize(comment.lower())
    
    clean_comment = [token for token in tokenized_comment if
                     len(token) > 3 and
                     token not in STOPWORDS and
                     token.isalpha()]
    
    comments.append(clean_comment)

In [10]:
cbow = gensim.models.Word2Vec.load('data/word_embeddings/twitter_cbow_100_5')

In [11]:
# Averaging vectors of tokens into vectors of comments
comments_vectors = []
for comment in comments:
    comment_vector = np.zeros((100))
    n_tokens = 1
    for token in comment:
        n_tokens += 1
        try:
            comment_vector += cbow.wv.get_vector(token)
        except KeyError:
            comment_vector += np.zeros((100))
            
    comments_vectors.append(comment_vector)

In [12]:
X = comments_vectors
y = globoesporte_data['toxico']

In [13]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=0,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

In [14]:
twitter_cbow_results = {}
for name, model in models.items():
    twitter_cbow_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring='f1'))
    
for name, result in twitter_cbow_results.items():
    print(name+':', result)

svm: 0.3265405279387922
logistic_regression: 0.372000762955991
multilayer_perceptron: 0.36884622972906933
random_forest: 0.34907225754040155


# NILC Cbow

In [15]:
from gensim.models import KeyedVectors
cbow = KeyedVectors.load_word2vec_format('data/word_embeddings/cbow_s100.txt')

In [16]:
comments_vectors = []
for comment in comments:
    comment_vector = np.zeros((100))
    if not comment:
        comments_vectors.append(comment_vector)
    else:
        n_tokens = 0
        for token in comment:
            n_tokens += 1
            try:
                comment_vector += cbow.wv.get_vector(token)
            except KeyError:
                comment_vector += np.zeros((100))
            
        comments_vectors.append(comment_vector/n_tokens)

In [17]:
X = comments_vectors
y = globoesporte_data['toxico']

In [18]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=0,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

In [19]:
nilc_cbow_results = {}
for name, model in models.items():
    nilc_cbow_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring='f1'))
    
for name, result in nilc_cbow_results.items():
    print(name+':', result)

svm: 0.47301366733624794
logistic_regression: 0.48084587860068967
multilayer_perceptron: 0.401715864695998
random_forest: 0.2912405585146399


# twitter skipgram

In [20]:
skipgram = gensim.models.Word2Vec.load('data/word_embeddings/twitter_skipgram_100_5')

In [21]:
comments_vectors = []
for comment in comments:
    comment_vector = np.zeros((100))
    if not comment:
        comments_vectors.append(comment_vector)
    else:
        n_tokens = 0
        for token in comment:
            n_tokens += 1
            try:
                comment_vector += skipgram.wv.get_vector(token)
            except KeyError:
                comment_vector += np.zeros((100))
            
        comments_vectors.append(comment_vector/n_tokens)

In [22]:
X = comments_vectors
y = globoesporte_data['toxico']

In [23]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=0,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

In [24]:
twitter_sg_results = {}
for name, model in models.items():
    twitter_sg_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring='f1'))
    
for name, result in twitter_sg_results.items():
    print(name+':', result)

svm: 0.48511671480547386
logistic_regression: 0.5314671981051043
multilayer_perceptron: 0.44203692436588443
random_forest: 0.3719229078161234


# NILC skip-gram

In [25]:
skipgram = KeyedVectors.load_word2vec_format('data/word_embeddings/skip_s100.txt')

In [26]:
comments_vectors = []
for comment in comments:
    comment_vector = np.zeros((100))
    if not comment:
        comments_vectors.append(comment_vector)
    else:
        n_tokens = 0
        for token in comment:
            n_tokens += 1
            try:
                comment_vector += skipgram.wv.get_vector(token)
            except KeyError:
                comment_vector += np.zeros((100))
            
        comments_vectors.append(comment_vector/n_tokens)

In [27]:
X = comments_vectors
y = globoesporte_data['toxico']

In [28]:
# Running Grid Search to find the best model for each classifier
models = {}
for classifier in classifiers:
    gs = GridSearchCV(estimator=classifier['classifier'],
                                param_grid=classifier['parameters'],
                                scoring='f1',
                                verbose=0,
                                cv=5,
                                n_jobs=-1)

    gs.fit(X, y)
    models[classifier['name']] = gs.best_estimator_

In [29]:
nilc_sg_results = {}
for name, model in models.items():
    nilc_sg_results[name] = np.average(cross_val_score(model,
                                            X,
                                            y,
                                            cv=5,
                                            scoring='f1'))
    
for name, result in nilc_sg_results.items():
    print(name+':', result)

svm: 0.48328831030009045
logistic_regression: 0.5070986359786197
multilayer_perceptron: 0.42813713084947896
random_forest: 0.4024322427148882


In [30]:
final_results = [tf_idf_results, twitter_cbow_results,
                 twitter_sg_results, nilc_cbow_results,
                 nilc_sg_results]

In [31]:
df = {}
for result in final_results:
    for name, score in result.items():
        if not df.get(name):
            df[name] = [score]
        else:
            df[name].append(score)
            
pd.DataFrame(df, index=['tf-idf', 'twitter_cbow','twitter_skipgram','nilc_cbow', 'nilc_skipgram'])

Unnamed: 0,svm,logistic_regression,multilayer_perceptron,random_forest
tf-idf,0.34544,0.263363,0.44489,0.296957
twitter_cbow,0.326541,0.372001,0.368846,0.349072
twitter_skipgram,0.485117,0.531467,0.442037,0.371923
nilc_cbow,0.473014,0.480846,0.401716,0.291241
nilc_skipgram,0.483288,0.507099,0.428137,0.402432
