In [None]:
# Download the pt_core_news_sm spaCy model, which this notebook later loads with spacy.load('pt_core_news_sm').
# NOTE(review): `python3` here is whatever the shell resolves; confirm it is the same interpreter the kernel uses.
!python3 -m spacy download pt_core_news_sm

In [None]:
# Importando bibliotecas
import arff
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from spacy.lang.pt.stop_words import STOP_WORDS
from spacy.lang.pt import Portuguese

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
### Building the training and test datasets ###
# Index of the pre-split train/test CSV pair to load.
dataset_number = 1

# Corpus to load; must be one of the keys of DATASETS below.
dataset_name = 'Lacafe'

# One entry per supported corpus: the text column, the label column, and the
# path templates of its train/test CSV files ('{n}' is filled with
# dataset_number). Switching corpus only requires changing dataset_name.
DATASETS = {
    'Lacafe': {
        'text': 'txt',
        'label': 'has_anger',
        'train': './Datasets/Lacafe/df_dataset_train_{n}.csv',
        'test': './Datasets/Lacafe/df_dataset_test_{n}.csv',
    },
    # Fortuna (Updated)
    'FortunaUpdated': {
        'text': 'text',
        'label': 'hatespeech_comb',
        'train': './Datasets/FortunaUpdated/2019-05-28_portuguese_hate_speech_binary_classification_train_{n}.csv',
        'test': './Datasets/FortunaUpdated/2019-05-28_portuguese_hate_speech_binary_classification_test_{n}.csv',
    },
    'OffComBR': {
        'text': 'text',
        'label': 'offensive',
        'train': './Datasets/OffComBR/OffComBR2_train_{n}.csv',
        'test': './Datasets/OffComBR/OffComBR2_test_{n}.csv',
    },
    'HateBR': {
        'text': 'instagram_comments',
        'label': 'offensive_language',
        'train': './Datasets/HateBR/HateBR_train_{n}.csv',
        'test': './Datasets/HateBR/HateBR_test_{n}.csv',
    },
}

# Fail early with a readable message instead of a bare KeyError.
if dataset_name not in DATASETS:
    raise ValueError(f'Unknown dataset {dataset_name!r}; expected one of {sorted(DATASETS)}')

dataset_config = DATASETS[dataset_name]

# Column names used by every later cell to address the text and the label.
text = dataset_config['text']
label = dataset_config['label']

train_data = pd.read_csv(dataset_config['train'].format(n=dataset_number))
test_data = pd.read_csv(dataset_config['test'].format(n=dataset_number))

In [None]:
# Word tokenization
import spacy
import string

# Load the Portuguese spaCy pipeline used by the tokenizer
nlp = spacy.load('pt_core_news_sm')

# Portuguese stop words shipped with spaCy
stop_words = spacy.lang.pt.stop_words.STOP_WORDS

# Punctuation characters; note this is a single string, not a list
punctuations = string.punctuation

# Tokenization function
def spacy_tokenizer(sentence, lemmatize=False, remove_stop_words=False, remove_punctuations=False):
    """Tokenize ``sentence`` with the module-level ``nlp`` pipeline.

    Every token is lower-cased and stripped of surrounding whitespace.

    Args:
        sentence: Text to tokenize; passed directly to ``nlp``.
        lemmatize: If true, use each token's ``lemma_`` instead of its ``text``.
        remove_stop_words: If true, drop tokens found in ``stop_words``. The
            check runs on the normalized (possibly lemmatized) form.
        remove_punctuations: If true, drop empty tokens and tokens made up
            only of characters from ``punctuations``.

    Returns:
        list of str: the normalized tokens that survived the filters.
    """
    doc = nlp(sentence)

    # Normalize: lemma or surface form, lower-cased and stripped
    if lemmatize:
        tokens = [token.lemma_.lower().strip() for token in doc]
    else:
        tokens = [token.text.lower().strip() for token in doc]

    if remove_stop_words:
        tokens = [token for token in tokens if token not in stop_words]

    if remove_punctuations:
        # `token not in punctuations` would be a substring test against the
        # punctuation string, which lets multi-character tokens such as "..."
        # or "!!" through. Check character by character instead; empty tokens
        # are dropped as well.
        punctuation_chars = set(punctuations)
        tokens = [
            token for token in tokens
            if token and not all(char in punctuation_chars for char in token)
        ]

    return tokens

def text_pipeline(x):
    """Tokenize ``x`` with lemmatization and stop-word removal enabled."""
    return spacy_tokenizer(x, lemmatize=True, remove_stop_words=True)


def label_pipeline(x):
    """Convert a label value to ``int``."""
    return int(x)

In [None]:
# Tokenize every document (lemmatized, lower-cased, stop words removed),
# replacing the text column of both splits with lists of tokens.
# NOTE: this overwrites the column in place, so re-running the cell would
# tokenize the string form of the token lists.
for split_frame in (train_data, test_data):
    split_frame[text] = [text_pipeline(str(raw_text)) for raw_text in split_frame[text]]

train_data.head()

In [None]:
# Join each token list back into a single space-separated string
for split_frame in (train_data, test_data):
    split_frame[text] = [" ".join(tokens) for tokens in split_frame[text]]

train_data.head()

In [None]:
# Pull the text column (x) and the label column (y) out of each split
x_train, y_train = train_data[text], train_data[label]
x_test, y_test = test_data[text], test_data[label]

In [None]:
# Bag-of-words with raw counts over unigrams and bigrams. The vocabulary is
# learned from the training split only; terms appearing in fewer than 5
# training documents are discarded (min_df=5).
freq_vector = CountVectorizer(ngram_range=(1, 2), min_df=5)
freq_vector.fit(train_data[text])

# Replace the text series with their count matrices
x_train = freq_vector.transform(x_train)
x_test = freq_vector.transform(x_test)

In [None]:
# Dimensions of the training matrix produced by freq_vector.transform
x_train.shape

In [None]:
# Dimensions of the test matrix produced by freq_vector.transform
x_test.shape

In [None]:
# Logistic regression classifier, fitted once on the training split (no cross-validation)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=500)

# Fit the model on the vectorized training data; as the last expression of
# the cell, the fitted estimator is also displayed.
classifier.fit(x_train, y_train)

In [None]:
# Metrics on the training set

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

y_pred_train = classifier.predict(x_train)
accuracy = accuracy_score(y_train, y_pred_train)
# Macro-averaged (precision, recall, fscore, support)
metrics = precision_recall_fscore_support(y_train, y_pred_train, average='macro')

print('---- Dataset de treino ----')
# Each value is printed as a percentage with a decimal comma instead of a dot
for metric_label, metric_value in zip(('Accuracy', 'Precision', 'Recall', 'Fscore'),
                                      (accuracy, metrics[0], metrics[1], metrics[2])):
    print(f'{metric_label}: {str(metric_value * 100).replace(".", ",")}%')

In [None]:
# Metrics on the test set

y_pred = classifier.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
# Macro-averaged (precision, recall, fscore, support)
metrics = precision_recall_fscore_support(y_test, y_pred, average='macro')

print('---- Dataset de teste ----')
# Each value is printed as a percentage with a decimal comma instead of a dot
for metric_label, metric_value in zip(('Accuracy', 'Precision', 'Recall', 'Fscore'),
                                      (accuracy, metrics[0], metrics[1], metrics[2])):
    print(f'{metric_label}: {str(metric_value * 100).replace(".", ",")}%')

In [None]:
# Confusion matrix for the test set

from sklearn.metrics import confusion_matrix
# Built from the true test labels and the classifier's test predictions
cm = confusion_matrix(y_test, y_pred)

def plot_cm(conf_matrix):
    """Draw ``conf_matrix`` as an annotated heatmap using seaborn/matplotlib.

    Args:
        conf_matrix: 2-D matrix of integer counts (cells are formatted with
            the "d" format code).

    Side effects:
        Calls ``sns.set``, which changes seaborn's global plotting settings,
        and sets the title and axis labels through ``plt``.
    """
    sns.set(font_scale=1.4, color_codes=True, palette="deep")
    # Plot the matrix that was passed in, not a module-level variable
    sns.heatmap(conf_matrix, annot=True, annot_kws={"size": 16}, fmt="d", cmap="YlGnBu")
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Value")
    plt.ylabel("True Value")

# Render the test-set confusion matrix as a heatmap
plot_cm(cm)