In [None]:
import math
import os
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_recall_fscore_support)
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report

In [None]:
# Duplicate of the import in the first cell; harmless, lets this cell run on its own.
import nltk
# Fetch the NLTK 'stopwords' corpus; the vectorizer cell reads it through
# nltk.corpus.stopwords.words('portuguese').
nltk.download('stopwords')
  

In [None]:
# Colab-only cell: mounts Google Drive at /content/drive.
# NOTE(review): the data paths configured in the next cell start at "/data/",
# not at this mount point — confirm where the CSV files actually live.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder that holds the labelled CSV files.
# NOTE(review): hard-coded absolute path — confirm it matches the runtime environment.
default_dir = "/data/"

# Tweets labelled Positivo/Negativo (loaded by the next cell).
data_tcc_pos_neg = f"{default_dir}labeled_data/dataset_label_pos_neg.csv"
# Second labelled dataset; presumably Plutchik emotion labels (not read in the cells visible here).
data_neg_emotions = f"{default_dir}labeled_data/plutchik_other_emotions.csv"


In [None]:
# Load the labelled tweets, add the stratification key, then drop incomplete rows.
data = (
    pd.read_csv(data_tcc_pos_neg)
    # 'type' is the sentiment label with a 'with_theme-' prefix; create_splits stratifies on it.
    .assign(type=lambda frame: 'with_theme-' + frame['sentiment'])
    # Remove every row that has a missing value in any column.
    .dropna()
)
data.head()

Unnamed: 0,date,tweet_text,sentiment,clean_text,type
0,2021-02-27 22:22:07+00:00,Os nossos vizinhos com medidas restritivas:\n\...,Negativo,vizinhos medidas restritivas flávio dino,with_theme-Negativo
1,2021-02-27 22:21:02+00:00,@Su_eline Enquanto as pessoas não acordarem pa...,Negativo,enquanto pessoas acordar gravidade situação ac...,with_theme-Negativo
2,2021-02-27 22:04:55+00:00,2de2\nE demorou muito para adotar medidas REST...,Negativo,demorar adotar medidas restritivas severas fic...,with_theme-Negativo
3,2021-02-27 22:00:52+00:00,"@JanainaDoBrasil 🤣🤣🤣😂😅\nQue lógica bisonha!\n""...",Positivo,lógica bisonha vamos esperar todas vagas ser o...,with_theme-Positivo
4,2021-02-27 20:13:55+00:00,😷👏👏👏Setor produtivo apoia medidas restritivas ...,Negativo,setor produtivo apoiar medidas restritivas des...,with_theme-Negativo


In [None]:
# Snowball stemmer for Portuguese; `stemmed_words` applies it to every token.
stemmer = nltk.stem.snowball.PortugueseStemmer()

# Base analyzer (preprocessing + tokenization) reused by `stemmed_words`.
# The Portuguese stop-word list is configured HERE on purpose: scikit-learn only
# honours `stop_words` when analyzer == 'word', and the TfidfVectorizer built
# later in this notebook is given a callable analyzer, so a `stop_words` argument
# on that vectorizer has no effect. Filtering inside this analyzer removes the
# stop words before stemming.
analyzer = TfidfVectorizer(
    stop_words=nltk.corpus.stopwords.words('portuguese')
).build_analyzer()

In [None]:
def stemmed_words(doc):
    """Yield the Portuguese stem of every token in ``doc``, skipping @mentions.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    generator of str
        Stemmed tokens, in document order.
    """
    # Mentions must be removed BEFORE tokenizing: the default scikit-learn token
    # pattern only matches word characters, so by the time tokens come out of
    # `analyzer` the leading '@' is already gone and a per-token '@' check can
    # never fire (the user name would be kept as a feature).
    # The look-behind leaves things like e-mail addresses ("a@b") untouched.
    without_mentions = re.sub(r'(?<!\w)@\w+', ' ', doc)
    return (
        stemmer.stem(token)
        for token in analyzer(without_mentions)
        # Kept as a safety net in case `analyzer` is swapped for one that preserves '@'.
        if not token.startswith('@')
    )

In [None]:
def create_splits(data):
    """Split ``data`` into train and test partitions.

    The test partition holds ``int(0.20 * number_of_rows)`` rows, the split is
    stratified on the ``'type'`` column, and ``random_state=42`` makes it
    repeatable.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    # Absolute row count (not a fraction) so the test size is truncated, not rounded up.
    n_test_rows = int(data.shape[0] * 0.20)
    return tuple(
        train_test_split(
            data,
            stratify=data['type'],
            test_size=n_test_rows,
            random_state=42,
        )
    )

  
# Build the stratified partitions and report how many rows landed in each.
train, test = create_splits(data)
print('Training samples:  ', len(train))
print('Test samples:      ', len(test))



In [None]:
# TF-IDF features over the stemmed tokens produced by `stemmed_words`.
# NOTE(review): scikit-learn documents `stop_words` as applying only when
# analyzer == 'word'; with a callable analyzer it is presumably ignored here — confirm.
portuguese_stop_words = nltk.corpus.stopwords.words('portuguese')

vectorizer = TfidfVectorizer(
    analyzer=stemmed_words,
    stop_words=portuguese_stop_words,
    # Document-frequency bounds, given as floats (per scikit-learn: proportions of documents).
    min_df=0.0001,
    max_df=0.8,
    # Upper bound on the vocabulary size.
    max_features=100000,
)

In [None]:
# Cast the cleaned text to unicode arrays before vectorizing.
train_texts = train['clean_text'].values.astype('U')
test_texts = test['clean_text'].values.astype('U')

# Fit the vocabulary/IDF weights on the training split only, then reuse them for the test split.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [None]:
# Integer encoding of the sentiment labels used as classification targets.
labels = {'Positivo': 0, 'Negativo': 1}

# Disabled alternative encoding left by the author:
#   tristeza=0, medo=1, raiva=2, desprezo=3

# Map the 'sentiment' column of each split to its integer code
# (values absent from `labels` would become NaN).
y_train, y_test = (
    split['sentiment'].map(labels).values
    for split in (train, test)
)

In [None]:
# Train a support-vector classifier with a linear kernel on the TF-IDF features.
# `fit` returns the estimator itself, so construction and training can be chained.
classifier_linear = svm.SVC(kernel='linear').fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
classifier_linear

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# Predicted integer labels for the held-out test features.
prediction_linear = classifier_linear.predict(X=X_test)

In [None]:
# Text report comparing the true test labels with the SVM predictions.
report_text = classification_report(y_test, prediction_linear)
print(report_text)


In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
test_confusion = confusion_matrix(y_test, prediction_linear)
ax = sns.heatmap(test_confusion, annot=True, fmt='d', cmap='Greens_r')
# Assign to `_` so the list returned by `ax.set` is not echoed as cell output.
_ = ax.set(title='SVM', xlabel='Previsto', ylabel='Correto')

In [None]:
# Compute accuracy and micro-averaged precision / recall / F-score on the test split.
# scikit-learn metric functions take the ground truth first, then the predictions.
acc = accuracy_score(y_test, prediction_linear)

# precision_recall_fscore_support returns FOUR values
# (precision, recall, fscore, support); unpacking into three names raises
# "ValueError: too many values to unpack". Support is not needed here.
# NOTE(review): with average='micro' on a single-label problem these three
# scores coincide with accuracy — consider 'macro' or 'binary' if per-class
# behaviour matters.
precision, recall, fscore, _support = precision_recall_fscore_support(
    y_test, prediction_linear, average='micro')

print(f'Acurácia: {acc}')
print(f'\nPrecisao: {precision}, Recall: {recall} FScore: {fscore}')