In [1]:
import numpy as np
import scipy.linalg as la
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.cm as cm

import sys
import time 

In [2]:
# Opening the file in read mode
with open("assets/positive.txt", 'r', encoding='utf-8') as file:
    # Read all the lines of the file into a list
    positive_sentences = file.readlines()

with open("assets/negative.txt", 'r', encoding='utf-8') as file:
    # Read all the lines of the file into a list
    negative_sentences = file.readlines()

sentences = positive_sentences + negative_sentences


feelings = [1]*len(positive_sentences) + [-1]*len(negative_sentences)

In [3]:
# Import Text Libraries
#-----------------------------------

# Work with regular expressions
import re

# Gonna use the NLTK librarie and spaCy to pre-process texts

import nltk # Natural Language Toolkit
import spacy # Industrial-Strength Natural Language Processing

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
swords = nltk.corpus.stopwords.words('portuguese')
swords.remove("não")

# spaCy - upload processing text models to Portuguese
#     Lemmatizing

#!python -m spacy download pt_core_news_md 
#!python -m spacy download pt_core_news_lg 
nlp = spacy.load("pt_core_news_md")

# Contador de texto do SKLEARN - scikit-learn Machine Learning in Python
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def clear_sentence(sentence):

    text = re.sub(r"[^\w\s]"," ",sentence)
    text = nltk.word_tokenize(text.lower())
    text = [word for word in text if word not in swords]
    text = " ".join(text)
    doc = nlp(text)
    text = [token.lemma_ for token in doc if not token.is_punct]
    text = " ".join(text).lower()

    return text

In [5]:
clean_sentences_array = []
clean_feelings_array = []
for sentence, feel in zip(sentences, feelings):
    sentece_clean = clear_sentence(sentence)
    if sentece_clean not in clean_sentences_array:
      clean_sentences_array.append(sentence)
      clean_feelings_array.append(feel)


In [6]:
vectorizer = CountVectorizer()
vectorizer.fit(clean_sentences_array)
vocab = vectorizer.get_feature_names_out()
print("Vocabulário:", vocab)

Vocabulário: ['10' '100' '18' ... 'único' 'únicos' 'útil']


In [None]:

bag_of_words = vectorizer.fit_transform(clean_sentences_array)


df_bag_of_words = pd.DataFrame(bag_of_words.toarray(), columns=vocab)

df_bag_of_words['Frases'] = clean_sentences_array

colunas = ['Frases'] + list(vocab)
df_bag_of_words = df_bag_of_words[colunas]


df_bag_of_words['Sentimentos'] = clean_feelings_array

colunas = ['Frases'] + ['Sentimentos'] + list(vocab)
df_bag_of_words = df_bag_of_words[colunas]

df_bag_of_words.head()

In [9]:
from sklearn.svm import LinearSVC
x_train = bag_of_words #.toarray()
y_train = clean_feelings_array

model = LinearSVC(loss='hinge',C=1e18, max_iter=10000)

svmfit = model.fit(x_train,y_train)

w_sklearn = svmfit.coef_[0]

c_sklearn = - svmfit.intercept_
print(" w , c ")
print(w_sklearn, c_sklearn)

 w , c 
[ 1.98721312e+00 -2.77645498e+00  7.10623224e-01 ...  5.55111512e-17
  0.00000000e+00  7.75053976e+00] [0.49561575]




In [35]:
with open("test_assets/positive.txt", 'r', encoding='utf-8') as file:
    # Read all the lines of the file into a list
    test_positive_sentences = file.readlines()

with open("test_assets/negative.txt", 'r', encoding='utf-8') as file:
    # Read all the lines of the file into a list
    test_negative_sentences = file.readlines()

test_sentences = test_positive_sentences + test_negative_sentences
test_feelings = [+1]*len(test_positive_sentences) + [-1]*len(test_negative_sentences)

test_clean_sentences = []
for sentence in test_sentences:
    test_clean_sentences.append(clear_sentence(sentence))

vector_test = vectorizer.transform(test_clean_sentences).toarray()


(41,)
(41, 5221)
(5221, 1)


In [43]:
erros = 0
for i in range(len(test_sentences)):
  #print()
  #print(test_sentences[i])
  x = vector_test[i].reshape(len(w_sklearn),1)
  classific = model.decision_function(x.T) #np.dot(w_sklearn.T, x) - c_sklearn
  if classific > 0 :
    #print(" Classificação original :", ("Positiva" if test_feelings[i]> 0 else "Negativa") )
    #print(" Modelo classificou como: Positiva! wx-c:", np.dot(w_sklearn.T, x) - c_sklearn, model.decision_function(x.T))
    if test_feelings[i] < 0:
      erros = erros +1
  else:
    #print(" Classificação original :", ("Positiva" if test_feelings[i]> 0 else "Negativa") )
    #print(" Modelo classificou como: Negativa! w.Tx-c:", np.dot(w_sklearn.T, x) - c_sklearn, model.decision_function(x.T))
    if test_feelings[i] > 0:
      erros = erros +1

print()
print("Taxa de erro para as frases testes:", 100*float(erros) / len(test_sentences), " %")


Taxa de erro para as frases testes: 14.634146341463415  %


In [None]:
continuar = True
while continuar:
    chute = input("Teste uma frase para avaliar o sentimento: ")
    clean = clear_sentence(chute)
    format = [clean]
    vector = vectorizer.transform(format).toarray()
    print(vector.shape)
    x = vector.reshape(len(w_sklearn),1)
    classific = model.decision_function(x.T) #np.dot(w_sklearn.T, x) - c_sklearn
    if classific > 0:
        print("O modelo classificou como uma frase positiva!")
    else:
        print("O modelo classificou como uma frase negativa!")
    wanna_cont = input("Quer continuar? (s/n)")
    if wanna_cont == 'n' or wanna_cont != 's':
        continuar = False