In [58]:
import spacy
from spacy import displacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC
import string
import pickle

In [2]:
nlp = spacy.load("pt_core_news_lg")

In [3]:
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
print(len(stopwords))

326


In [4]:
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
def dataCleaning(sentence):
  doc = nlp(sentence)
  tokens = []
  for token in doc:
    if token.lemma_ != '-PRON-':
      temp = token.lemma_.lower().strip()
    else:
      temp = token.lower_
    tokens.append(temp)
  clean_tokens = []
  for token in tokens:
    if token not in punct and token not in stopwords:
      clean_tokens.append(token)
  return clean_tokens

In [6]:
comments = pd.read_excel("base.xlsx")

In [7]:
comments.head()

Unnamed: 0,Descrição,Label
0,"não consegui acessar o AVA, preciso avançar no...",0
1,Rematrícula,0
2,Não estou conseguindo entrar no curso.,0
3,Não consegui pagar o boleto,0
4,telefones de contato,0


In [8]:
comments_X = comments["Descrição"]
comments_y = comments["Label"]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(comments_X,comments_y, test_size=0.2)

In [10]:
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(124373,) (124373,)
(31094,) (31094,)


In [11]:
X_train = X_train.map(str)
X_test = X_test.map(str)

In [12]:
tfidf = TfidfVectorizer(tokenizer = dataCleaning)
svm = LinearSVC()
steps = [('tfidf',tfidf),('svm',svm)]
pipe = Pipeline(steps)

In [13]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function dataCleaning at 0x000002822E5B8E50>)),
                ('svm', LinearSVC())])

In [14]:
y_pred = pipe.predict(X_test)

In [15]:
print(classification_report(y_test,y_pred))
print("\n\n")# 1 = Comentário positivo
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30956
           1       0.72      0.37      0.49       138

    accuracy                           1.00     31094
   macro avg       0.86      0.68      0.74     31094
weighted avg       1.00      1.00      1.00     31094




[[30936    20]
 [   87    51]]


In [56]:
# 0 = Comentário negativo
# 1 = Comentário positivo
pipe.predict(["Login difícil"])

array([0], dtype=int64)

In [59]:
pickle.dump(pipe, open("classificador.pkl", 'wb'))