In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report

In [2]:
# NOTE(review): the wildcard import hides where names come from. `get_data`
# (and presumably `prep_one_text`, used near the end) appear to be supplied by
# src.utils — confirm, then import them explicitly.
from src.utils import *
from src.configs import labels_dict

In [3]:
# Load the dataset through the project helper, passing the label mapping from src.configs.
data = get_data(labels_dict=labels_dict)

Finish ...


In [4]:
# Inspect the loaded array; NumPy abbreviates the repr of large arrays with "...".
data

array([[ 1,  2,  1, ..., -2, -2,  0],
       [ 2,  1,  2, ..., -2, -2,  0],
       [ 2,  1,  1, ..., -2, -2,  0],
       ...,
       [ 2,  3,  2, ..., -2, -2,  1],
       [ 2,  3,  2, ...,  2,  2,  1],
       [ 2,  1,  1, ...,  3,  2,  1]])

In [5]:
# The array has 200 rows and 31 columns.
data.shape

(200, 31)

In [6]:
# Shape of the last column alone; this column becomes the target `y` in the next cell.
data[:, -1].shape

(200,)

In [7]:
# Every column except the last is a feature; the last column is the target.
X, y = data[:, :-1], data[:, -1]

In [8]:
# Hold out 20% of the rows for testing. The split is shuffled with a fixed
# seed so it is repeatable, and stratified on the target `y`.
split_options = dict(test_size=0.20, shuffle=True, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [9]:
# Pipeline: standardize the features, then classify with logistic regression.
# A LinearSVC(dual='auto') classifier was the commented-out alternative for the final step.
scaler = StandardScaler()
classifier = LogisticRegression()
model = make_pipeline(scaler, classifier)

In [10]:
# Fit the whole pipeline on the training partition only.
model.fit(X_train, y_train)

In [11]:
# Predicted labels for the held-out test partition.
y_hat = model.predict(X_test)

In [12]:
# True labels of the test partition, for eyeballing against the predictions.
y_test

array([0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [13]:
# Per-class precision, recall and F1 on the test partition.
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90        20
           1       0.90      0.90      0.90        20

    accuracy                           0.90        40
   macro avg       0.90      0.90      0.90        40
weighted avg       0.90      0.90      0.90        40



In [14]:
# 10-fold cross-validation on the training partition only, so the held-out
# test set stays untouched. Train scores are returned alongside test scores
# so the two can be compared per fold.
cv_options = dict(cv=10, scoring='accuracy', return_train_score=True)
cross_validate_ = cross_validate(estimator=model, X=X_train, y=y_train, **cv_options)

In [15]:
# One row per fold: fit/score timings plus test and train accuracy.
pd.DataFrame(cross_validate_)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.004434,0.001122,0.8125,0.979167
1,0.004347,0.001098,0.9375,0.965278
2,0.004498,0.0,0.8125,0.965278
3,0.005464,0.001086,0.875,0.958333
4,0.00641,0.0,0.9375,0.965278
5,0.004034,0.003996,0.9375,0.965278
6,0.004008,0.0,0.9375,0.965278
7,0.003993,0.0,0.875,0.986111
8,0.004017,0.0,0.9375,0.965278
9,0.003997,0.0,1.0,0.958333


In [27]:
# Label legend:
# 0: good line
# 1: bad line

In [29]:
# Spot check on a garbled line. Preprocess the text once and reuse the
# resulting features, so the probabilities and the predicted label are
# guaranteed to come from the same input.
bad_line_features = prep_one_text('== ; ps o is gar a')
print(model.predict_proba(bad_line_features))  # class probabilities, columns ordered as [0, 1]
print(model.predict(bad_line_features))        # hard label

[[0.00999029 0.99000971]]
[1]


In [30]:
# Spot check on a well-formed sentence. Preprocess the text once and reuse
# the resulting features, so the probabilities and the predicted label are
# guaranteed to come from the same input.
good_line_features = prep_one_text('bom texto não preciso dizer mais nada')
print(model.predict_proba(good_line_features))  # class probabilities, columns ordered as [0, 1]
print(model.predict(good_line_features))        # hard label

[[0.96160822 0.03839178]]
[0]
