## Definitions

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

## Exploration

In [16]:
dataset = pd.read_csv('dataset.csv')

In [17]:
dataset.shape

(521, 2)

In [18]:
dataset[dataset['sentence'].isna()]

Unnamed: 0,sentence,category


In [19]:
dataset[dataset['category'].isna()]

Unnamed: 0,sentence,category


In [20]:
dataset.category.value_counts()

category
orgão público               140
educação                    107
indústrias                   89
varejo                       85
finanças                     54
finanças,varejo              13
educação,orgão público        9
indústrias,varejo             7
educação,indústrias           5
finanças,orgão público        4
finanças,indústrias           3
indústrias,orgão público      2
educação,finanças             2
varejo,indústrias             1
Name: count, dtype: int64

## using the category value as dummies

In [21]:
dataset = pd.concat([dataset['sentence'], dataset['category'].str.get_dummies(sep=',')], axis=1) 

## split

In [25]:
generator1 = torch.Generator().manual_seed(0)
train_set, val_set, test_set = torch.utils.data.random_split(dataset, [0.6, 0.2, 0.2], generator=generator1)

In [26]:
train_set = train_set.dataset.iloc[train_set.indices]
val_set = val_set.dataset.iloc[val_set.indices]
test_set = test_set.dataset.iloc[test_set.indices]

In [27]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [28]:
X_train = train_set['sentence']
X_val = val_set['sentence']
X_test = test_set['sentence']
y_train = train_set.drop(['sentence'],axis=1)
y_val = val_set.drop(['sentence'],axis=1)
y_test = test_set.drop(['sentence'],axis=1)

Ajustes no X e y

## vectorization

In [29]:
vect = CountVectorizer()

X_train_transf = vect.fit_transform(X_train)
X_val_transf = vect.transform(X_val)
X_test_transf = vect.transform(X_test)

In [263]:
classes = ['educação','finanças','indústrias','orgão','público','varejo']

In [264]:
X_val

334    Medidas de redução de perdas de água em redes ...
358                                 Cesta Páscoa de Amor
15     Como saber o limite do cartão de crédito e o v...
291    Mensalidade em 2021 apenas 15 x 480,00 também ...
62                          Educação Especial 120 Horas.
                             ...                        
244                                   Instrutor de curso
389    Todo correntista do ITAÚ já possui uma conta p...
443    Serão aceitos no máximo três trabalhos com o m...
422                                   Dicas do Professor
79     Egressos de Fisioterapia da UNESC assumem como...
Name: sentence, Length: 104, dtype: object

In [265]:
X_train_transf

<313x1432 sparse matrix of type '<class 'numpy.int64'>'
	with 2947 stored elements in Compressed Sparse Row format>

## Model testing

### In cases where the models classify the text as not belonging to any class, choose the class with the highest probability

In [35]:
def consider_at_least_one_class(prediction, y_predicted):

    prediction2 = prediction.copy()
    y_predicted_np = np.array(y_predicted) 
    y_predicted_np_moved = np.moveaxis(y_predicted_np, 0, -1)
    y_predicted_np_moved_pos = y_predicted_np_moved[:,0,:]
    argmax = np.argmax(y_predicted_np_moved_pos, axis=1)

    for i in range(0, len(prediction2)):
        #if we never came up with a prediction, use our "forced" single prediction
        if (all(proba == 0 for proba in prediction2[i])):
            prediction2[i, argmax[i]] = 1

    return prediction2

### nayve bayes

In [895]:
classifier = MultiOutputClassifier(MultinomialNB(alpha=0.8))

classifier.fit(X_train_transf, y_train)
y_predicted = classifier.predict_proba(X_val_transf)
prediction = classifier.predict(X_val_transf)
prediction[1:10]

array([[0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int64)

In [896]:
print('Accuracy Score: ', accuracy_score(y_val, prediction))

Accuracy Score:  0.5673076923076923


In [897]:
prediction_at_least_one = consider_at_least_one_class(prediction, y_predicted)

In [898]:
print('Accuracy Score: ', accuracy_score(y_val, prediction_at_least_one))

Accuracy Score:  0.5769230769230769


## logistic regression

In [320]:
from sklearn.linear_model import LogisticRegression

In [413]:
classifier = MultiOutputClassifier(LogisticRegression(C=100))

classifier.fit(X_train_transf, y_train)
y_predicted = classifier.predict_proba(X_val_transf)
prediction = classifier.predict(X_val_transf)

In [414]:
print('Accuracy Score: ', accuracy_score(y_val, prediction))

Accuracy Score:  0.4423076923076923


In [415]:
prediction_at_least_one = consider_at_least_one_class(prediction, y_predicted)

In [416]:
print('Accuracy Score: ', accuracy_score(y_val, prediction_at_least_one))

Accuracy Score:  0.4807692307692308


## multi layer perceptron NN

In [940]:
from sklearn.neural_network import MLPClassifier
classifier = MultiOutputClassifier(MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5,5,2), random_state=1))

In [947]:
classifier.fit(X_train_transf, y_train)
y_predicted = classifier.predict_proba(X_val_transf)
prediction = classifier.predict(X_val_transf)

In [948]:
print('Accuracy Score: ', accuracy_score(y_val, prediction))

Accuracy Score:  0.49038461538461536


In [949]:
print('Precision Score: ',precision_score(y_val, prediction, average='weighted'))

Precision Score:  0.7711163060541942


In [950]:
prediction_at_least_one = consider_at_least_one_class(prediction, y_predicted)

In [951]:
print('Accuracy Score: ', accuracy_score(y_val, prediction_at_least_one))

Accuracy Score:  0.5865384615384616


In [952]:
print('Precision Score: ',precision_score(y_val, prediction, average='weighted'))

Precision Score:  0.7711163060541942


## SVM

In [532]:
from sklearn.svm import SVC

In [675]:
classifier = MultiOutputClassifier(SVC(kernel='linear',probability=True, C=2))

In [676]:
classifier.fit(X_train_transf, y_train)
y_predicted = classifier.predict_proba(X_val_transf)
prediction = classifier.predict(X_val_transf)

In [677]:
print('Accuracy Score: ', accuracy_score(y_val, prediction))

Accuracy Score:  0.5288461538461539


In [678]:
prediction_at_least_one = consider_at_least_one_class(prediction, y_predicted)

In [679]:
print('Accuracy Score: ', accuracy_score(y_val, prediction_at_least_one))

Accuracy Score:  0.5576923076923077


In [688]:
from sklearn.neighbors import KNeighborsClassifier

In [720]:
classifier = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto', p=1))

In [721]:
classifier.fit(X_train_transf, y_train)
y_predicted = classifier.predict_proba(X_val_transf)
prediction = classifier.predict(X_val_transf)

In [722]:
print('Accuracy Score: ', accuracy_score(y_val, prediction))

Accuracy Score:  0.1346153846153846


In [723]:
prediction_at_least_one = consider_at_least_one_class(prediction, y_predicted)

In [724]:
print('Accuracy Score: ', accuracy_score(y_val, prediction_at_least_one))

Accuracy Score:  0.15384615384615385


## Random Forest

In [725]:
from sklearn.ensemble import RandomForestClassifier

In [753]:
classifier = MultiOutputClassifier(RandomForestClassifier(n_estimators=200))

In [754]:
classifier.fit(X_train_transf, y_train)
y_predicted = classifier.predict_proba(X_val_transf)
prediction = classifier.predict(X_val_transf)

In [755]:
print('Accuracy Score: ', accuracy_score(y_val, prediction))

Accuracy Score:  0.23076923076923078


In [756]:
prediction_at_least_one = consider_at_least_one_class(prediction, y_predicted)

In [757]:
print('Accuracy Score: ', accuracy_score(y_val, prediction_at_least_one))

Accuracy Score:  0.25


## GradientBoostingClassifier

In [758]:
from sklearn.ensemble import GradientBoostingClassifier

In [814]:
classifier = MultiOutputClassifier(GradientBoostingClassifier(loss='log_loss', learning_rate=0.5))

In [815]:
classifier.fit(X_train_transf, y_train)
y_predicted = classifier.predict_proba(X_val_transf)
prediction = classifier.predict(X_val_transf)

In [816]:
print('Accuracy Score: ', accuracy_score(y_val, prediction))

Accuracy Score:  0.4230769230769231


In [817]:
prediction_at_least_one = consider_at_least_one_class(prediction, y_predicted)

In [818]:
print('Accuracy Score: ', accuracy_score(y_val, prediction_at_least_one))

Accuracy Score:  0.46153846153846156


## XGboost

In [914]:
from xgboost.sklearn import XGBClassifier

In [915]:
classifier = MultiOutputClassifier(XGBClassifier(n_estimators = 650,
                    max_depth = 10, 
                    learning_rate = 0.01,
                    random_state=1))

In [916]:
classifier.fit(X_train_transf, y_train)
y_predicted = classifier.predict_proba(X_val_transf)
prediction = classifier.predict(X_val_transf)

In [917]:
print('Accuracy Score: ', accuracy_score(y_val, prediction))

Accuracy Score:  0.25


In [918]:
prediction_at_least_one = consider_at_least_one_class(prediction, y_predicted)

In [919]:
print('Accuracy Score: ', accuracy_score(y_val, prediction_at_least_one))

Accuracy Score:  0.3076923076923077


## the NN was the mdoel with best accuracy

In [985]:
from sklearn.neural_network import MLPClassifier
final_classifier = MultiOutputClassifier(MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5,5,2), random_state=1))

In [986]:
final_classifier.fit(X_train_transf, y_train)

y_predicted_val = final_classifier.predict_proba(X_val_transf)
y_predicted_test = final_classifier.predict_proba(X_test_transf)

prediction_val = final_classifier.predict(X_val_transf)
prediction_test = final_classifier.predict(X_test_transf)

In [987]:
prediction_at_least_one_val = consider_at_least_one_class(prediction_val, y_predicted_val)
prediction_at_least_one_test = consider_at_least_one_class(prediction_test, y_predicted_val)

In [988]:
print('Accuracy Score val: ', accuracy_score(y_val, prediction_at_least_one_val))

Accuracy Score val:  0.5865384615384616


In [989]:
print('Accuracy Score test: ', accuracy_score(y_test, prediction_at_least_one_test))

Accuracy Score test:  0.5576923076923077


In [990]:
print('Precision Score val: ',precision_score(y_val, prediction_at_least_one_val, average='weighted'))

Precision Score val:  0.6874989267557519


In [991]:
print('Precision Score test: ',precision_score(y_test, prediction_at_least_one_test, average='weighted'))

Precision Score test:  0.7245674670547337


In [992]:
print('Recall Score val: ',recall_score(y_val, prediction_at_least_one_val, average='weighted'))

Recall Score val:  0.6875


In [993]:
print('Recall Score test: ',recall_score(y_test, prediction_at_least_one_test, average='weighted'))

Recall Score test:  0.6228070175438597


## Considering that the model classifies 5 different classes simultaniously, an accuracy of 55.7% to hit all the classes presence right can be considered a reasonably good percentage. It could have been interesting to use other metrics besides accuracy, such as precision and recall (especially considering that the classes are unbalanced)

In [979]:
from sklearn.metrics import multilabel_confusion_matrix

In [994]:
multilabel_confusion_matrix(y_test, prediction_at_least_one_test)

array([[[60, 23],
        [ 4, 17]],

       [[87,  1],
        [ 9,  7]],

       [[77,  3],
        [13, 11]],

       [[63,  5],
        [10, 26]],

       [[81,  6],
        [ 7, 10]]], dtype=int64)

## We could also analyse each classs separatly

## Training with all data 

In [43]:
from sklearn.neural_network import MLPClassifier
final_classifier = MultiOutputClassifier(MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5,5,2), random_state=1))

In [44]:
X_dataset = dataset['sentence']
y_dataset = dataset.drop(['sentence'],axis=1)
X_dataset_transf = vect.transform(X_dataset)

In [45]:
final_classifier.fit(X_dataset_transf, y_dataset)

y_predicted = final_classifier.predict_proba(X_dataset_transf)

prediction = final_classifier.predict(X_dataset_transf)

In [46]:
prediction_at_least_one = consider_at_least_one_class(prediction, y_predicted)

In [47]:
print('Accuracy Score: ', accuracy_score(y_dataset, prediction_at_least_one))

Accuracy Score:  0.9424184261036468


In [48]:
print('Precision Score: ',precision_score(y_dataset, prediction_at_least_one, average='weighted'))

Precision Score:  0.949723197832892


In [49]:
print('Recall Score: ',recall_score(y_dataset, prediction_at_least_one, average='weighted'))

Recall Score:  0.9964726631393298


In [53]:
import pickle

with open('model_NN.pkl','wb') as f:
    pickle.dump((final_classifier, vect),f)

In [54]:
loaded_model, vect = pickle.load(open('model_NN.pkl', 'rb'))
result = loaded_model.score(X_dataset_transf, y_dataset)
print(result)

0.9424184261036468


In [55]:
X_dataset_transf

<521x1432 sparse matrix of type '<class 'numpy.int64'>'
	with 4164 stored elements in Compressed Sparse Row format>

In [56]:
dataset

Unnamed: 0,sentence,educação,finanças,indústrias,orgão público,varejo
0,"Auxílio-Doença Previdenciário, Benefícios em E...",0,0,0,1,0
1,"PAGAR TODAS AS CONTAS EM ATRASO R$1.290,90.",0,1,0,0,0
2,Então encontraremos na próxima aula.,1,0,0,0,0
3,Veja os resultados da categoria de ofertas do ...,0,0,1,0,0
4,"Além disso, a embalagem é reutilizável e 100% ...",0,0,1,0,1
...,...,...,...,...,...,...
516,"Selecione o local de estudo, curso sem encontr...",1,0,0,0,0
517,ESTUDANTES DA REDE MUNICIPAL VOLTAM ÀS AULAS E...,1,0,0,1,0
518,Empresas e órgãos públicos,0,0,0,1,0
519,DGE – Departamento de Gestão Estratégica Metas...,0,0,0,1,0
