In [26]:
import pandas as pd

In [27]:
df = pd.read_csv('stackoverflow_perguntas.csv')
df.head()

Unnamed: 0,Perguntas,Tags
0,Possuo um projeto Node.js porém preciso criar ...,node.js
1,"Gostaria de fazer testes unitários no Node.js,...",node.js
2,Como inverter a ordem com que o jQuery itera u...,jquery
3,Eu tenho uma página onde pretendo utilizar um ...,html
4,Como exibir os dados retornados do FireStore e...,html angular


In [28]:
len(df)

5408

In [29]:
df.Tags.unique()

array(['node.js', 'jquery', 'html', 'html angular ', 'html ', 'angular',
       'angular ', 'jquery html  ', 'jquery ', 'jquery html',
       'jquery html ', 'html angular', 'angular node.js ', 'html  ',
       'jquery html angular', 'node.js ', 'html jquery', 'html jquery ',
       'jquery angular  ', 'html node.js', 'jquery  ', 'angular node.js',
       'jquery angular', 'html node.js ', 'jquery node.js ', 'angular  ',
       'jquery angular ', 'jquery html angular ', 'node.js html ',
       ' node.js', 'node.js html', 'html angular  ', 'jquery node.js',
       'angular html', 'html angular  node.js', 'jquery html node.js',
       'html angular node.js'], dtype=object)

In [30]:
df.Tags.nunique()

37

In [31]:
labels = []

for tags in df.Tags.unique():
    for tag in tags.split():
        if not tag in labels:
            labels.append(tag)

labels

['node.js', 'jquery', 'html', 'angular']

In [32]:
for label in labels:
    df[label] = 0
    df.loc[df['Tags'].apply(lambda tags: label in tags), label] = 1

df.head()  

Unnamed: 0,Perguntas,Tags,node.js,jquery,html,angular
0,Possuo um projeto Node.js porém preciso criar ...,node.js,1,0,0,0
1,"Gostaria de fazer testes unitários no Node.js,...",node.js,1,0,0,0
2,Como inverter a ordem com que o jQuery itera u...,jquery,0,1,0,0
3,Eu tenho uma página onde pretendo utilizar um ...,html,0,0,1,0
4,Como exibir os dados retornados do FireStore e...,html angular,0,0,1,1


In [33]:
df['Labels'] = list(zip(*[df[label] for label in labels]))
df.head()

Unnamed: 0,Perguntas,Tags,node.js,jquery,html,angular,Labels
0,Possuo um projeto Node.js porém preciso criar ...,node.js,1,0,0,0,"(1, 0, 0, 0)"
1,"Gostaria de fazer testes unitários no Node.js,...",node.js,1,0,0,0,"(1, 0, 0, 0)"
2,Como inverter a ordem com que o jQuery itera u...,jquery,0,1,0,0,"(0, 1, 0, 0)"
3,Eu tenho uma página onde pretendo utilizar um ...,html,0,0,1,0,"(0, 0, 1, 0)"
4,Como exibir os dados retornados do FireStore e...,html angular,0,0,1,1,"(0, 0, 1, 1)"


In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(df.Perguntas, df.Labels, test_size=0.2, random_state=123)

Transformação dos vetores X para o formato TF-IDF

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
tf_idf = TfidfVectorizer(max_features=5000, max_df=0.85)
tf_idf.fit(df.Perguntas)

X_train = tf_idf.transform(X_train)
X_test = tf_idf.transform(X_test)

In [38]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [39]:
regressao_logistica = LogisticRegression()
one_vs_rest = OneVsRestClassifier(estimator=regressao_logistica)

In [40]:
import numpy as np

In [41]:
y_train = np.asarray(list(y_train))
y_test = np.asarray(list(y_test))   

In [42]:
one_vs_rest.fit(X_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression())

In [43]:
one_vs_rest.score(X_test, y_test)

0.4168207024029575

In [44]:
df.Labels.unique()

array([(1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 1, 1),
       (0, 0, 0, 1), (0, 1, 1, 0), (1, 0, 0, 1), (0, 1, 1, 1),
       (0, 1, 0, 1), (1, 0, 1, 0), (1, 1, 0, 0), (1, 0, 1, 1),
       (1, 1, 1, 0)], dtype=object)

In [45]:
1 / df.Labels.nunique()

0.07692307692307693

In [46]:
from sklearn.metrics import hamming_loss 

In [47]:
previsao = one_vs_rest.predict(X_test)
previsao

array([[0, 1, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       ...,
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 0]])

O Hamming Loss realiza a comparação componente a componente de cada linha prevista e verifica a porcentagem de acertos.

In [48]:
h_loss = hamming_loss(y_test, previsao)
h_loss

0.1883086876155268

In [50]:
df[[coluna for coluna in labels]].corr()

Unnamed: 0,node.js,jquery,html,angular
node.js,1.0,-0.321485,-0.273523,-0.101787
jquery,-0.321485,1.0,-0.253977,-0.366269
html,-0.273523,-0.253977,1.0,-0.286706
angular,-0.101787,-0.366269,-0.286706,1.0


O Classifier Chain é uma opção para quando existem fortes correlações entre as variáveis de X. Desta forma, as variáveis influenciarão na predição umas das outras.

In [51]:
from skmultilearn.problem_transform import ClassifierChain

In [52]:
classifier_chain = ClassifierChain(regressao_logistica)

In [53]:
classifier_chain.fit(X_train, y_train)
classifier_chain.score(X_test, y_test)

0.49815157116451014

In [55]:
previsao = classifier_chain.predict(X_test)

In [56]:
hamming_loss(y_test, previsao)

0.21095194085027727

In [57]:
from skmultilearn.problem_transform import BinaryRelevance

In [58]:
binary_relevance = BinaryRelevance(regressao_logistica)

In [59]:
binary_relevance.fit(X_train, y_train)
binary_relevance.score(X_test, y_test)

0.4168207024029575

In [61]:
previsao = binary_relevance.predict(X_test)

In [62]:
hamming_loss(y_test, previsao)

0.1883086876155268