In [1]:
from functools import reduce

import pandas as pd
import numpy as np

In [2]:
# Read the questions CSV into a DataFrame and preview five randomly drawn rows.
questions = pd.read_csv('data/stackoverflow_perguntas.csv')
questions.sample(n=5)

Unnamed: 0,Perguntas,Tags
4319,"Preciso do arquivo de áudio do messenger, para...",node.js
5090,"estou fazendo um cadastro via ajax, e estou re...",jquery
158,Quero saber o espaço entre a parte de baixo da...,jquery html
4689,Tenho a seguinte estrutura: CODE Quando cli...,jquery
1647,Possuo uma div com CODE e gostaria que o text...,jquery html


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
questions.shape

(5408, 2)

In [4]:
# Distinct raw values of the Tags column. Strings are compared verbatim, so
# entries that differ only by whitespace or tag order count as separate values.
combination_tags = questions["Tags"].unique()

print(len(combination_tags))
combination_tags

37


array(['node.js', 'jquery', 'html', 'html angular ', 'html ', 'angular',
       'angular ', 'jquery html  ', 'jquery ', 'jquery html',
       'jquery html ', 'html angular', 'angular node.js ', 'html  ',
       'jquery html angular', 'node.js ', 'html jquery', 'html jquery ',
       'jquery angular  ', 'html node.js', 'jquery  ', 'angular node.js',
       'jquery angular', 'html node.js ', 'jquery node.js ', 'angular  ',
       'jquery angular ', 'jquery html angular ', 'node.js html ',
       ' node.js', 'node.js html', 'html angular  ', 'jquery node.js',
       'angular html', 'html angular  node.js', 'jquery html node.js',
       'html angular node.js'], dtype=object)

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer


# Turn each whitespace-separated tag string into a set of tags (str.split()
# discards stray/duplicated spaces), then encode those sets as a 0/1 indicator
# matrix with one column per distinct tag.
binarizer = MultiLabelBinarizer()
target = binarizer.fit_transform(
    [set(raw_tags.split()) for raw_tags in questions["Tags"]]
)

print(binarizer.classes_)
print(target)

['angular' 'html' 'jquery' 'node.js']
[[0 0 0 1]
 [0 0 0 1]
 [0 0 1 0]
 ...
 [0 1 1 0]
 [0 1 0 0]
 [0 1 1 0]]


In [6]:
from sklearn.model_selection import train_test_split

# Single source of truth for the random_state values used in this notebook.
SEED = 42

# Hold out 20% of the questions for evaluation. Stratifying on the binarized
# label matrix is intended to keep the mix of tag combinations comparable
# between the train and test splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    questions.Perguntas,
    target,
    test_size=0.2,
    stratify=target,
    random_state=SEED,  # previously a duplicated literal 42 while SEED went unused
)

In [7]:
# Size of the training split produced by train_test_split (text column only).
X_train.shape

(4326,)

In [8]:
# Size of the held-out test split (test_size=0.2 of the questions).
X_test.shape

(1082,)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF featurizer for the question text: `max_df=0.85` and
# `max_features=5000` are the only non-default settings (per scikit-learn's
# docs these ignore very frequent terms and cap the vocabulary size).
vectorizer = TfidfVectorizer(max_df=0.85, max_features=5000)

In [10]:
# Fit the vectorizer on the training questions only, then reuse that fitted
# vectorizer to transform the test questions so both share the same columns.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [11]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Baseline: a random forest wrapped in a one-vs-rest strategy over the tag
# columns. The forest is seeded with the notebook-wide SEED so the scores in
# the following cells are reproducible across kernel restarts; an unseeded
# forest gives slightly different numbers on every run.
rf_clf = RandomForestClassifier(random_state=SEED)
or_clf = OneVsRestClassifier(rf_clf)

In [12]:
# Train the one-vs-rest model on the TF-IDF training features and label matrix.
or_clf.fit(X_train_tfidf, Y_train)

In [13]:
# Default estimator score on the held-out split. For multilabel targets
# scikit-learn documents this as subset (exact-match) accuracy, i.e. a question
# only counts as correct when every tag is predicted correctly.
or_clf.score(X_test_tfidf, Y_test)

0.41589648798521256

In [14]:
from sklearn.metrics import hamming_loss

# Hamming loss of the one-vs-rest predictions against the true test labels
# (lower is better).
predicted = or_clf.predict(X_test_tfidf)
hamming_loss(y_true=Y_test, y_pred=predicted)

0.1827634011090573

In [15]:
!pip install scikit-multilearn



In [16]:
from skmultilearn.problem_transform import ClassifierChain

# Classifier-chain variant built on a fresh random forest. The forest is
# seeded with the notebook-wide SEED so reruns reproduce the same scores; an
# unseeded forest makes this comparison vary from run to run.
rf_clf = RandomForestClassifier(random_state=SEED)
cc_clf = ClassifierChain(rf_clf)

In [17]:
# Train the classifier chain on the TF-IDF training features and label matrix.
cc_clf.fit(X_train_tfidf, Y_train)

In [18]:
# Score of the classifier chain on the held-out split.
# NOTE(review): presumably the same subset (exact-match) accuracy that
# scikit-learn's default `score` reports — confirm for skmultilearn.
cc_clf.score(X_test_tfidf, Y_test)

0.5249537892791127

In [19]:
# Hamming loss of the classifier-chain predictions against the true test
# labels (lower is better).
predicted = cc_clf.predict(X_test_tfidf)
hamming_loss(y_true=Y_test, y_pred=predicted)

0.17837338262476896

In [20]:
from skmultilearn.problem_transform import BinaryRelevance

# Binary-relevance variant built on a fresh random forest. The forest is
# seeded with the notebook-wide SEED so reruns reproduce the same scores; an
# unseeded forest makes this comparison vary from run to run.
rf_clf = RandomForestClassifier(random_state=SEED)
br_clf = BinaryRelevance(rf_clf)

In [21]:
# Train the binary-relevance model on the TF-IDF training features and labels.
br_clf.fit(X_train_tfidf, Y_train)

In [22]:
# Score of the binary-relevance model on the held-out split.
# NOTE(review): presumably the same subset (exact-match) accuracy that
# scikit-learn's default `score` reports — confirm for skmultilearn.
br_clf.score(X_test_tfidf, Y_test)

0.42606284658040666

In [23]:
# Hamming loss of the binary-relevance predictions against the true test
# labels (lower is better).
predicted = br_clf.predict(X_test_tfidf)
hamming_loss(y_true=Y_test, y_pred=predicted)

0.18322550831792975