In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.adapt import MLkNN
from sklearn.metrics import hamming_loss, accuracy_score

In [10]:
aspects_df = pd.read_csv('semeval2014.csv')
aspects_df.head()

Unnamed: 0,text,service,food,anecdotes/miscellaneous,price,ambience
0,but the staff was so horrible to us,1,0,0,0,0
1,to be completely fair the only redeeming facto...,0,1,1,0,0
2,the food is uniformly exceptional with a very ...,0,1,0,0,0
3,where gabriela personaly greets you and recomm...,1,0,0,0,0
4,for those that go once and dont enjoy it all i...,0,0,1,0,0


In [11]:
X = aspects_df["text"]
y = np.asarray(aspects_df[aspects_df.columns[1:]])

In [12]:
vetorizar = TfidfVectorizer(max_features=3000, max_df=0.85)
vetorizar.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.85, max_features=3000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:
X_train_tfidf = vetorizar.transform(X_train)
X_test_tfidf = vetorizar.transform(X_test)

In [15]:
mlknn_classifier = MLkNN()
mlknn_classifier.fit(X_train_tfidf, y_train)

MLkNN(ignore_first_neighbours=0, k=10, s=1.0)

In [16]:
new_sentences = ["I like the food but I hate the place"]
new_sentence_tfidf = vetorizar.transform(new_sentences)

predicted_sentences = mlknn_classifier.predict(new_sentence_tfidf)
print(predicted_sentences.toarray())

[[0 1 0 0 1]]


In [17]:
predicted = mlknn_classifier.predict(X_test_tfidf)

print(accuracy_score(y_test, predicted))
print(hamming_loss(y_test, predicted))

0.472636815920398
0.16716417910447762
