# Анализ тональности отзывов

Сначала возьмем выборку отзывов на фильмы из NLTK:

In [1]:
from nltk.corpus import movie_reviews
import numpy as np

# File identifiers of the corpus reviews, one list per sentiment category.
negids = movie_reviews.fileids(categories='neg')
posids = movie_reviews.fileids(categories='pos')

In [2]:
def saveAnswer(data, name):
    """Save an answer: write the string `data` to the file at path `name`.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(name, 'w') as out:
        out.write(data)

Приготовим список текстов и классов как обучающую выборку:

In [3]:
def _review_text(fileid):
    """Return the tokens of one review joined into a single space-separated string."""
    return " ".join(movie_reviews.words(fileids=[fileid]))


negfeats = [_review_text(fileid) for fileid in negids]
posfeats = [_review_text(fileid) for fileid in posids]

# Negative reviews come first, so the labels follow the same order:
# 0 for every negative text, then 1 for every positive text.
texts = negfeats + posfeats
labels = len(negfeats) * [0] + len(posfeats) * [1]

In [4]:
# Sizes of the negative class, the positive class and the whole data set.
for sample in (negfeats, posfeats, texts):
    print(len(sample))

saveAnswer(str(len(texts)), 'finalAnswer1.txt')

# Share of positive reviews in the data set; float() forces true division.
rez = float(len(posfeats)) / len(texts)
saveAnswer(str(rez), 'finalAnswer2.txt')
print(rez)

1000
1000
2000
0.5


Импортируем нужные нам модули:

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline



### Оценка качества работы разных классификаторов

In [6]:
# Learn the vocabulary from all reviews and build the document-term count
# matrix in a single pass (one row per text, one column per distinct token).
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(texts)

In [7]:
# matrix.shape is (number of texts, number of features); the answer is the
# number of features, i.e. the second component.
print(matrix.shape)
vocab_size = matrix.shape[1]
saveAnswer(str(vocab_size), 'finalAnswer3.txt')

(2000, 39659)


In [9]:
# Two-step model: token counts followed by logistic regression with default
# settings.
pipe = Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", LogisticRegression()),
])

# Accuracy averaged over the folds of the default cross-validation split.
score = np.mean(cross_val_score(pipe, texts, labels, scoring='accuracy'))
print(score)
saveAnswer(str(score), 'finalAnswer4.txt')

0.836021650393


In [10]:
# ROC AUC averaged over the folds of the default cross-validation split.
score = np.mean(cross_val_score(pipe, texts, labels, scoring='roc_auc'))
print(score)
saveAnswer(str(score), 'finalAnswer5.txt')

0.910776493783


In [11]:
# Fit the whole pipeline on all reviews so that the classifier's learned
# weights can be inspected afterwards.
# `fit` is used instead of `fit_transform` because the final step is a
# classifier, not a transformer: `LogisticRegression.transform` was deprecated
# in scikit-learn 0.17 and later removed, which makes `Pipeline.fit_transform`
# fail for this pipeline. `Pipeline.fit` returns the fitted pipeline itself,
# so the name `fit` stays bound but now refers to the same object as `pipe`.
fit = pipe.fit(texts, labels)



In [12]:
# Pull the individual steps back out of the pipeline by their step names.
fitted_steps = pipe.named_steps
clf = fitted_steps['classifier']
vct = fitted_steps['vectorizer']

In [13]:
# Weight vector of the classifier (first row of `coef_`).
coef = clf.coef_[0]
# Feature indices ordered by ascending weight, most negative first.
# sorted() is stable, so equal weights keep their original index order.
coefSort = [
    index
    for index, _weight in sorted(enumerate(coef), key=lambda pair: pair[1])
]

In [14]:
# The five features with the lowest (most negative) weights.
lowest = coefSort[:5]
print([coef[idx] for idx in lowest])
features = vct.get_feature_names()
attrs = [features[idx] for idx in lowest]
print(attrs)

# The answer is the two most negative words, separated by a space.
saveAnswer(' '.join(attrs[:2]), 'finalAnswer6.txt')

[-0.78217635600981117, -0.63661880890828315, -0.59290172592717927, -0.50817891600105913, -0.50398874258300197]
[u'bad', u'unfortunately', u'worst', u'waste', u'nothing']
