# Анализ тональности отзывов

Сначала возьмем выборку отзывов на фильмы из NLTK:

In [19]:
import nltk
from nltk.corpus import movie_reviews
import numpy as np

# File identifiers of the negative and the positive reviews, in that order.
negids, posids = map(movie_reviews.fileids, ('neg', 'pos'))

In [2]:
# Helper for saving the answers.
def saveAnswer(data, name):
    """Write the string ``data`` to the file ``name``.

    The file is opened in 'w' mode, so any previous contents are replaced.
    """
    with open(name, 'w') as out:
        out.write(data)

Приготовим список текстов и классов как обучающую выборку:

In [3]:
# Every review becomes a single string: its corpus tokens joined by spaces.
negfeats = [" ".join(movie_reviews.words(fileids=[fid])) for fid in negids]
posfeats = [" ".join(movie_reviews.words(fileids=[fid])) for fid in posids]

# Training set: negative reviews first (label 0), then positive ones (label 1).
texts = negfeats + posfeats
labels = [0 for _ in negfeats] + [1 for _ in posfeats]

In [4]:
# Report the size of each class and of the whole sample.
print(len(negfeats))
print(len(posfeats))
print(len(texts))

saveAnswer(str(len(texts)), 'finalAnswer1.txt')

# Share of positive reviews; float() forces true division under Python 2.
rez = float(len(posfeats)) / len(texts)
saveAnswer(str(rez), 'finalAnswer2.txt')
print(rez)

1000
1000
2000
0.5


Импортируем нужные нам модули

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline



### Оценка качества работы разных классификаторов

In [6]:
# Bag-of-words vectorizer with default settings, fitted on the whole corpus;
# `matrix` holds the resulting document-term counts.
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(raw_documents=texts)

In [7]:
# The second dimension of the matrix is the number of extracted features.
print(matrix.shape)
n_features = matrix.shape[1]
saveAnswer(str(n_features), 'finalAnswer3.txt')

(2000, 39659)


In [8]:
# Chain the vectorizer with a logistic-regression classifier.
pipe = Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", LogisticRegression()),
])

# Mean accuracy over the folds of the default cross-validation split.
fold_accuracy = cross_val_score(pipe, texts, labels, scoring='accuracy')
score = fold_accuracy.mean()
print(score)
saveAnswer(str(score), 'finalAnswer4.txt')

0.836021650393


In [9]:
# Same pipeline, now scored by the mean ROC AUC over the folds.
fold_auc = cross_val_score(pipe, texts, labels, scoring='roc_auc')
score = fold_auc.mean()
print(score)
saveAnswer(str(score), 'finalAnswer5.txt')

0.910776493783


In [10]:
# Fit the whole pipeline on the full data set so that the learned
# coefficients can be inspected afterwards.  Plain ``fit`` is used instead of
# ``fit_transform``: the latter additionally calls ``transform`` on the final
# classifier, which only worked through a deprecated feature-selection
# method of LogisticRegression and produced a matrix that was never used.
# ``Pipeline.fit`` returns the pipeline itself, so ``fit`` now refers to the
# fitted pipeline.
fit = pipe.fit(texts, labels)



In [11]:
# Pull the individual steps back out of the pipeline by their names.
steps_by_name = pipe.named_steps
clf = steps_by_name['classifier']
vct = steps_by_name['vectorizer']

In [12]:
# First (and, for two classes, only) row of the fitted coefficient matrix.
coef = clf.coef_[0]
# Feature indices ordered by ascending weight (most negative first).
coefSort = sorted(range(len(coef)), key=coef.__getitem__)

In [13]:
# The five smallest weights and the vocabulary entries they belong to.
lowest = coefSort[:5]
print([coef[idx] for idx in lowest])
features = vct.get_feature_names()
attrs = [features[idx] for idx in lowest]
print(attrs)

# The answer is the two words with the smallest weights.
saveAnswer(' '.join(attrs[:2]), 'finalAnswer6.txt')

[-0.78217635600981117, -0.63661880890828315, -0.59290172592717927, -0.50817891600105913, -0.50398874258300197]
[u'bad', u'unfortunately', u'worst', u'waste', u'nothing']


# Week 2

In [14]:
def text_classifier(vectorizer, classifier):
    """Build a two-step pipeline: ``vectorizer`` followed by ``classifier``.

    The steps are registered under the names "vectorizer" and "classifier".
    """
    steps = [
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

In [15]:
# Проведем оценку среднего качества и стандартное отклонение с векторизаторами CountVectorizer и TfidfVectorizer
rez = np.array([])
for vect in [CountVectorizer, TfidfVectorizer]:
    scores = cross_val_score(text_classifier(vect(), LogisticRegression()), texts, labels, cv=5)
    print vect,scores.mean(),' ', scores.std(),'\n'
    rez = np.append(rez, scores.mean())
    rez = np.append(rez, scores.std())

attrs = [str(val) for val in rez]
saveAnswer(' '.join(attrs), '2-finalAnswer1.txt')

<class 'sklearn.feature_extraction.text.CountVectorizer'> 0.841   0.0167779617356 

<class 'sklearn.feature_extraction.text.TfidfVectorizer'> 0.821   0.00406201920232 



In [16]:
# Проведем оценку среднего качества и стандартное отклонение у CountVectorizer с разными min_df
rez = np.array([])
for min_df in [10, 50]:
    scores = cross_val_score(text_classifier(CountVectorizer(min_df=min_df), LogisticRegression()), texts, labels, cv=5)
    print 'min_df=',min_df,' ',scores.mean(),' ', scores.std(),'\n'
    rez = np.append(rez, scores.mean())

attrs = [str(val) for val in rez]
saveAnswer(' '.join(attrs), '2-finalAnswer2.txt')

min_df= 10   0.839   0.0118953772534 

min_df= 50   0.813   0.0134536240471 



In [17]:
# Проведем оценку разных классификаторов с CountVectorizer
minScore = -1
for clf in [LogisticRegression, LinearSVC, SGDClassifier]:
    score = cross_val_score(text_classifier(CountVectorizer(), clf()), texts, labels).mean()
    print clf,score
    if minScore == -1 or minScore > score:
        minScore = score
        
print 'minScore', minScore        
saveAnswer(str(minScore), '2-finalAnswer3.txt')

<class 'sklearn.linear_model.logistic.LogisticRegression'> 0.836021650393
<class 'sklearn.svm.classes.LinearSVC'> 0.827517637398
<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> 0.765464566363
minScore 0.765464566363


In [20]:
# Проверим качество векторизатора с разными наборами стоп слов
rez = np.array([])
for stopW in [nltk.corpus.stopwords.words('english'), 'english']:
    scores = cross_val_score(text_classifier(CountVectorizer(stop_words=stopW), LogisticRegression()), texts, labels, cv=5)
    print scores.mean(),' ', scores.std(),'\n'
    rez = np.append(rez, scores.mean())

attrs = [str(val) for val in rez]
saveAnswer(' '.join(attrs), '2-finalAnswer4.txt')

0.8415   0.0104403065089 

0.839   0.00982344135219 



In [21]:
# Проверим качество векторизатора с разными биграммами
scoresWord = cross_val_score(text_classifier(CountVectorizer(ngram_range=(1, 2)), LogisticRegression()), texts, labels, cv=5)
print scoresWord.mean(),' ', scoresWord.std(),'\n'
scoresChar = cross_val_score(text_classifier(CountVectorizer(ngram_range=(3, 5), analyzer='char_wb'), LogisticRegression()), texts, labels, cv=5)
print scoresChar.mean(),' ', scoresChar.std(),'\n'

attrs = [str(val) for val in [scoresWord.mean(), scoresChar.mean()]]
saveAnswer(' '.join(attrs), '2-finalAnswer5.txt')

0.8525   0.0165075740192 

0.82   0.0106066017178 

