# Соревнование по сентимент-анализу

In [1]:
# Загружаем нужные модули
import pandas as pd
import numpy as np
import os
import csv
import re
from copy import deepcopy

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
### В середине текста (на 305 строке) исходного файла "products_sentiment_test.tsv" содержится символ EOF. Удаляем этот символ из файла и создаем новый файл: "products_sentiment_test (new).tsv"

def delete_eof(fin, fout):
    BUFSIZE = 2**15
    EOFCHAR = chr(26)
    data = fin.read(BUFSIZE)
    while data:
        fout.write(data.translate(None, EOFCHAR.encode('utf-8')))
        data = fin.read(BUFSIZE)
        
ipath = "products_sentiment_test.tsv"
opath = "products_sentiment_test (new).tsv"
with open(ipath, "rb") as fin, open(opath, "wb") as fout:
    delete_eof(fin, fout)

In [3]:
X_train_main = []
y_train_main = []
X_test_main = []

with open("products_sentiment_train.tsv") as file_train:
    reader = csv.reader(file_train, delimiter='\t')
    for row in reader:
        X_train_main.append(row[0])
        y_train_main.append(int(row[1]))

with open("products_sentiment_test.tsv") as file_test:
    reader = csv.reader(file_test, delimiter='\t')
    for row in reader:
        X_test_main.append(re.sub(r"\s+", " ", row[1])) # замена любого количества любых пробелов на один обычный
    X_test_main = X_test_main[1:] # отрежем первую строку с вшитым заголовком

In [4]:
X_train_main[:5]

['2 . take around 10,000 640x480 pictures .',
 'i downloaded a trial version of computer associates ez firewall and antivirus and fell in love with a computer security system all over again .',
 'the wrt54g plus the hga7t is a perfect solution if you need wireless coverage in a wider area or for a hard-walled house as was my case .',
 'i dont especially like how music files are unstructured ; basically they are just dumped into one folder with no organization , like you might have in windows explorer folders and subfolders .',
 'i was using the cheapie pail ... and it worked ok until the opening device fell apart .']

In [5]:
y_train_main[:5]

[1, 1, 1, 0, 1]

In [6]:
print("Отзывов с меткой 1: ", y_train_main.count(1))
print("Отзывов с меткой 0: ", y_train_main.count(0))
print(np.mean(y_train_main))

Отзывов с меткой 1:  1274
Отзывов с меткой 0:  726
0.637


In [7]:
# Создадим pipeline для векторизации, трансформации и классификации:
def pipe(vectorizer, transformer, classifier):
    return Pipeline(
            [('vectorizer', vectorizer),
            ('transformer', transformer),
            ('classifier', classifier)]
        )

Попробуем бейзлайн из прошлой недели:

In [8]:
base_model = pipe(
    vectorizer=CountVectorizer(),
    transformer=TfidfTransformer(),
    classifier=LogisticRegression()
)

In [9]:
print(cross_val_score(base_model, X_train_main, y_train_main, cv=5).mean())

0.7665031843949025


Результат не очень. Попробуем стемминг и лемматизацию

In [18]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [19]:
stemmed_model = pipe(
    vectorizer=StemmedCountVectorizer(),
    transformer=TfidfTransformer(),
    classifier=LogisticRegression()
)

In [20]:
print(cross_val_score(stemmed_model, X_train_main, y_train_main, cv=5).mean())

0.7734994437465234


In [21]:
stem_lemma_model = pipe(
    vectorizer=StemmedCountVectorizer(tokenizer=LemmaTokenizer()),
    transformer=TfidfTransformer(),
    classifier=LogisticRegression()
)

In [22]:
print(cross_val_score(stem_lemma_model, X_train_main, y_train_main, cv=5).mean())

0.7794869624185152


Добавим в корпус внешнюю базу отзывов Illinois, скачанную заранее

http://help.sentiment140.com/for-students

In [26]:
illi = pd.read_csv('illi.csv', encoding='latin1', header=None)

In [61]:
ext_pros = illi.loc[illi[0] == 4][5]
ext_cons = illi.loc[illi[0] == 0][5]

In [23]:
ext_pros = pd.read_table("Illinois/IntegratedPros.txt", index_col=None, header=None, encoding='latin-1') # положительные отзывы
ext_cons = pd.read_table("Illinois/IntegratedCons.txt", index_col=None, header=None, encoding='latin-1') # отрицательные отзывы

FileNotFoundError: File b'Illinois/IntegratedPros.txt' does not exist

In [57]:
# Вырежем отзывы из тегов и избавимся от пробелов:
X_ext_pros = []
for x in ext_pros[0]:
    X_ext_pros.append((x.split("<Pros>"))[1].split("</Pros>")[0])

X_ext_cons = []
for x in ext_cons[0]:
    X_ext_cons.append((x.split("<Cons>"))[1].split("</Cons>")[0])

KeyError: 0

Создадим объединённую базу

In [62]:
X_train = []
X_test = []
y_train = []

X_train.extend(X_train_main)
X_train.extend(ext_pros)
X_train.extend(ext_cons)

y_train.extend(y_train_main)
y_train.extend([1]*len(ext_pros))
y_train.extend([0]*len(ext_cons))

X_test.extend(X_test_main)

print("Размеры списков:")
print("ext_pros, ext_cons:", len(ext_pros), len(ext_cons))
print("X_train, y_train:", len(X_train), len(y_train))
print("X_test:", len(X_test))


Размеры списков:
ext_pros, ext_cons: 800000 800000
X_train, y_train: 1602000 1602000
X_test: 500


Сделаем поиск по сетке для подбора оптимальных параметров

In [63]:
parameters = {
    'vectorizer__min_df': (1, 5, 0.7),
    'vectorizer__max_df': (1.0, 0.6),
    'vectorizer__ngram_range': ((1, 4), (1, 5), (1, 3), (2, 8), (1, 6)),
    'vectorizer__max_features': (None, 400, 4000),
    'vectorizer__analyzer': ('char_wb'), 
    'vectorizer__stop_words': (None),
    'vectorizer__binary': (True, False),
    'transformer__use_idf' : (True, False)
}

In [67]:
# Для скорости будем использовать часть выборки X_train
X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.8)

# Перебираем 3 классификатора
clfs = [LinearSVC, SGDClassifier, LogisticRegression]
for clf in clfs:
    vectorizer = StemmedCountVectorizer(
        tokenizer=LemmaTokenizer(),
        lowercase=True
    )
    
    transformer = TfidfTransformer()
    classifier = clf()
    
    grid_search = GridSearchCV(
        pipe(vectorizer, transformer, classifier),
        param_grid = parameters, 
        verbose=1,
    scoring=)
    
    grid_search.fit(X_tr, y_tr)
    best_parameters = grid_search.best_estimator_.get_params()
    
    print(clf)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print('')

ValueError: Parameter values for parameter (vectorizer__analyzer) need to be a sequence(but not a string) or np.ndarray.

Итак, построим модель с подобранными параметрами:
<class 'sklearn.svm.classes.LinearSVC'>
Best score: 0.869
Best parameters set:
	transformer__use_idf: True
	vectorizer__analyzer: 'char_wb'
	vectorizer__binary: False
	vectorizer__max_df: 0.6
	vectorizer__max_features: None
	vectorizer__min_df: 1
	vectorizer__ngram_range: (1, 5)
	vectorizer__stop_words: None

In [35]:
vectorizer = StemmedCountVectorizer( 
    stop_words=None,
    analyzer='char_wb',
    min_df=1,
    max_df=0.6,
    ngram_range=(1, 5),
    max_features=None,
    tokenizer=LemmaTokenizer(),
    binary=False,
    lowercase=True
)
transformer = TfidfTransformer(
    use_idf=True
)
classifier = LinearSVC(class_weight='balanced')
vectorizer = pipe(vectorizer, transformer, classifier)
vectorizer = vectorizer.fit(X_train, y_train)
predicted = vectorizer.predict(X_test)

In [36]:
open("pred.csv", 'w').close() # Создаём / очищаем файл
with open("pred.csv", 'a') as f:
    f.write('Id,y')
    for i in range(len(predicted)):
        y = str(predicted[i])
        f.write('\n' + str(i) + ',' + y)