In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cPickle as pickle
import tqdm
import regex
import os
import re
import shutil
import operator
import codecs
import time
import random
from math import log
from nltk import sent_tokenize
from pymystem3 import Mystem
from seaborn import heatmap
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import RussianStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

### Loading and Preprocessing 

In [2]:
# Column names handed to `pd.read_csv(..., names=header)` for both tweet files.
header = [
    'id', 'tdate', 'tmane', 'ttext', 'ttype', 'trep',
    'trtf', 'tfav', 'tstcount', 'tfol', 'tfrien', 'listcount',
]

In [3]:
# Read the positive and negative tweet dumps (';'-separated); column names
# are supplied explicitly through `names=header`.
pos_twitts = pd.read_csv('dataset/train_positive.csv', sep=';', names=header)
neg_twitts = pd.read_csv('dataset/train_negative.csv', sep=';', names=header)
twitts = pd.concat([pos_twitts, neg_twitts], axis=0)
# Shuffle all rows. The seed is fixed so that Restart & Run All yields the
# same row order (and therefore the same train/test split and scores);
# without it every run shuffles differently.
twitts = twitts.sample(frac=1, random_state=42)

In [4]:
# Model inputs come from the `ttext` column, targets from the `ttype` column.
X = twitts['ttext'].values
y = twitts['ttype'].values

In [36]:
def prepare_text(text):
    """Normalize a raw tweet before tokenization.

    The text is lower-cased, then @mentions, #hashtags, links, the retweet
    marker ``rt``, a few letter-based emoticons (``o_O``, ``:D``, ``dd``),
    digits and punctuation are removed. Runs of removed characters and
    whitespace collapse to a single space and the result is stripped.

    :param text: tweet text (``str`` or ``unicode``).
    :return: the cleaned text; an empty string if nothing is left.
    """
    text = text.lower().strip()
    text = re.sub(r'@\S+', '', text)      # delete @user_name
    # The hashtag body is removed together with '#'. The previous pattern
    # used \W (NON-word characters), so it never matched a real hashtag.
    text = re.sub(r'#\S+', '', text)      # delete #hashtag
    text = re.sub(r'http\S+', ' ', text)  # delete links
    # Tokens are matched as whole tokens. The previous version listed them
    # inside one [...] character class, which deleted every single 'o', 'd',
    # 'r' and 't' from ordinary words.
    text = re.sub(r'\brt\b', ' ', text)   # delete retweet marker
    # Input is already lower-cased, so "o_O" is "o_o" and ":DD" is ":dd".
    text = re.sub(r'\bo_o\b|[:;=]-?d+\b|\bd{2,}\b', ' ', text)
    # Digits, punctuation-based smiles and whitespace runs -> one space.
    text = re.sub(r'[0-9":()!\-;?#_,\s]+', ' ', text)
    return text.strip()

In [29]:
# Decode each UTF-8 byte string in X (Python 2 `str.decode`).
X_prep = [raw_text.decode('utf-8') for raw_text in X]

In [8]:
# Sanity check: lemmatize one word and show the first returned item.
mystem = Mystem()
print(mystem.lemmatize("пришла")[0])

приходить


In [7]:
# A token is a run of word characters or, failing that, a run of
# non-whitespace characters. The pattern is a raw string so that `\w` and
# `\S` reach the regex engine verbatim instead of relying on Python keeping
# unknown string escapes (deprecated in newer Python versions).
tokenizer = RegexpTokenizer(r'\w+|\S+')
r = RussianStemmer()
mystem = Mystem()

# Inclusive token-length bounds used by filter_words_by_length.
MIN_WORD_LEN = 2
MAX_WORD_LEN = 20

def tokenize(text):
    """Split ``text`` into tokens with the module-level ``tokenizer``."""
    tokens = tokenizer.tokenize(text)
    return tokens

def stem_text(text):
    """Return a list with the stem of every word in ``text``.

    ``text`` is an iterable of words; stemming uses the module-level
    ``RussianStemmer`` instance ``r``.
    """
    stems = []
    for token in text:
        stems.append(r.stem(token))
    return stems

def lemmatize_text(text):
    """Return a list with one lemma per word in ``text``.

    Each word is passed to ``mystem.lemmatize`` on its own and only the
    first element of the returned list is kept.
    """
    lemmas = []
    for token in text:
        analysis = mystem.lemmatize(token)
        lemmas.append(analysis[0])
    return lemmas

def filter_words_by_length(text):
    """Keep the words whose length is within MIN_WORD_LEN..MAX_WORD_LEN (inclusive)."""
    kept = []
    for word in text:
        if MIN_WORD_LEN <= len(word) <= MAX_WORD_LEN:
            kept.append(word)
    return kept

In [30]:
# Tokenize every decoded tweet.
X_tok = list(map(tokenize, X_prep))

In [None]:
# Stemmed variant of the token lists.
X_stem = [stem_text(tokens) for tokens in X_tok]

In [26]:
# Lemmatized variant of the token lists; tqdm reports progress per tweet.
X_lemm = [lemmatize_text(tokens) for tokens in tqdm.tqdm(X_tok)]

100%|██████████| 226834/226834 [08:23<00:00, 450.42it/s]


In [31]:
# Re-join tokens into space-separated strings for the vectorizers.
# NOTE: this is built from X_tok (plain tokens), not from X_stem or X_lemm.
X_sent = [' '.join(tokens) for tokens in X_tok]

In [32]:
# Hold out 30% of the tweets for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_sent, y,
    test_size=0.3,
    random_state=42,
)

In [44]:
# Two pipelines over 1- to 3-grams that occur in at least 5 documents:
# raw counts with C=0.1, and TF-IDF weights with the default C.
model_logreg = Pipeline([
    ('count', CountVectorizer(ngram_range=(1, 3), min_df=5)),
    ('log_reg', LogisticRegression(class_weight='balanced', C=0.1)),
])
model_logreg_tf_idf = Pipeline([
    ('count', TfidfVectorizer(ngram_range=(1, 3), min_df=5)),
    ('log_reg', LogisticRegression(class_weight='balanced')),
])

In [45]:
# Fit the count-based pipeline on the training split.
model_logreg.fit(X_train, y_train)

Pipeline(steps=[('count', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        s...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [34]:
# Fit the TF-IDF-based pipeline on the training split.
model_logreg_tf_idf.fit(X_train, y_train)

Pipeline(steps=[('count', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=Tru...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [46]:
# Accuracy of the count-based pipeline on the held-out split.
preds_logreg = model_logreg.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; the original call
# passed them reversed, which happens to give the same accuracy.
print(accuracy_score(y_test, preds_logreg))

0.762193061087


In [35]:
# Accuracy of the TF-IDF-based pipeline on the held-out split.
preds_logreg_tf_idf = model_logreg_tf_idf.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; the original call
# passed them reversed, which happens to give the same accuracy.
print(accuracy_score(y_test, preds_logreg_tf_idf))

0.748468060719


## GridSearch

In [38]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Counts -> TF-IDF -> logistic regression, tuned with a grid search.
pipe = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("tfidf_transformer", TfidfTransformer()),
    ("algo", LogisticRegression()),
])

# Only `algo__C` has more than one candidate; the other entries pin a
# single value each.
param_dict = {
    'vectorizer__analyzer': ['word'],
    'vectorizer__ngram_range': [(1, 3)],
    'tfidf_transformer__norm': ['l2'],
    'algo__C': [0.01, 0.1, 1, 10, 100],
    'algo__penalty': ['l2'],
}

# NOTE: the search is fitted on the full X_sent / y, not on X_train / y_train.
estimator = GridSearchCV(pipe, param_dict, scoring='accuracy', verbose=True)
estimator.fit(X_sent, y)

## Load the sentiment dictionary

In [64]:
# Directory with auxiliary resources and the path of the sentiment
# dictionary spreadsheet inside it.
EXTRA_DATA = "extra_data/"
SENT_DICT = os.path.join(
    EXTRA_DATA,
    "collection (docs&words)_2016_all_labels/full word_rating_after_coding.xlsx",
)

In [65]:
# Creates a fresh Mystem instance; `mystem` is also assigned in earlier cells
# of this notebook, so this rebinds the same name.
mystem = Mystem()

In [66]:
# Read the dictionary without a header row, so the columns are labelled 0 and 1.
sentiment = pd.read_excel(SENT_DICT, header=None)
print(sentiment.shape)
sentiment.head(10)

(26771, 2)


Unnamed: 0,0,1
0,абажур,0
1,абажур,0
2,абажур,-1
3,абориген,-1
4,абориген,-1
5,абориген,0
6,аборт,-2
7,аборт,0
8,аборт,-1
9,аборт,0


In [67]:
# Average the ratings in column 1 over the repeated entries of column 0 and
# name the resulting column "value".
# NOTE: rebinds `sentiment`, so this cell cannot be re-run without reloading.
sentiment = (
    sentiment
    .groupby(0)
    .mean()
    .rename(columns={1: "value"})
)

In [69]:
# Size and first rows of the aggregated dictionary.
print(sentiment.shape)
sentiment.head()

(6860, 1)


Unnamed: 0_level_0,value
0,Unnamed: 1_level_1
абажур,-0.333333
абориген,-0.666667
аборт,-0.75
абортивный,-0.25
абсолютный,0.0
