In [26]:
import re
import csv
import json
import numpy as np
import pandas as pd
from itertools import groupby
from collections import namedtuple
from pandas.io.json import json_normalize

import nltk
import pymorphy2
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.cluster import KMeans
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
def cleaning(text):
    text = re.sub(r'[^\w\s]','',text)
    tokenized = []
    words = word_tokenize(text)
    for word in words:
        p = morph.parse(word)[0]
        tokenized.append(p.normal_form)
    tokenized = [token for token in tokenized if token not in stopWords\
                and token != " " \
                and token != "—" \
                and token != "«" \
                and token != "»" \
                and token != ".." \
                and token.strip() not in punctuation]
    return tokenized

In [3]:
def shaping (rev):
    reviews = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for i, text in enumerate(rev):
        tags = [i]
        reviews.append(analyzedDocument(text, tags))
    return reviews

In [4]:
def TFIDF (poems):
    arrpoems =[]
    for i in poems:
        string = ' '.join(i)
        arrpoems.append(string)
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(arrpoems).todense()
    matrix = pd.DataFrame(matrix, columns=vectorizer.get_feature_names())
    return matrix

In [5]:
def wvtodv (text, n, vecs, tfidf, labels):
    updvec = np.zeros((1,100))
    for word in text:
        w2v = np.array(vecs[word])
        if word in tfidf:
            ti = tfidf[word][n]
            new = w2v*ti*labels[i]
        else :
            new = w2v
        updvec = updvec + new
    return updvec

In [6]:
dictionary = pd.read_csv('WordRate.csv', sep=';', index_col=0, encoding = 'windows-1251')
sentiments = dictionary['average rate']
positive1 = dictionary[lambda x: x['average rate'] == 1]
positive2 = dictionary[lambda x: x['average rate'] == 2]
negative1 = dictionary[lambda x: x['average rate'] == -1]
negative2 = dictionary[lambda x: x['average rate'] == -2]
neutral = dictionary[lambda x: x['average rate'] == 0]
posdf1 = positive1['average rate']
posdf2 = positive2['average rate']
negdf1 = negative1['average rate']
negdf2 = negative2['average rate']
neudf = neutral['average rate']
pos1 = posdf1.index
pos2 = posdf2.index
neg1 = negdf1.index
neg2 = negdf2.index
neu = neudf.index

In [7]:
#Blok
file = 'classic_poems.json'
with open(file, 'r', encoding='utf-8') as fh: #открываем файл на чтение
    data = json.load(fh)
morph = pymorphy2.MorphAnalyzer()
stopWords = stopwords.words('russian')

In [8]:
#Preprocessing
poems = pd.DataFrame(data)
blok = poems[lambda x: x['poet_id'] == 'blok']
cleaned_blok = blok['content'].apply(cleaning)
dfpoems = cleaned_blok
Blokpoems = dfpoems.tolist()
blok_train, blok_test = train_test_split(Blokpoems, test_size=0.5, random_state=50)

In [9]:
modeltrain = Word2Vec(blok_train, size=100, window=5, min_count=1, iter=10)
modeltrain.init_sims(replace = True)
modeltrain.save("word2vec.model")
wv = KeyedVectors.load("word2vec.model", mmap='r')
allwords2 = modeltrain.wv.vocab
words2 = [i for i in allwords2]
vocabularytrain = sorted(words2)
vectorstrain = []
for word in vocabularytrain:
    vectorstrain.append(np.array(modeltrain.wv[word]))


In [10]:
tfidfplain = TFIDF(blok_train)

In [11]:
sents = []
for word in vocabularytrain:
    if word in pos1:
        sents.append(3)
    elif word in pos2:
        sents.append(4)
    elif word in neu:
        sents.append(2)
    elif word in neg1:
        sents.append(1)
    elif word in neg2:
        sents.append(0)
    else:
        sents.append(-1)

In [12]:
labeled_spr = LabelSpreading(kernel='knn')
labelsent = labeled_spr.fit(vectorstrain, sents)
labels = labelsent.transduction_

In [13]:
tf = []
for i in range (0,538):
    vec = wvtodv(blok_train[i], i, modeltrain.wv, tfidfplain, labels)[0]
    tf.append(np.array(vec))

In [14]:
kmeans = KMeans(n_clusters=2, max_iter = 500, random_state=50).fit(tf)
lab = kmeans.labels_

In [23]:
modeltest = Word2Vec(blok_test, size=100, window=5, min_count=1, iter=10)
modeltest.init_sims(replace = True)
modeltest.save("word2vec.model")
wv = KeyedVectors.load("word2vec.model", mmap='r')
allwordstest = modeltest.wv.vocab
wordstest = [i for i in allwordstest]
vocabularytest = sorted(wordstest)
vectorstest = []
for word in vocabularytest:
    vectorstest.append(np.array(modeltest.wv[word]))

In [44]:
tfidftest = TFIDF(blok_test)

In [50]:
tftest = []
for i in range (0,539):
    updvec = np.zeros((1,100))
    for word in blok_test[i]:
        vec = np.array(modeltest.wv[word])
        if word in tfidftest:
            ti = tfidftest[word][i]
            new = vec*ti
        else :
            new = vec
        updvec = updvec + new
    tftest.append(updvec[0])

In [51]:
bc = BaggingClassifier(n_estimators=100, random_state=50)
trees = model.fit(tf, lab)
predBC = trees.predict(tftest)

In [52]:
clf = LogisticRegression(random_state=50)
lr = clf.fit(tf, lab)
predLR = clf.predict(tftest)



In [54]:
predLR

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,