In [1]:
import re
import csv
import json
import numpy as np
import pandas as pd
from itertools import groupby
from collections import namedtuple
from pandas.io.json import json_normalize

import nltk
import pymorphy2
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.cluster import KMeans



In [2]:
def cleaning(text):
    """Turn a raw text into a filtered list of lemmas.

    Every character that is neither a word character nor whitespace is
    removed, the remainder is split with ``word_tokenize``, and each token is
    replaced by the ``normal_form`` of its first ``morph.parse`` result.
    Stop words and punctuation-like leftovers are then discarded.

    Relies on the module-level ``morph`` and ``stopWords`` objects.
    """
    stripped = re.sub(r'[^\w\s]', '', text)
    lemmas = [morph.parse(raw)[0].normal_form for raw in word_tokenize(stripped)]

    def keep(lemma):
        if lemma in stopWords:
            return False
        if lemma in (" ", "—", "«", "»", ".."):
            return False
        # Substring test against string.punctuation; a lemma that strips to ""
        # is rejected as well, because "" is contained in every string.
        return lemma.strip() not in punctuation

    return [lemma for lemma in lemmas if keep(lemma)]

In [3]:
def TFIDF(poems):
    """Build a dense TF-IDF table for a collection of tokenized poems.

    Parameters
    ----------
    poems : iterable of iterables of str
        One token sequence per poem. Tokens are joined with single spaces
        before being handed to ``TfidfVectorizer`` (default settings).

    Returns
    -------
    pandas.DataFrame
        One row per poem, in input order, and one column per term kept by
        the vectorizer.
    """
    documents = [' '.join(tokens) for tokens in poems]
    vectorizer = TfidfVectorizer()
    # toarray() gives a plain ndarray; todense() returns the discouraged
    # numpy.matrix type.
    weights = vectorizer.fit_transform(documents).toarray()
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Use its replacement when available and fall back on old releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    return pd.DataFrame(weights, columns=terms)

In [4]:
#Sentiments from corpora
# Load the rated lexicon; its first column becomes the index, which is later
# matched against vocabulary words.
dictionary = pd.read_csv('WordRate.csv', sep=';', index_col=0, encoding = 'windows-1251')
sentiments = dictionary['average rate']

# One boolean row mask per rating value that is used below.
_rate_mask = {rate: sentiments == rate for rate in (-2, -1, 0, 1, 2)}

# Lexicon rows split by rating.
positive1, positive2 = dictionary[_rate_mask[1]], dictionary[_rate_mask[2]]
negative1, negative2 = dictionary[_rate_mask[-1]], dictionary[_rate_mask[-2]]
neutral = dictionary[_rate_mask[0]]

# The rating column of each split ...
posdf1, posdf2 = positive1['average rate'], positive2['average rate']
negdf1, negdf2 = negative1['average rate'], negative2['average rate']
neudf = neutral['average rate']

# ... and the index entries belonging to each split.
pos1, pos2 = posdf1.index, posdf2.index
neg1, neg2 = negdf1.index, negdf2.index
neu = neudf.index

In [5]:
#Blok
# Shared NLP resources read by cleaning(): the morphological analyzer and the
# Russian stop-word list.
morph = pymorphy2.MorphAnalyzer()
stopWords = stopwords.words('russian')

# Open the corpus file for reading and parse it as JSON.
file = 'classic_poems.json'
with open(file, mode='r', encoding='utf-8') as fh:
    data = json.load(fh)


In [6]:
# Keep only the rows whose poet_id is 'blok' and lemmatize each poem's text.
poems = pd.DataFrame(data)
is_blok = poems['poet_id'] == 'blok'
blok = poems[is_blok]
dfpoems = cleaned_blok = blok['content'].apply(cleaning)
# Plain list of token lists, one entry per poem.
Blokpoems = dfpoems.tolist()

In [7]:
# Train word vectors on the lemmatized poems, normalize them in place and
# persist the model.
# NOTE(review): size=/iter=, init_sims() and wv.vocab are the gensim 3.x
# spellings — confirm the pinned gensim version before upgrading.
model = Word2Vec(Blokpoems, size=100, window=5, min_count=1, iter=10)
model.init_sims(replace = True)
model.save("word2vec.model")
wv = KeyedVectors.load("word2vec.model", mmap='r')

# Sorted vocabulary plus one vector per word, in that same order.
allwords = model.wv.vocab
words = list(allwords)
vocabulary = sorted(words)
vectors = [np.array(model.wv[token]) for token in vocabulary]

In [8]:
# Dense TF-IDF table for the Blok poems: one row per poem, one column per term.
tfidfplain = TFIDF(Blokpoems)

In [9]:
#Sentiments from corpora
# Seed label for every vocabulary word. Lexicon membership is tested in the
# order pos1, pos2, neu, neg1, neg2 and the first hit wins; a word found in
# none of them gets -1, the value scikit-learn's semi-supervised estimators
# treat as "unlabeled".
_seed_codes = ((pos1, 3), (pos2, 4), (neu, 2), (neg1, 1), (neg2, 0))
sents = [
    next((code for lexicon, code in _seed_codes if word in lexicon), -1)
    for word in vocabulary
]

In [10]:
# Semi-supervised step: spread the seed labels over the word vectors with a
# k-nearest-neighbours kernel and read back the label assigned to every word.
labeled_spr = LabelSpreading(kernel='knn')
labelsent = labeled_spr.fit(X=vectors, y=sents)
labels = labelsent.transduction_


In [11]:
# Shift the 0..4 seed codes back onto the -2..2 rating scale. The value is
# derived from the fitted estimator rather than from `labels` itself, so
# re-running this cell cannot subtract 2 a second time.
labels = labelsent.transduction_ - 2

In [12]:
def wvtodv(text, n, vecs, tfidf, labels, vocab=None):
    """Sum the word vectors of one document, weighted by TF-IDF and label.

    For every token in ``text`` the vector ``vecs[word]`` is added to the
    running sum. If the token is a column of ``tfidf`` the vector is first
    multiplied by ``tfidf[word][n]`` and by the label of that word; otherwise
    it is added unchanged.

    Parameters
    ----------
    text : iterable of str
        Tokens of the document.
    n : int
        Row label of this document in ``tfidf``.
    vecs : mapping-like
        ``vecs[word]`` must return the vector of ``word``.
    tfidf : pandas.DataFrame
        TF-IDF table; any object supporting ``word in tfidf`` and
        ``tfidf[word][n]`` works.
    labels : sequence of numbers
        ``labels[i]`` is the label of ``vocab[i]``.
    vocab : sequence of str, optional
        Word order that ``labels`` follows. Defaults to the module-level
        ``vocabulary``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, dim)``. For an empty ``text`` it is all zeros,
        with ``dim`` taken from ``vecs.vector_size`` when present and 100
        otherwise.

    Raises
    ------
    KeyError
        If a token has no vector in ``vecs``, or is a ``tfidf`` column but is
        missing from ``vocab``.
    """
    if vocab is None:
        vocab = vocabulary

    # Exact-match position of every word. The previous substring test
    # (`word in x`) returned the first vocabulary entry that merely
    # *contained* the word, so e.g. "room" received the label of "bedroom".
    position = {}
    for i, token in enumerate(vocab):
        position.setdefault(token, i)  # first occurrence wins

    updvec = None
    for word in text:
        w2v = np.array(vecs[word])
        if updvec is None:
            # Size the accumulator from the data instead of assuming 100 dims.
            updvec = np.zeros((1, w2v.shape[-1]))
        if word in tfidf:
            ti = tfidf[word][n]
            new = w2v * ti * labels[position[word]]
        else:
            new = w2v
        updvec = updvec + new

    if updvec is None:
        # No tokens at all: keep returning a zero row vector.
        updvec = np.zeros((1, getattr(vecs, 'vector_size', 100)))
    return updvec

In [14]:
# Document vectors: the weighted word-vector sum of each poem divided by the
# poem's token count. Iterating over Blokpoems itself replaces the hard-coded
# count of 1077, which raised IndexError on a smaller corpus and silently
# skipped poems on a larger one.
tf = []
for i, poem in enumerate(Blokpoems):
    vec = wvtodv(poem, i, model.wv, tfidfplain, labels)[0]
    # A poem with no tokens left after cleaning would divide by zero and feed
    # NaNs into the clustering step; dividing by 1 keeps its all-zero vector.
    docvec = vec / max(len(poem), 1)
    tf.append(np.array(docvec))


In [15]:
# Two-cluster k-means over the document vectors, with a fixed random_state.
kmeans = KMeans(n_clusters=2, max_iter=500, random_state=170)
kmeans.fit(tf)
# Cluster index assigned to each document vector.
lab = kmeans.labels_