In [1]:
import re
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import groupby
from collections import namedtuple
from pandas.io.json import json_normalize


import nltk
import pymorphy2
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize


from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from gensim.models import KeyedVectors


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.cluster import KMeans
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier



In [2]:
def cleaning(text):
    """Turn a raw poem string into a list of normalised tokens.

    Steps:
      1. delete every character that is neither a word character nor
         whitespace;
      2. split the result with NLTK's ``word_tokenize``;
      3. replace each token by the ``normal_form`` of its first parse from
         the module-level ``morph`` analyser;
      4. drop tokens found in the module-level ``stopWords`` collection, a few
         leftover symbols, and anything that is punctuation after stripping.

    ``morph`` and ``stopWords`` are globals that must be defined before this
    function is called.

    Returns a list of strings.
    """
    stripped = re.sub(r'[^\w\s]', '', text)
    lemmas = [morph.parse(raw)[0].normal_form for raw in word_tokenize(stripped)]

    leftovers = (" ", "—", "«", "»", "..")
    kept = []
    for lemma in lemmas:
        if lemma in stopWords or lemma in leftovers:
            continue
        # Substring test against string.punctuation: this also discards
        # tokens that are empty once stripped.
        if lemma.strip() in punctuation:
            continue
        kept.append(lemma)
    return kept

In [3]:
# Read the poem corpus from a UTF-8 encoded JSON file.
file = 'classic_poems.json'
with open(file, encoding='utf-8') as fh:  # open the file for reading
    data = json.load(fh)

# Globals consumed by cleaning(): the Russian stop-word list and the
# pymorphy2 analyser used to normalise word forms.
stopWords = stopwords.words('russian')
morph = pymorphy2.MorphAnalyzer()

In [4]:
# Select the rows whose poet_id is 'blok' and run cleaning() on each poem's
# 'content', giving one token list per poem.
poems = pd.DataFrame(data)
is_blok = poems['poet_id'] == 'blok'
blok = poems[is_blok]
dfpoems = cleaned_blok = blok['content'].apply(cleaning)
# Plain Python list of token lists, in the same order as the rows of `blok`.
Blokpoems = dfpoems.tolist()

In [5]:
# Sentiment lexicon: a semicolon-separated, windows-1251 encoded CSV whose
# first column becomes the index. The index entries are later compared with
# vocabulary words, and 'average rate' holds the rating of each entry.
dictionary = pd.read_csv('WordRate.csv', sep=';', index_col=0, encoding='windows-1251')
sentiments = dictionary['average rate']

# Sub-frames of the lexicon, one per exact rating value.
positive1 = dictionary[sentiments == 1]
positive2 = dictionary[sentiments == 2]
negative1 = dictionary[sentiments == -1]
negative2 = dictionary[sentiments == -2]
neutral = dictionary[sentiments == 0]

# The rating column of each sub-frame ...
posdf1, posdf2 = positive1['average rate'], positive2['average rate']
negdf1, negdf2 = negative1['average rate'], negative2['average rate']
neudf = neutral['average rate']

# ... and the index of each, used for `word in <index>` membership tests.
pos1, pos2 = posdf1.index, posdf2.index
neg1, neg2 = negdf1.index, negdf2.index
neu = neudf.index

In [6]:
def merge(Blokpoems):
    """Flatten a sequence of token sequences into one list.

    Parameters
    ----------
    Blokpoems : iterable of iterables
        For example a list of poems, each poem being a list of tokens.

    Returns
    -------
    list
        Every token of every poem, in the original order. An empty input
        gives an empty list.
    """
    # A flat comprehension avoids a local named `all`, which would shadow the
    # builtin of the same name.
    return [token for poem in Blokpoems for token in poem]
# All tokens of all poems in one flat list (duplicates included).
words = merge(Blokpoems)
# Unique tokens. NOTE: this is a set, so it has no defined order and the
# iteration order may differ between kernel sessions. Other cells iterate
# `vocab` directly (BoW, the `sents` loop, `list(vocab)`) and assume they all
# see the same order, so `vocab` must not be modified after this point.
vocab = set(words)

In [7]:
def BoW(text, vocab):
    """Encode ``text`` as a binary presence vector over ``vocab``.

    Parameters
    ----------
    text : iterable of hashable
        Tokens of one document.
    vocab : iterable
        Vocabulary terms. The output follows the iteration order of this
        argument.

    Returns
    -------
    list of int
        One entry per vocabulary term: 1 if the term occurs in ``text`` at
        least once, otherwise 0. The values mark presence, not counts.
    """
    # Build the lookup once instead of rescanning `text` for every term.
    present = set(text)
    return [1 if term in present else 0 for term in vocab]

In [8]:
def TFIDF(poems):
    """Compute a dense TF-IDF table for tokenised documents.

    Each document's tokens are joined with single spaces and passed to a
    default-configured ``TfidfVectorizer``.

    Parameters
    ----------
    poems : iterable of iterables of str
        One token sequence per document.

    Returns
    -------
    pandas.DataFrame
        One row per document (default integer index, in input order) and one
        column per feature name reported by the vectorizer.

    Notes
    -----
    The vectorizer re-tokenises the joined text itself, so its feature names
    are not guaranteed to match the input tokens one-to-one. Callers should
    check that a word is a column before looking it up.
    """
    documents = [' '.join(tokens) for tokens in poems]
    vectorizer = TfidfVectorizer()
    # toarray() yields a regular ndarray rather than the legacy np.matrix
    # returned by todense().
    weights = vectorizer.fit_transform(documents).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    return pd.DataFrame(weights, columns=terms)

In [10]:
# One binary bag-of-words vector per poem; the columns follow the iteration
# order of `vocab`.
TextVectors = [BoW(poem_tokens, vocab) for poem_tokens in Blokpoems]

In [11]:
# TF-IDF table for Blok's poems: row i corresponds to Blokpoems[i], and the
# columns are the feature names produced inside TFIDF().
tfidfplain = TFIDF(Blokpoems)

In [22]:
# Sentiment score for every vocabulary word, in the iteration order of
# `vocab`. The lexicon groups are tried in the order listed below, and a word
# found in none of them is scored 0.
rated_groups = (
    (pos1, 1),
    (pos2, 2),
    (neu, 0),
    (neg1, -1),
    (neg2, -2),
)

sents = []
for word in vocab:
    for group, score in rated_groups:
        if word in group:
            sents.append(score)
            break
    else:
        # Not present in the lexicon at all.
        sents.append(0)

In [13]:
# Positional view of the vocabulary: dicti[j] is the word behind column j of
# every BoW vector and behind sents[j]. This only holds because all three are
# produced by iterating the same unmodified `vocab` set, so do not sort or
# otherwise reorder this list.
dicti = list(vocab)

In [42]:
# Weight each binary bag-of-words vector by the TF-IDF score of the matching
# word, giving one feature vector per poem.
#
# Column j of every vector corresponds to dicti[j]. Words that have no column
# in tfidfplain contribute 0. Row i of tfidfplain and TextVectors[i] both
# describe Blokpoems[i], so the two matrices line up row by row.
#
# The product is computed on whole matrices instead of looking up
# tfidfplain[word][text_n] once per (poem, word) pair in nested Python loops.
bow_matrix = np.asarray(TextVectors)
tfidf_aligned = tfidfplain.reindex(columns=dicti, fill_value=0.0).to_numpy()
# Keep the original container type: a list with one 1-D array per poem.
newDocVecs = list(bow_matrix * tfidf_aligned)
        

In [43]:
# Split the poems into two clusters based on the vectors in newDocVecs.
# n_init is pinned to 10 (the historical default) because scikit-learn 1.4
# changed the default to 'auto', which runs a single initialisation for
# k-means++ and can therefore yield different clusters for the same seed.
kmeans = KMeans(n_clusters=2, n_init=10, max_iter=500, random_state=50).fit(newDocVecs)
# Cluster index assigned to each poem, in the order of newDocVecs.
lab = kmeans.labels_

In [47]:
# Sentiment-weighted document vectors: presence * TF-IDF * sentiment score.
#
# Column j of every vector corresponds to dicti[j] and sents[j]. Words that
# have no column in tfidfplain contribute 0, and so does every word whose
# sentiment score is 0 (including words missing from the lexicon).
#
# NOTE: this rebinds `newDocVecs`, replacing the TF-IDF-only vectors built
# earlier in the notebook.
#
# The product is computed on whole matrices instead of looking up
# tfidfplain[word][text_n] once per (poem, word) pair in nested Python loops.
bow_matrix = np.asarray(TextVectors)
tfidf_aligned = tfidfplain.reindex(columns=dicti, fill_value=0.0).to_numpy()
sentiment_weights = np.asarray(sents)
# Keep the original container type: a list with one 1-D array per poem.
newDocVecs = list(bow_matrix * tfidf_aligned * sentiment_weights)

In [48]:
# Split the poems into two clusters based on the current newDocVecs.
# NOTE: this rebinds `kmeans` and `lab`, replacing the result of the earlier
# clustering run in this notebook.
# n_init is pinned to 10 (the historical default) because scikit-learn 1.4
# changed the default to 'auto', which runs a single initialisation for
# k-means++ and can therefore yield different clusters for the same seed.
kmeans = KMeans(n_clusters=2, n_init=10, max_iter=500, random_state=50).fit(newDocVecs)
# Cluster index assigned to each poem, in the order of newDocVecs.
lab = kmeans.labels_