In [1]:
import json

import bz2
import regex
from tqdm import tqdm
from scipy import sparse

In [2]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
responses = []
with bz2.BZ2File('banki_responses.json.bz2', 'r') as thefile:
    for row in tqdm(thefile):
        resp = json.loads(row)
        if not resp['rating_not_checked'] and (len(resp['text'].split()) > 0):
            responses.append(resp)

201030it [01:49, 1827.56it/s]


In [4]:
counter = {}
for resp in responses:
    if resp['bank_name'] in counter:
        counter[resp['bank_name']] += 1
    else:
        counter[resp['bank_name']] = 1

In [5]:
df = pd.DataFrame()

lens = []
symbols = []
texts = []
for resp in responses:
    symbols.append(len(resp['text']))
    lens.append(len(resp['text'].split(' ')))
    texts.append(resp['text'])

grades = []
for resp in responses:
    grades.append(resp['rating_grade'])

df['lens'] = lens
df['symbols'] = symbols
df['texts'] = texts
df['grades'] = grades

In [6]:
from nltk.tokenize import sent_tokenize

In [7]:
df1 = df[(df.grades == 1) | (df.grades ==5)]

In [8]:
df1.shape[0], df.shape[0]

(62100, 153499)

In [9]:
df1['sents'] = df1.texts.apply(sent_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
sentences = []
for _, row in df1.iterrows():
    for sent in row.sents:
        sentences.append(sent)

In [11]:
import re
regex = re.compile("[А-Яа-я]+")

def words_only(text, regex=regex):
    try:
        return " ".join(regex.findall(text))
    except:
        return ""


# df.texts = df.texts.str.lower()
# df.texts = df.texts.apply(words_only)

In [12]:
for i in range(len(sentences)):
    sentences[i] = sentences[i].lower()
    sentences[i] = words_only(sentences[i])

In [13]:
sentences[99]

'нет все в пределах правил со странной улыбкой отвечает сотрудница просто замечательно думаю я надо сказать что диалог наш продолжался какое то время а операционисток было всего двое и клиенты освободившиеся с работы приходили по своим вопросам в банк соответственно скопилась очередь'

In [14]:
from pymorphy2 import MorphAnalyzer

In [15]:
from nltk.corpus import stopwords

mystopwords = stopwords.words('russian') + ['это', 'наш' , 'тыс', 'млн', 'млрд', 'также',  'т', 'д']
def  remove_stopwords(text, mystopwords = mystopwords):
    try:
        return " ".join([token for token in text.split() if not token in mystopwords])
    except:
        return ""

def lemmatize(text):
    m = MorphAnalyzer()
    try:
        lemmas1 = [m.parse(word)[0].normal_form for word in text.split()]
        return ' '.join(lemmas1) 
    except:
        return " "

mystoplemmas = ['который','прошлый','сей', 'свой', 'наш', 'мочь']
def  remove_stoplemmas(text, mystoplemmas = mystoplemmas):
    try:
        return " ".join([token for token in text.split() if not token in mystoplemmas])
    except:
        return ""

In [16]:
texts = [sentences[i].split() for i in range(len(sentences))]

#### Обучение word2vec

In [17]:
%%time
from gensim.models import Word2Vec
model = Word2Vec(texts, size=100, window=5, min_count=5, workers=4)
model.save("sent_w2v.model")

Wall time: 1min 1s


In [19]:
model.most_similar("россия")[:4]

  """Entry point for launching an IPython kernel.


[('столица', 0.7434579133987427),
 ('российская', 0.7340778708457947),
 ('обл', 0.7270342111587524),
 ('область', 0.7225314378738403)]

In [20]:
model.most_similar(positive=["доллар","надежный"], negative=["плохой"])[:4]

  """Entry point for launching an IPython kernel.


[('долларах', 0.608579158782959),
 ('евро', 0.5920159816741943),
 ('эквивалент', 0.5798343420028687),
 ('сша', 0.5702683329582214)]

In [21]:
model.most_similar(positive=["сотрудник","вежливый"], negative=["хам"])[:4]

  """Entry point for launching an IPython kernel.


[('специалист', 0.5671907663345337),
 ('представитель', 0.5593234300613403),
 ('оператор', 0.5378738641738892),
 ('консультант', 0.5374178886413574)]

In [22]:
model.doesnt_match("салат мороженое суп окно".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'окно'

#### Визуализация

In [23]:
top_words = []
from nltk import FreqDist
fd = FreqDist()
for text in texts:
    fd.update(text)
for i in fd.most_common(1000):
    top_words.append(i[0])

In [24]:
top_words_vec = model[top_words]

  """Entry point for launching an IPython kernel.


In [25]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
top_words_tsne = tsne.fit_transform(top_words_vec)

In [26]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=top_words_tsne[:,0],
                                    x2=top_words_tsne[:,1],
                                    names=top_words))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

In [27]:
banks = []
x1_b = []
x2_b = []
names_b = []
for key in counter:
    name = key.lower().split()
    for i in range(len(name)):
        try:            
            x = (model['хорошо'] - model['плохо']).dot(model[name[i]])
            y = (model['быстро'] - model['медленно']).dot(model[name[i]])
            banks.append([name[i],x,y])
            x1_b.append(x)
            x2_b.append(y)
            names_b.append(name[i])
        except:
            continue

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [28]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=x1_b,
                                    x2=x2_b,
                                    names=names_b))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

#### Часть 2. Распространение метки [50 баллов]

In [29]:
import numpy as np
from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading

positives = ['быстрый', 'помогающий', 'качественный', 'удобный', 'одобрение']
negatives = ['отвратительный', 'очередь', 'плохо', 'проблема', 'ненадежный', 'ужас']

In [30]:
vectors = pd.DataFrame(data=model.wv[model.wv.vocab.keys()])
vectors['word'] = model.wv.vocab.keys()

In [31]:
vectors['TARGET'] = vectors.apply(lambda row: 1 if row['word'] in positives else 0 if row['word'] 
                                  in negatives else -1, axis=1)
y = vectors['TARGET']
X = vectors.drop(['word', 'TARGET'], axis=1)

# Подбор позитивных меток
label_prop_model = LabelSpreading(kernel='knn', alpha=0.2, n_neighbors=5, max_iter=5, tol=0.3)
label_prop_model.fit(X, y)

vectors['results'] = label_prop_model.transduction_
vectors[(vectors.results==1)][['word', 'TARGET', 'results']]

  self.label_distributions_ /= normalizer


Unnamed: 0,word,TARGET,results
5,плюс,-1,1
458,маленький,-1,1
742,какой,-1,1
1068,простой,-1,1
1152,большой,-1,1
1174,балл,-1,1
1179,такой,-1,1
2026,минус,-1,1
2777,небольшой,-1,1
2826,огромный,-1,1


In [32]:
vectors['TARGET'] = vectors.apply(lambda row: 0 if row['word'] in positives else 1 if row['word'] 
                                  in negatives else -1, axis=1)
y = vectors['TARGET']
X = vectors.drop(['word', 'TARGET'], axis=1)

# Подбор негативных меток

label_prop_model = LabelSpreading(kernel='knn', alpha=0.1, n_neighbors=5, max_iter=5, tol=0.2)
label_prop_model.fit(X, y)

vectors['results'] = label_prop_model.transduction_
vectors[(vectors.results==1)][['word', 'TARGET', 'results']]

  self.label_distributions_ /= normalizer


Unnamed: 0,word,TARGET,results
80,сознательно,-1,1
307,проблема,1,1
313,как,-1,1
335,резюмирую,-1,1
460,перешел,-1,1
699,назывался,-1,1
700,международный,-1,1
890,прекрасно,-1,1
919,очередь,1,1
925,долго,-1,1
