In [2]:
import numpy as np
import os
import gensim
import pymystem3
import re

In [3]:
def prepare_group(mypath):
    files = []
    stem = pymystem3.Mystem()
    
    for r, d, f in os.walk(mypath):
        for file in f:
            files.append(os.path.join(r, file))
        
    docs = []
    full = ''
    wordset = set()
    
    for filepath in files:
        corpus = []
        with open(filepath) as file:
            for line in file:
                corpus.append(line)
        corpus = [el for el in corpus if el != '\n']
        
        corpus = [re.sub(r'[^\w\s]','',el)[:-1] for el in corpus]
        
        for el in corpus:
            words = el.lower()
            proc = stem.lemmatize(words)
            proc = [w for w in proc if (w.strip() != '') and (w != '\n')]
            docs.append(proc)
            
            for word in proc:
                full += ' ' + word
                wordset.add(word)
        print(filepath)

    return docs, wordset, full

In [4]:
docs_c, wordset_c, full_c = prepare_group('./books/classic')
docs_t, wordset_t, full_t = prepare_group('./books/trash')

./books/classic/jivago.txt
./books/classic/warandpeace.txt
./books/classic/govardtales.txt
./books/classic/brotherskaramasovi.txt
./books/classic/atthemountainsofmadness.txt
./books/classic/annakarenina.txt
./books/classic/oblomov.txt
./books/classic/idiot.txt
./books/classic/journeytomoscow.txt
./books/classic/onegin.txt
./books/classic/ktulhu_demo.txt
./books/classic/masterandmargaret.txt
./books/trash/lullaby.txt
./books/trash/survivor.txt
./books/trash/fameless.txt
./books/trash/chapaev_demo.txt
./books/trash/Pelevin.txt
./books/trash/gruz200.txt
./books/trash/rastamans.txt
./books/trash/oneflewover.txt
./books/trash/chapaevandvoid.txt
./books/trash/dogma.txt
./books/trash/snatch.txt
./books/trash/trainspotting.txt
./books/trash/brat.txt
./books/trash/pulpfiction.txt
./books/trash/mitiki.txt
./books/trash/messenger.txt
./books/trash/reservoirdogs.txt
./books/trash/pelevintales.txt
./books/trash/choke.txt
./books/trash/piratesofthe.txt
./books/trash/fightclub.txt


In [5]:
texts = docs_c + docs_t
fulls = [full_c, full_t]
sets = [wordset_c, wordset_t]

In [7]:
model = gensim.models.Word2Vec(texts, size=100, iter=10, workers=8, min_count=1)

In [49]:
model1 = gensim.models.Word2Vec(texts, size=100, iter=5, workers=8, min_count=1, window=5)

In [71]:
model.wv.most_similar('фигня')

[('дерьмо', 0.8320658206939697),
 ('стих', 0.8217606544494629),
 ('неприятность', 0.821404755115509),
 ('мировой', 0.8191081881523132),
 ('ритуал', 0.8183972835540771),
 ('способ', 0.817700982093811),
 ('статус', 0.8173060417175293),
 ('барсик', 0.8147743940353394),
 ('санкция', 0.8146173357963562),
 ('зависимость', 0.8144347667694092)]

In [73]:
word = 'дерьмо'
index = fnames.index(word)
idf = npidf[:, index]
idf

array([9.37722147e-06, 1.66050262e-03])

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer()
tfidf = vec.fit_transform(fulls)
npidf = tfidf.toarray()
fnames = vec.get_feature_names()

In [None]:
def choose_by_tfidf(fnames, tfidf, word):
    try:
        index = fnames.index(word)
        idf = npidf[:, index]
        return idf[1] > 100 * idf[0]
    except:
        return False

In [14]:
def logic(string):
    stem = pymystem3.Mystem()
    
    for word in stem.lemmatize(string):
        if word.strip() == '':
            continue
        try:
            if choose_by_tfidf(fnames, tfidf, word):
                best = '0'
                best_res = -100
                for gword in sets[0]:
                    score = model.wv.similarity(word, gword)
                    if score > best_res:
                        best_res = score
                        best = gword
                print(word, ' ---> ', best)
        except:
            pass

In [70]:
test = 'фигня'
logic(test)

фигня  --->  дерьмо


In [9]:
def select_by_cos(word, sets, model):
    best = '0'
    best_res = -100
    for gword in sets[0]:
        score = model.wv.similarity(word, gword)
        if score > best_res:
            best_res = score
            best = gword
    return best, best_res

In [16]:
def production(string, model, texts, fulls, sets, fnames, tfidf):
    res = []
    stem = pymystem3.Mystem()
    proc = stem.lemmatize(string)
    corpus = [re.sub(r'[^\w\s]','',el) for el in proc]
    corpus = [el for el in corpus if el.strip() != '']
    
    for word in corpus:
        if choose_by_tfidf(fnames, tfidf, word):
            chose, pos = select_by_cos(word, sets, model)
            res.append([word, chose, pos])
    return res

In [89]:
production('Вася! я тут такое, блин, придумал', model, texts, fulls, sets)

['вася', 'я', 'тут', 'такой', 'блин', 'придумывать']


In [10]:
model.predict_output_word(['я', 'идти', 'дом'])

[('мимо', 0.013590944),
 ('ко', 0.00476735),
 ('домой', 0.003569626),
 ('куда', 0.0035491132),
 ('гулять', 0.0034510256),
 ('улица', 0.0033766152),
 ('далеко', 0.0026232267),
 ('отсюда', 0.0024589528),
 ('пешком', 0.0020692933),
 ('туда', 0.0019615763)]

In [11]:
model.most_similar('растаман')

  """Entry point for launching an IPython kernel.


[('чувак', 0.7908152937889099),
 ('джа', 0.7688435316085815),
 ('герло', 0.7603917121887207),
 ('hу', 0.7603676319122314),
 ('обломаться', 0.7556824684143066),
 ('блин', 0.7450451850891113),
 ('ганджа', 0.744154155254364),
 ('психиатор', 0.7407350540161133),
 ('короче', 0.7357670664787292),
 ('вишь', 0.7345012426376343)]