## Создание признакового пространства

In [1]:
import string

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [5]:
DATA_PATH = './data/preprocessed_tweets.csv'

In [32]:
import ast

preprocessed_tweets = pd.read_csv(DATA_PATH, quotechar='"', sep=',', converters={5:ast.literal_eval,
                                                                                6:ast.literal_eval,
                                                                                7:ast.literal_eval,
                                                                                8:ast.literal_eval,})

preprocessed_tweets.drop('Unnamed: 0', axis=1, inplace=True)
preprocessed_tweets.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in your,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time]","[model, love, take, time]","[model, love, take, time]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


### Count vectorizer

In [50]:
corpus = preprocessed_tweets['tweet_stemmed'].map(' '.join)

count_vectorizer = CountVectorizer(max_df=0.9, min_df=0, max_features=1000, stop_words='english')

# Создаем the Bag-of-Words модель
bag_of_words = count_vectorizer.fit_transform(corpus)

# Отобразим Bag-of-Words модель как DataFrame
feature_names = count_vectorizer.get_feature_names()
cv_stemmed_df = pd.DataFrame(bag_of_words.toarray(), columns = feature_names)
cv_stemmed_df.head()

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
corpus = preprocessed_tweets['tweet_lemmatized'].map(' '.join)\

count_vectorizer = CountVectorizer(max_df=0.9, min_df=0, max_features=1000, stop_words='english')

# Создаем the Bag-of-Words модель
bag_of_words = count_vectorizer.fit_transform(corpus)

# Отобразим Bag-of-Words модель как DataFrame
feature_names = count_vectorizer.get_feature_names()
cv_lemmatized_df = pd.DataFrame(bag_of_words.toarray(), columns = feature_names)
cv_lemmatized_df.head()

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adult,...,yes,yesterday,yo,yoga,york,young,youth,youtube,yr,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF

In [53]:
corpus = preprocessed_tweets['tweet_stemmed'].map(' '.join)

tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=0, max_features=1000, stop_words='english')
values = tfidf_vectorizer.fit_transform(corpus)

# Show the Model as a pandas DataFrame
feature_names = tfidf_vectorizer.get_feature_names()
tfidf_steemed_df = pd.DataFrame(values.toarray(), columns = feature_names)
tfidf_steemed_df.head()

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
corpus = preprocessed_tweets['tweet_lemmatized'].map(' '.join)

tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=0, max_features=1000, stop_words='english')
values = tfidf_vectorizer.fit_transform(corpus)

# Show the Model as a pandas DataFrame
feature_names = tfidf_vectorizer.get_feature_names()
tfidf_lemmatized_df = pd.DataFrame(values.toarray(), columns = feature_names)
tfidf_lemmatized_df.head()

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adult,...,yes,yesterday,yo,yoga,york,young,youth,youtube,yr,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### GenSim word2vec

In [93]:
import gensim.models
import warnings
warnings.filterwarnings("ignore")

In [88]:
model = gensim.models.Word2Vec(sentences=preprocessed_tweets['tweet_token'], size=200, window=5, min_count=2, 
                               sg=1, hs=0, negative=10, workers=32, seed=34)

In [89]:
model.train(sentences=corpus, total_examples=model.corpus_count, epochs=20)

(0, 68013460)

In [108]:
result = model.similar_by_word("dinner", topn=3)
print(result)

result = model.similar_by_word("trump", topn=3)
print(result)

[('lunch', 0.8706658482551575), ('bestie', 0.857658863067627), ('familytime', 0.8485989570617676)]
[('ally', 0.7643455862998962), ('republican', 0.763067364692688), ('paladino', 0.7621865272521973)]


In [109]:
model['food']

array([-3.22200269e-01, -7.51387835e-01,  2.87613153e-01,  1.89385310e-01,
       -3.28811228e-01,  1.08633459e-01,  2.59771615e-01, -1.39136463e-01,
       -1.40886605e-01,  3.30440074e-01, -6.44045137e-03, -6.92815483e-02,
        1.03464276e-01, -2.04096600e-01, -1.91211358e-01, -1.30970836e-01,
       -2.17706621e-01, -4.25080210e-02,  5.23558319e-01,  3.29476520e-02,
       -1.23872623e-01, -3.85678798e-01, -2.23511215e-02, -5.57434142e-01,
        3.67491394e-01,  1.01377077e-01,  4.63929862e-01,  5.86753413e-02,
        1.88741714e-01,  2.74126470e-01,  5.16839549e-02,  1.92546383e-01,
       -1.96882427e-01, -2.08005741e-01, -2.50579953e-01, -1.87378272e-01,
       -2.51723140e-01,  1.29639670e-01,  9.11197141e-02, -1.64196938e-01,
       -2.10582271e-01, -1.44278202e-02, -3.19892347e-01, -1.32650241e-01,
        1.41484246e-01,  2.80395359e-01, -1.01433553e-01, -3.22639167e-01,
        4.20709811e-02, -1.79141283e-01,  1.89767465e-01, -4.07582104e-01,
        2.68681079e-01, -

### tweet_to_vec

In [130]:
model['food'].size

200

In [161]:
def tweet_to_vec(tweet):
    vec = np.zeros(model.vector_size)
    n_words = 0
    
    for word in tweet:
        try:
            vec += model[word]
            n_words += 1
        except KeyError:
            pass
    
    if n_words:    
        vec /= n_words
    
    return vec

In [162]:
tweets_vecs = preprocessed_tweets['tweet_token'].map(tweet_to_vec)

In [171]:
tweets_vecs.shape

(49159,)

In [172]:
tweets_vecs[0].shape

(200,)

## Чат бот

In [177]:
import string
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
import annoy
from gensim.models import Word2Vec, FastText
import pickle
import numpy as np
from tqdm import tqdm_notebook

In [178]:
assert False

#Small preprocess of the answers

question = None
written = False

with open("prepared_answers.txt", "w") as fout:
    with open("Otvety.txt", "r") as fin:
        for line in tqdm_notebook(fin):
            if line.startswith("---"):
                written = False
                continue
            if not written and question is not None:
                fout.write(question.replace("\t", " ").strip() + "\t" + line.replace("\t", " "))
                written = True
                question = None
                continue
            if not written:
                question = line.strip()
                continue

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [179]:
def preprocess_txt(line):
    spls = "".join(i for i in line.strip() if i not in exclude).split()
    spls = [morpher.parse(i.lower())[0].normal_form for i in spls]
    spls = [i for i in spls if i not in sw and i != ""]
    return spls

In [180]:
assert True

# Preprocess for models fitting

sentences = []

morpher = MorphAnalyzer()
sw = set(get_stop_words("ru"))
exclude = set(string.punctuation)
c = 0

with open("Otvety.txt", "r") as fin:
    for line in tqdm_notebook(fin):
        spls = preprocess_txt(line)
        sentences.append(spls)
        c += 1
        if c > 100000:
            break

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [181]:
sentences = [i for i in sentences if len(i) > 2]
sentences[0]

['вопрос', 'тдв', 'отдыхать', 'лично', 'советовать', 'завести']

In [188]:
modelW2V = Word2Vec(sentences=sentences, size=200, window=10, min_count=5)
modelFT = FastText(sentences=sentences, size=200, min_count=5, window=10)

In [190]:
w2v_index = annoy.AnnoyIndex(modelW2V.vector_size ,'angular')
ft_index = annoy.AnnoyIndex(modelFT.vector_size ,'angular')

index_map = {}
counter = 0

with open("prepared_answers.txt", "r") as f:
    for line in tqdm_notebook(f):
        n_w2v = 0
        n_ft = 0
        spls = line.split("\t")
        index_map[counter] = spls[1]
        question = preprocess_txt(spls[0])
        
        vector_w2v = np.zeros(modelW2V.vector_size)
        vector_ft = np.zeros(modelFT.vector_size)
        for word in question:
            if word in modelW2V:
                vector_w2v += modelW2V[word]
                n_w2v += 1
            if word in modelFT:
                vector_ft += modelFT[word]
                n_ft += 1
        if n_w2v > 0:
            vector_w2v = vector_w2v / n_w2v
        if n_ft > 0:
            vector_ft = vector_ft / n_ft
        w2v_index.add_item(counter, vector_w2v)
        ft_index.add_item(counter, vector_ft)
            
        counter += 1
        
        if counter > 100000:
            break

w2v_index.build(10)
ft_index.build(10)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




True

In [194]:
def get_response(question, index, model, index_map):
    question = preprocess_txt(question)
    vector = np.zeros(model.vector_size)
    norm = 0
    for word in question:
        if word in model:
            vector += model[word]
            norm += 1
    if norm > 0:
        vector = vector / norm
    answers = index.get_nns_by_vector(vector, 3)
    return [index_map[i] for i in answers]

In [207]:
TEXT = "как бороться с насморком"

In [208]:
get_response(TEXT, w2v_index, modelW2V, index_map)

['Печеньку с чаем... пиво с чипсами... шампанское с адреналином... выбирай. \n',
 'Да здесь некоторых-которые постоянно оскорбляют, унижают женщин, создали бы для них отдельный сайт-и пусть бы идиоты сами друг перед другом куражились!). \n',
 'Ну для кого как, а кого-то любовь окрыляет, делает добрее.. \n']

In [209]:
get_response(TEXT, ft_index, modelFT, index_map)

['Живых надо бояться.... \n',
 'Давай те с начала разберёмся отчего возникает храп? <br><br> Когда человек лежит на спине, глубина дыхания у него возрастает и у него открывается рот, — это явление называется гипервентиляция легких. Именно гипервентиляция легких (или глубокое дыхание во сне) приводит в движение мягкие структуры глотки и вызывает сужение дыхательных путей. В итоге мы слышим столь ненавистные для всех звуки: «Х-р, х-р-р, х-р-р-р» . Храп – это признак избыточности легочной вентиляции.<br><br>Ведь храп можно вылечить элементарно, как дважды два четыре. Необходимо повернуть человека на бок или живот, закрыть ему рот и заставить уменьшить глубину дыхания. Таким образом, мы устраняем гипервентиляцию легких, и храп моментально прекращается.. \n',
 'Большинство людей змей не боятся, просто чувствуют к ним резкую неприязнь и отвращение, отчсти из-за внешнего вида, отчасти, думая, что они ядовитые, хотя это и не так. Есть люди, которые действительно боятся змей - это фобики, им на