## Topic Modeling

In [10]:
import pandas as pd
import tqdm
import regex
import os
import re
import shutil
import matplotlib.pyplot as plt
import artm
import codecs
from seaborn import heatmap
import time
from math import log
import operator
import numpy as np
from nltk import sent_tokenize
from pymystem3 import Mystem
import cPickle as pickle
%matplotlib inline

In [12]:
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec

# Read the prepared tweets: one list entry per line of the file, with the
# trailing newline of each line left in place.
with open('extra_data/prepared_tweets.txt', 'r') as infile:
    tweets = list(infile)

# Read the targets file the same way, one list entry per line.
with open('extra_data/targets.txt', 'r') as infile:
    y = list(infile)

In [13]:
# Fixed seed so the 80/20 split (and every count derived from it further
# down) is identical on each Restart & Run All.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    tweets, y, test_size=0.2, random_state=SPLIT_SEED)

In [14]:
def cleanText(corpus):
    """Lower-case every document, delete newline characters and split on whitespace.

    corpus: iterable of strings.
    Returns a list holding one list of tokens per input string.
    """
    tokenized = []
    for document in corpus:
        normalized = document.lower().replace('\n', '')
        tokenized.append(normalized.split())
    return tokenized

# Tokenize both splits in place. After this cell every element of x_train /
# x_test is a list of tokens, so re-running the cell without re-running the
# split fails (cleanText calls .lower() on each element).
x_train = cleanText(x_train)
x_test = cleanText(x_test)

In [15]:
# Parse the label lines into ints. The loop variable is deliberately not
# named `y`: in Python 2 a list-comprehension variable leaks into the
# enclosing scope, which would replace the full `y` list with a single string.
y_train = [int(label.strip()) for label in y_train]
y_test = [int(label.strip()) for label in y_test]

In [24]:
# Inclusive bounds on len(word) for a word to survive filter_words_by_length.
MIN_WORD_LEN = 2
MAX_WORD_LEN = 20
# Smallest number of items a text needs to survive filter_texts_by_length.
MIN_WORD_COUNT = 5


def filter_words_by_length(text):
    """Return the words of `text` whose len() lies in [MIN_WORD_LEN, MAX_WORD_LEN]."""
    kept = []
    for word in text:
        if MIN_WORD_LEN <= len(word) <= MAX_WORD_LEN:
            kept.append(word)
    return kept


def filter_texts_by_length(texts):
    """Return the texts that contain at least MIN_WORD_COUNT items."""
    return [doc for doc in texts if not len(doc) < MIN_WORD_COUNT]

In [22]:
# Drop too-short / too-long words from every training text; the number of
# texts is unchanged.
# NOTE(review): only x_train is filtered here, x_test keeps all of its words
# - confirm that is intended.
x_train = list(map(filter_words_by_length, x_train))

In [25]:
# Number of training texts before short texts are dropped.
len(x_train)

181467

In [26]:
# Drop training texts with fewer than MIN_WORD_COUNT words, and drop their
# labels with them so that x_train[i] still pairs with y_train[i].
# The label filter repeats the predicate used by filter_texts_by_length and
# must run first, while x_train still has its original length.
assert len(x_train) == len(y_train), "x_train and y_train are out of sync"
y_train = [label for text, label in zip(x_train, y_train)
           if len(text) >= MIN_WORD_COUNT]
x_train = filter_texts_by_length(x_train)

In [27]:
# Number of training texts left after dropping the short ones.
len(x_train)

167743

### Формируем словарь коллекции из n-грамм (n = 1–3)

In [31]:
from collections import defaultdict

In [173]:
def make_dict(data, min_word_len=3):
    """Count how often every unigram, bigram and trigram occurs in `data`.

    Each text is first reduced to the words with ``len(word) >= min_word_len``.
    Bigrams and trigrams are built from adjacent words of that reduced text
    and joined with "_".

    data: iterable of texts, each a list of word strings.
    min_word_len: minimum ``len()`` of a word to be counted. Defaults to 3,
        the value that used to be re-assigned inside the loop under the name
        MIN_WORD_LEN (shadowing the module-level constant of the same name).
    Returns a defaultdict(int) mapping term -> total number of occurrences
    over all texts; a term repeated inside one text is counted every time.

    NOTE(review): ``len`` counts bytes if the words are UTF-8 byte strings -
    confirm whether a character-based threshold is intended.
    """
    corpus_dict = defaultdict(int)
    for text in data:
        words = [word for word in text if len(word) >= min_word_len]
        # zip stops at the shortest argument, so the shifted slices alone
        # yield exactly the adjacent pairs / triples.
        bigrams = ["_".join(pair) for pair in zip(words, words[1:])]
        trigrams = ["_".join(triple) for triple in zip(words, words[1:], words[2:])]
        for term in words + bigrams + trigrams:
            corpus_dict[term] += 1
    return corpus_dict

In [174]:
# Term -> occurrence count over the filtered training texts; make_vw reads
# this module-level name directly.
corpus_dict = make_dict(x_train)

In [175]:
# Number of distinct 1-, 2- and 3-gram terms before frequency filtering.
len(corpus_dict)

1923175

In [176]:
def filter_vocab(corpus_dict):
    """Keep the terms of `corpus_dict` that are frequent and long enough.

    A term is kept when its count is at least MIN_DF (5) and its len() is at
    least MIN_WORD_LEN. MIN_WORD_LEN is not assigned in this function, so it
    resolves to the module-level constant.

    corpus_dict: mapping term -> count.
    Returns a defaultdict(int) in which every kept term maps to 1.
    """
    MIN_DF = 5
    vocab = defaultdict(int)
    for term, freq in corpus_dict.iteritems():
        if freq < MIN_DF or len(term) < MIN_WORD_LEN:
            continue
        # Keys of corpus_dict are unique, so each kept term ends up at 1.
        vocab[term] += 1
    return vocab

In [177]:
# Vocabulary of terms that pass the frequency / length thresholds of filter_vocab.
vocab = filter_vocab(corpus_dict)

In [183]:
# Sanity check: a vocabulary key decodes from UTF-8 into a `unicode` object.
type(vocab.keys()[0].decode('utf-8'))

unicode

### Формируем файлы в формате Vowpal Wabbit

In [126]:
def make_vw(texts):
    """Render every text as one Vowpal Wabbit line of "term:count" features.

    For each text the words with ``len(word.strip()) >= 3`` are kept, bigrams
    and trigrams of adjacent kept words are joined with "_", and the
    occurrences of every term inside that text are counted. Only terms whose
    count in the module-level `corpus_dict` is at least 5 are written.

    texts: sequence of texts, each a list of UTF-8 encoded word strings
        (terms are passed through ``.decode('utf-8')``).
    Returns a list of unicode lines "<index> |@default_class term:count ...",
    where <index> is the position of the text in `texts`.

    NOTE(review): terms are written verbatim; if a term can contain ':' or
    '|' the resulting line may not parse as intended - confirm that the
    upstream cleaning removes those characters.
    """
    output_vw = []
    MIN_DF = 5
    MIN_WORD_LEN = 3
    # Wrap `texts` itself (not the enumerate object) so tqdm can read
    # len(texts) and display a total and ETA.
    for index, text in enumerate(tqdm.tqdm(texts)):
        vw_dict = defaultdict(int)
        text = [word for word in text if len(word.strip()) >= MIN_WORD_LEN]
        bigrams = [k[0]+"_"+k[1] for k in zip(text[:-1], text[1:])]
        trigrams = [k[0]+"_"+k[1]+"_"+k[2] for k in zip(text[:-2], text[1:-1], text[2:])]
        for word in text + bigrams + trigrams:
            vw_dict[word] += 1
        # .get() rather than corpus_dict[term]: indexing a defaultdict inserts
        # every unseen term with count 0, which would silently grow the
        # global dictionary whenever texts other than the training set are
        # converted.
        sentence = [u"{}:{}".format(term.decode('utf-8'), frequency)
                    for term, frequency in vw_dict.iteritems()
                    if corpus_dict.get(term, 0) >= MIN_DF]
        output_vw.append(u"{} |@default_class {}".format(index, " ".join(sentence)))
    return output_vw

In [127]:
# One Vowpal Wabbit line per training text, indexed by position in x_train.
vw_train = make_vw(x_train)

167743it [00:38, 4306.21it/s]


In [130]:
# make_vw appends exactly one line per input text, so this equals len(x_train).
print len(vw_train)

167743


In [132]:
# Dump the Vowpal Wabbit lines as UTF-8, one document per line, with a
# terminating newline after the last one.
with codecs.open("extra_data/" + "vw.txt", "w", encoding="utf-8") as output:
    output.write("\n".join(vw_train))
    output.write("\n")

In [184]:
# Dump the vocabulary terms as UTF-8, one term per line, with a terminating
# newline after the last one.
with codecs.open("extra_data/" + "vocab.txt", "w", encoding="utf-8") as output:
    decoded_terms = (key.decode('utf-8') for key in vocab.keys())
    output.write("\n".join(decoded_terms))
    output.write("\n")

## Make model

In [189]:
# Folder holding the Vowpal Wabbit dump and the vocabulary written earlier.
FOLDER_DATA = 'extra_data/'
PATH_TO_VW = FOLDER_DATA + 'vw.txt'
VOCAB_PATH = FOLDER_DATA + "vocab.txt"
# Target folder for the batches, built in a single assignment.
FOLDER_BATCHES = 'batches/' + "replies"

In [190]:
# Build a BatchVectorizer from the Vowpal Wabbit file at PATH_TO_VW, with
# FOLDER_BATCHES passed as the target folder.
# NOTE(review): gather_dictionary=True presumably makes it collect a
# dictionary over the data as well - confirm against the artm documentation.
bv = artm.BatchVectorizer(data_path=PATH_TO_VW, data_format="vowpal_wabbit",
                          target_folder=FOLDER_BATCHES, gather_dictionary=True)