In [1]:
import pickle
import pandas as pd
import numpy as np
from zipfile import ZipFile
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read train.csv straight out of its zip archive into a DataFrame.
with ZipFile('../data/train.csv.zip') as archive, archive.open('train.csv') as member:
    train_df = pd.read_csv(member)

In [3]:
# Read test.csv straight out of its zip archive into a DataFrame.
with ZipFile('../data/test.csv.zip') as archive, archive.open('test.csv') as member:
    test_df = pd.read_csv(member)

In [4]:
# Stack train rows first, then test rows, so text features are fitted on both.
# Row labels are kept as they were in each frame (no index reset).
joint_df = pd.concat([train_df, test_df])

In [5]:
# Drop the separate frames; their rows are now held by joint_df.
del train_df
del test_df

In [6]:
from nltk.corpus import stopwords

In [7]:
# Merge NLTK's English and Russian stop-word lists into a single set.
stop_en = stopwords.words('english')
stop_ru = stopwords.words('russian')
stop = set(stop_en).union(stop_ru)
# Extra short tokens that should also be treated as stop words.
stop.update((
    'м', 'эт', 'м²', 'сот', 'р', 'в', 'т', 'д',
))

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import re
from pymorphy2 import MorphAnalyzer

In [None]:
# Shared pymorphy2 analyzer used by tokenizer() to normalize tokens.
ma = MorphAnalyzer()
# A token is one or more non-digit word characters, optionally preceded
# and/or followed by a run of ASCII digits.
token_re = re.compile(r'[0-9]*[^\W\d]+[0-9]*')

In [None]:
def tokenizer(text):
    """Split ``text`` into tokens and map each one to a normal form.

    Tokens are the substrings matched by the module-level ``token_re``.
    Each token is replaced by the first entry that ``ma.normal_forms``
    returns for it.

    Returns:
        list: the normal forms, in the order the tokens occur in ``text``.
    """
    normalized = []
    for raw_token in token_re.findall(text):
        # normal_forms() returns several candidates; keep the first one.
        normalized.append(ma.normal_forms(raw_token)[0])
    return normalized

In [9]:
# TF-IDF over word unigrams and bigrams of the descriptions, capped at 50000 features.
# stop_words is passed as a list rather than the raw set: newer scikit-learn
# releases validate this parameter and accept only 'english', a list or None,
# while older releases accept both. Sorting makes the list order deterministic.
tfv = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=sorted(stop), max_features=50000,
                      ngram_range=(1, 2), token_pattern=r'[0-9]*[^\W\d]+[0-9]*')

In [10]:
# Missing descriptions become the literal placeholder 'NONE' so the
# vectorizers always receive a string.
joint_df['description'] = joint_df['description'].fillna(value='NONE')

In [11]:
# Learn the vocabulary/IDF weights on all descriptions and vectorize them in one pass.
descriptions = joint_df['description']
tfidf = tfv.fit_transform(descriptions)

In [None]:
from sklearn.decomposition import NMF

In [None]:
# 20-component NMF used as a topic model over the TF-IDF matrix.
# The seed is fixed so the factorization (and the topics printed from it)
# is reproducible from run to run.
nmf = NMF(n_components=20, random_state=0)

In [None]:
# Fit the factorization on the TF-IDF matrix and keep the transformed data
# (one row per description, one column per NMF component).
comp20 = nmf.fit_transform(X=tfidf)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every component of ``model``.

    Args:
        model: fitted decomposition exposing ``components_``; each row is
            treated as one topic's weight vector over the features.
        feature_names: sequence mapping a feature index to its name.
        n_top_words: how many of the largest-weight features to list per topic.

    Output is one "Topic #k:" header per component followed by the feature
    names joined with " - ", then a final blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1]
        top_names = []
        for feature_idx in ranked[:n_top_words]:
            top_names.append(feature_names[feature_idx])
        print("\nTopic #%d:" % topic_idx)
        print(" - ".join(top_names))
    print()

In [None]:
print("Topics found via NMF:")
# get_feature_names() is deprecated and removed in newer scikit-learn releases;
# prefer get_feature_names_out() when the installed version has it and fall
# back to the legacy accessor otherwise.
if hasattr(tfv, 'get_feature_names_out'):
    tfidf_feature_names = tfv.get_feature_names_out()
else:
    tfidf_feature_names = tfv.get_feature_names()
print_top_words(nmf, tfidf_feature_names, 10)

In [12]:
from scipy.sparse import save_npz

In [13]:
# Train rows come first in joint_df, so the leading rows of the matrix are the train part.
# The split point is hard-coded; presumably it equals len(train_df) - verify if the data changes.
n_train_rows = 1503424
train_tfidf = tfidf[:n_train_rows]

In [14]:
# Everything after the hard-coded split point is the test part of the matrix.
# The split point presumably equals len(train_df) - verify if the data changes.
n_train_rows = 1503424
test_tfidf = tfidf[n_train_rows:]

In [15]:
# Sanity check: show the (rows, features) dimensions of the train slice.
train_tfidf.shape

(1503424, 50000)

In [16]:
# Sanity check: show the (rows, features) dimensions of the test slice.
test_tfidf.shape

(508438, 50000)

In [17]:
# Persist the train TF-IDF matrix with scipy's sparse .npz writer.
with open('./text_features/train/description_tfidf_50000.npz', 'wb') as out_file:
    save_npz(out_file, train_tfidf)

In [18]:
# Persist the test TF-IDF matrix with scipy's sparse .npz writer.
with open('./text_features/test/description_tfidf_50000.npz', 'wb') as out_file:
    save_npz(out_file, test_tfidf)

In [19]:
# Pickle the fitted TF-IDF vectorizer so the same vocabulary can be reused later.
with open('./text_features/description_tfidf_50000_vectorizer.pkl', 'wb') as out_file:
    pickle.dump(tfv, out_file)

In [None]:
from pymorphy2 import MorphAnalyzer

In [None]:
# Separate MorphAnalyzer instance bound to `an`, used by the probe cell that follows.
an = MorphAnalyzer()

In [None]:
# Probe the analyzer with a made-up Latin-letter string and show the first
# normal form it returns.
an.normal_forms('sdfasd')[0]

## Bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words counts over unigrams and bigrams, limited to 2000 features.
count_v = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    max_features=2000,
)

In [None]:
# Learn the vocabulary on all descriptions and build the count matrix in one pass.
descriptions = joint_df['description']
bof = count_v.fit_transform(descriptions)

In [None]:
from scipy.sparse import save_npz

In [None]:
# Train part of the bag-of-words matrix.
# Slice `bof` (the CountVectorizer output), not `tfidf`: slicing `tfidf` here
# would save a copy of the TF-IDF features under the bag-of-words file name.
train_bof = bof[:1503424]

In [None]:
# Test part of the bag-of-words matrix.
# Slice `bof` (the CountVectorizer output), not `tfidf`: slicing `tfidf` here
# would save a copy of the TF-IDF features under the bag-of-words file name.
test_bof = bof[1503424:]

In [None]:
# Sanity check: show the (rows, features) dimensions of the train slice.
train_bof.shape

In [None]:
# Sanity check: show the (rows, features) dimensions of the test slice.
test_bof.shape

In [None]:
# Persist the train bag-of-words matrix with scipy's sparse .npz writer.
with open('./text_features/train/description_bof.npz', 'wb') as out_file:
    save_npz(out_file, train_bof)

In [None]:
# Persist the test bag-of-words matrix with scipy's sparse .npz writer.
with open('./text_features/test/description_bof.npz', 'wb') as out_file:
    save_npz(out_file, test_bof)

In [None]:
# Pickle the fitted CountVectorizer so the same vocabulary can be reused later.
# The file name is spelled "description" (it was "dewscription"), matching the
# naming of the TF-IDF vectorizer pickle; update any loader that used the old name.
with open('./text_features/description_bof_vectorizer.pkl', 'wb') as f:
    pickle.dump(count_v, f)