In [1]:
import pickle
import pandas as pd
import numpy as np
from scipy import sparse
from zipfile import ZipFile
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the training table straight out of the zip archive without extracting it to disk.
with ZipFile('../data/train.csv.zip') as archive, archive.open('train.csv') as csv_member:
    train_df = pd.read_csv(csv_member)

In [3]:
# Read the test table straight out of the zip archive without extracting it to disk.
with ZipFile('../data/test.csv.zip') as archive, archive.open('test.csv') as csv_member:
    test_df = pd.read_csv(csv_member)

In [4]:
# Number of training rows; used later to split the jointly-fitted matrices back into train/test.
train_len = train_df.shape[0]

In [5]:
# Stack train on top of test so the vectorizers are fitted on both corpora at once.
# Train rows come first, so rows [0, train_len) are train and the rest are test.
# The original row indices are kept (no ignore_index), so index labels may repeat.
joint_df = pd.concat([train_df, test_df], axis=0)

In [6]:
# The separate frames are no longer needed once joint_df exists; drop the references to free memory.
del train_df
del test_df

In [7]:
# Plain Python list of title strings, in joint_df row order (train first, then test).
# Missing titles are replaced with '' before the string cast: a bare .astype(str)
# would turn NaN into the literal text 'nan', which would then be tokenized as a real word.
titles = joint_df['title'].fillna('').astype(str).tolist()

In [8]:
# Plain Python list of description strings, in joint_df row order (train first, then test).
# Missing descriptions are replaced with '' before the string cast: a bare .astype(str)
# would turn NaN into the literal text 'nan', which would then be tokenized as a real word.
descriptions = joint_df['description'].fillna('').astype(str).tolist()

In [9]:
# Only the extracted title/description lists are used from here on; drop the frame to free memory.
del joint_df

In [10]:
from nltk.corpus import stopwords

In [11]:
# Stop-word set used by both TF-IDF vectorizers: NLTK's English and Russian lists
# plus a handful of extra short tokens added by hand.
stop_en = stopwords.words('english')
stop_ru = stopwords.words('russian')
stop = set(stop_en).union(stop_ru)
stop.update((
    'м', 'эт', 'м²', 'сот', 'р', 'в', 'т', 'д',
))

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
import re
from pymorphy2 import MorphAnalyzer

In [19]:
ma = MorphAnalyzer()
# Memo of token -> normal form, so each distinct token is analysed only once.
normal_forms = {}


def to_normal_form(token):
    """Return the first normal form the morphological analyzer gives for `token`.

    Results are cached in the module-level `normal_forms` dict; repeated tokens
    are served from the cache instead of calling the analyzer again.
    """
    try:
        return normal_forms[token]
    except KeyError:
        lemma = ma.normal_forms(token)[0]
        normal_forms[token] = lemma
        return lemma

In [20]:
# A token is a run of word characters that are not digits, optionally preceded
# and/or followed by digits (so "2шт" and "iphone7" match, a bare "100" does not).
token_re = re.compile(r'[0-9]*[^\W\d]+[0-9]*')


def tokenizer(text):
    """Split `text` into `token_re` matches and map each one through `to_normal_form`.

    Returns the list of normalized tokens in order of appearance.
    """
    return [to_normal_form(raw_token) for raw_token in token_re.findall(text)]

In [21]:
# TF-IDF over unigrams and bigrams of the tokens produced by `tokenizer`.
#   max_df=0.95 / min_df=2 : ignore terms found in more than 95% of documents or in fewer than 2
#   max_features=50000     : cap the vocabulary size
# stop_words is passed as a list rather than the `stop` set: scikit-learn documents this
# parameter as 'english', a list or None, and releases with parameter validation reject
# a set at fit time. Sorting keeps the stored parameter deterministic.
# NOTE(review): stop words are compared against the output of `tokenizer` (normalized
# tokens), while `stop` holds raw word forms — consider normalizing the stop list too.
descr_tfv = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=sorted(stop), max_features=50000,
                      ngram_range=(1, 2), tokenizer=tokenizer)

In [22]:
# Learn the vocabulary/IDF weights and vectorize in one pass. `descriptions` holds
# train and test text together, so both share a single feature space.
descr_tfidf = descr_tfv.fit_transform(raw_documents=descriptions)

In [23]:
# Training part: the first train_len rows of the jointly-fitted description matrix.
sparse.save_npz(
    file='./text_features/train/description_tfidf_50000_pymorphy.npz',
    matrix=descr_tfidf[:train_len],
)

In [24]:
# Test part: every row after the first train_len rows of the jointly-fitted description matrix.
sparse.save_npz(
    file='./text_features/test/description_tfidf_50000_pymorphy.npz',
    matrix=descr_tfidf[train_len:],
)

In [25]:
# Persist the fitted description vectorizer.
# NOTE: the vectorizer was built with the notebook-defined `tokenizer`; pickle stores
# functions by reference, so whatever loads this file must define the same function.
with open('./text_features/description_tfidf_50000_pymorphy_vectorizer.pkl', 'wb') as pkl_file:
    pickle.dump(descr_tfv, pkl_file)

In [26]:
# Description features are saved to disk; release the in-memory objects before processing titles.
del descr_tfidf
del descr_tfv
del descriptions

In [27]:
# TF-IDF over unigrams and bigrams of the tokens produced by `tokenizer`.
#   max_df=0.95 / min_df=2 : ignore terms found in more than 95% of documents or in fewer than 2
#   max_features=7000      : cap the vocabulary size
# stop_words is passed as a list rather than the `stop` set: scikit-learn documents this
# parameter as 'english', a list or None, and releases with parameter validation reject
# a set at fit time. Sorting keeps the stored parameter deterministic.
# NOTE(review): stop words are compared against the output of `tokenizer` (normalized
# tokens), while `stop` holds raw word forms — consider normalizing the stop list too.
title_tfv = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=sorted(stop), max_features=7000,
                      ngram_range=(1, 2), tokenizer=tokenizer)

In [28]:
# Learn the vocabulary/IDF weights and vectorize in one pass. `titles` holds
# train and test text together, so both share a single feature space.
title_tfidf = title_tfv.fit_transform(raw_documents=titles)

In [29]:
# Training part: the first train_len rows of the jointly-fitted title matrix.
# NOTE(review): the filename says "50000" although title_tfv uses max_features=7000;
# the path is left unchanged in case other code loads it by this name.
sparse.save_npz(
    file='./text_features/train/title_tfidf_50000_pymorphy.npz',
    matrix=title_tfidf[:train_len],
)

In [30]:
# Test part: every row after the first train_len rows of the jointly-fitted title matrix.
# NOTE(review): the filename says "50000" although title_tfv uses max_features=7000;
# the path is left unchanged in case other code loads it by this name.
sparse.save_npz(
    file='./text_features/test/title_tfidf_50000_pymorphy.npz',
    matrix=title_tfidf[train_len:],
)

In [31]:
# Persist the fitted title vectorizer.
# NOTE: the vectorizer was built with the notebook-defined `tokenizer`; pickle stores
# functions by reference, so whatever loads this file must define the same function.
# NOTE(review): the filename says "50000" although title_tfv uses max_features=7000;
# the path is left unchanged in case other code loads it by this name.
with open('./text_features/title_tfidf_50000_pymorphy_vectorizer.pkl', 'wb') as pkl_file:
    pickle.dump(title_tfv, pkl_file)