трейн https://www.dropbox.com/s/nlp7ltvajkssbom/train_data.csv.zip?dl=0

тест https://www.dropbox.com/s/j6ya15gaa6mayth/stud_test_data_leak.csv.zip?dl=0

w2v моделька https://www.dropbox.com/s/9qlyc59gq4ozlqo/model.w2v.zip?dl=0

In [None]:
import re
from functools import lru_cache
from multiprocessing import Pool

import numpy as np
import pandas as pd
import pymorphy2

from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the dataset from a local CSV file.
# NOTE(review): the links at the top of this notebook name train_data.csv and
# stud_test_data_leak.csv, while this cell reads dataset_sample.csv — confirm
# which file is intended.
data = pd.read_csv('../data/dataset_sample.csv')

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Replace the `new_targ` column with the integer codes of its categorical
# encoding, so the target becomes numeric class ids.
data['new_targ'] = data.new_targ.astype('category').cat.codes

In [None]:
# Preview again to check the encoded target column.
data.head()

In [None]:
# 5-fold stratified splitter; shuffling with a fixed seed keeps the folds
# reproducible between runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12312)

In [None]:
# Encoded target (used for stratification) and the feature columns.
y = data.new_targ
X = data[['title', 'description', 'price']]

# Only the first of the folds produced by the splitter is used as the
# train/test split for the rest of the notebook.
train_index, test_index = next(iter(skf.split(X, y)))
print("TRAIN:", train_index, "TEST:", test_index)

In [None]:
# skf.split() yields positional row numbers, so select with .iloc rather than
# label-based .loc (the two only coincide for a default RangeIndex).
# .copy() gives each subset its own data, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
train_data = data.iloc[train_index].copy()
test_data = data.iloc[test_index].copy()

_______

In [None]:
# Class shares in the training fold.
train_data.new_targ.value_counts(normalize=True)

In [None]:
# Class shares in the held-out fold, for comparison with the training fold.
test_data.new_targ.value_counts(normalize=True)

________

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
import scipy.sparse as sp
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

##### CountVectorizer

In [None]:
%%time

# Binary bag-of-words over unigrams of "title description". The vocabulary is
# learned from the training fold only and then reused for the held-out fold.
vectorizer = CountVectorizer(ngram_range=(1,1), binary=True)
X_train = vectorizer.fit_transform(train_data['title'] + ' ' + train_data['description'])
X_test = vectorizer.transform(test_data['title'] + ' ' + test_data['description'])

In [None]:
# Size of the training feature matrix.
X_train.shape

In [None]:
# Logistic regression classifier with the L-BFGS solver.
lr = LogisticRegression(solver='lbfgs')

In [None]:
%%time

# Fit on the training fold.
lr.fit(X_train, train_data.new_targ)

In [None]:
# Predicted class ids for the held-out fold.
y_pred = lr.predict(X_test)

In [None]:
# Accuracy on the held-out fold.
accuracy_score(test_data.new_targ, y_pred)

##### CountVectorizer (униграммы и биграммы, max_features=1500)

In [None]:
# Binary bag-of-words over unigrams and bigrams, with the vocabulary capped at
# 1500 features; fitted on the training fold, reused for the held-out fold.
vectorizer = CountVectorizer(max_features=1500, ngram_range=(1,2), binary=True)
X_train = vectorizer.fit_transform(train_data['title'] + ' ' + train_data['description'])
X_test = vectorizer.transform(test_data['title'] + ' ' + test_data['description'])

In [None]:
# Size of the training feature matrix.
X_train.shape

In [None]:
# Fresh classifier for this feature set.
lr = LogisticRegression(solver='lbfgs')

In [None]:
%%time
# Fit on the training fold.
lr.fit(X_train, train_data.new_targ)

In [None]:
# Predict the held-out fold and report accuracy.
y_pred = lr.predict(X_test)
accuracy_score(test_data.new_targ, y_pred)

##### TfidfVectorizer

In [None]:
%%time
# TF-IDF weighted unigram and bigram features, capped at 1500 features;
# fitted on the training fold, reused for the held-out fold.
vectorizer = TfidfVectorizer(max_features=1500, ngram_range=(1,2))
X_train = vectorizer.fit_transform(train_data['title'] + ' ' + train_data['description'])
X_test = vectorizer.transform(test_data['title'] + ' ' + test_data['description'])

In [None]:
# Size of the held-out feature matrix.
X_test.shape

In [None]:
# Fresh classifier for this feature set.
lr = LogisticRegression(solver='lbfgs')

In [None]:
%%time
# Fit on the training fold.
lr.fit(X_train, train_data.new_targ)

In [None]:
# Predict the held-out fold and report accuracy.
y_pred = lr.predict(X_test)
accuracy_score(test_data.new_targ, y_pred)

##### HashingVectorizer

In [None]:
%%time
# Unigram features hashed into a fixed space of 50912 columns.
# Per the scikit-learn docs HashingVectorizer is stateless (no vocabulary is
# learned), so fit_transform() and transform() produce comparable matrices.
vectorizer = HashingVectorizer(n_features=50912, ngram_range=(1,1))
X_train = vectorizer.fit_transform(train_data['title'] + ' ' + train_data['description'])
X_test = vectorizer.transform(test_data['title'] + ' ' + test_data['description'])

In [None]:
# Size of the held-out feature matrix.
X_test.shape

In [None]:
# Fresh classifier for this feature set.
lr = LogisticRegression(solver='lbfgs')

In [None]:
%%time
# Fit on the training fold.
lr.fit(X_train, train_data.new_targ)

In [None]:
# Predict the held-out fold and report accuracy.
y_pred = lr.predict(X_test)
accuracy_score(test_data.new_targ, y_pred)

##### приведение слов к начальной форме

In [None]:
# Single shared morphological analyzer used by get_normal_form().
morph = pymorphy2.MorphAnalyzer()


@lru_cache(maxsize=100000)
def get_normal_form(i):
    """Return the first normal form the shared analyzer reports for ``i``.

    Results are memoised (up to 100000 distinct inputs), so repeated tokens
    are served from the cache instead of being analysed again.
    """
    candidates = morph.normal_forms(i)
    return candidates[0]

# Compiled once; a raw string avoids the invalid "\w" escape sequence warning
# that the non-raw literal triggers on recent Python versions.
_WORD_RE = re.compile(r'\w+')


def normalize_text(x, max_chars=200):
    """Lemmatise the leading part of a text.

    The text is first cut to its first ``max_chars`` characters (to bound the
    work per document), then split into ``\\w+`` tokens; every token is
    replaced by ``get_normal_form(token)`` and the results are joined with
    single spaces.

    Args:
        x: Input string.
        max_chars: Number of leading characters to keep before tokenising.
            Defaults to 200 (the previously hard-coded value); ``None`` keeps
            the whole text.

    Returns:
        The space-separated normal forms; an empty string when the kept part
        of the text contains no word characters.
    """
    tokens = _WORD_RE.findall(x[:max_chars])
    return ' '.join(get_normal_form(token) for token in tokens)

In [None]:
# Quick manual check of the normaliser on a sample listing title.
normalize_text('Стулья из прессованной кожи 23432 руб.')

In [None]:
# TODO: лучше выкидывать стоп-слова: предлоги, очень частотные слова («продам» и т. д.)

In [None]:
# Try the normaliser on the titles of the first few held-out rows.
test_data['title'].head().apply(normalize_text)

In [None]:
%%time
# Lemmatise "title description" of every training row in 4 worker processes.
# Leaving the `with` block already terminates the pool, so no explicit
# pool.terminate() call is needed.
with Pool(processes=4) as pool:
    train_data['normalized_text'] = pool.map(normalize_text, train_data['title'] + ' ' + train_data['description'])

In [None]:
%%time
# Lemmatise "title description" of every held-out row in 4 worker processes.
# Leaving the `with` block already terminates the pool, so no explicit
# pool.terminate() call is needed.
with Pool(processes=4) as pool:
    test_data['normalized_text'] = pool.map(normalize_text, test_data['title'] + ' ' + test_data['description'])

In [None]:
# Same binary unigram and bigram bag-of-words (1500 features) as before, but
# built from the lemmatised `normalized_text` column.
vectorizer = CountVectorizer(max_features=1500, ngram_range=(1,2), binary=True)
X_train = vectorizer.fit_transform(train_data['normalized_text'])
X_test = vectorizer.transform(test_data['normalized_text'])

In [None]:
# Size of the training feature matrix.
X_train.shape

In [None]:
# Fresh classifier for this feature set.
lr = LogisticRegression(solver='lbfgs')

In [None]:
%%time
# Fit on the training fold.
lr.fit(X_train, train_data.new_targ)

In [None]:
# Predict the held-out fold and report accuracy.
y_pred = lr.predict(X_test)
accuracy_score(test_data.new_targ, y_pred)

##### Word2Vec

In [None]:
# Hand-written 3-dimensional toy vectors, used only to illustrate comparing
# words by their vectors.
words = {
    'Москва': [0.4542, 0.7123, 0.2745],
    'Уфа': [0.4, -0.33, 0.844],
    'АкБарс': [0.42, -0.34, -0.234],
    'Собянин': [0.4673, 0.7, 0.2245],
}

In [None]:
# Dot product of two of the toy vectors as a similarity score.
np.array(words['Москва']).dot(np.array(words['Собянин']))

In [None]:
from gensim.models import word2vec

In [None]:
from gensim.models import fasttext

In [None]:
# Tokenised training corpus: one list of whitespace-separated tokens per row.
a = train_data['normalized_text'].str.split()

In [None]:
%%time

# Train 50-dimensional Word2Vec embeddings (window of 3, 2 worker threads)
# and build a plain token -> vector dict from the learned vocabulary.
# NOTE(review): `size`, `index2word` and `syn0` look like gensim < 4.0 names
# (4.x uses `vector_size`, `index_to_key`, `vectors`) — confirm the pinned
# gensim version before upgrading.
model = word2vec.Word2Vec(a, size=50, window=3, workers=2)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

In [None]:
# Persist the trained model in the current working directory.
model.save('./model.w2v')

In [None]:
# Vector of a single token from the dict built above.
w2v['iphone']

In [None]:
# Nearest neighbours of the token in embedding space. Queried through the
# KeyedVectors object (`model.wv`): the same method on the model itself is
# deprecated and no longer exists in gensim 4.
model.wv.most_similar('iphone')

In [None]:
# Nearest neighbours of the token in embedding space. Queried through the
# KeyedVectors object (`model.wv`): the same method on the model itself is
# deprecated and no longer exists in gensim 4.
model.wv.most_similar('картина')

In [None]:
# Token lists of the first two training rows, used by the demo loop that follows.
# NOTE(review): `_` is also IPython's "last output" variable — consider a
# descriptive name.
_ = train_data['normalized_text'].head(2).str.split().values

In [None]:
# For each token list in `_`, sum the embeddings of the tokens the model
# knows; unknown tokens are skipped, so a row with no known tokens stays zero.
# Membership tests and lookups go through `model.wv`, because doing them on
# the Word2Vec model itself is deprecated and removed in gensim 4.
l = []
for i in _:
    vec = np.zeros(50)  # must match the dimensionality the model was trained with
    for j in i:
        if j in model.wv:
            vec += model.wv[j]
    l.append(vec)

In [None]:
def get_vector(dataset, model, size=None):
    """Turn each text into the sum of the embeddings of its known tokens.

    Args:
        dataset: pandas Series of strings; each string is split on whitespace.
        model: Source of word vectors. If it has a ``wv`` attribute (a gensim
            Word2Vec-style model) lookups go through ``model.wv``; otherwise
            the object itself is used, so KeyedVectors or a plain
            ``{token: vector}`` mapping also work.
        size: Dimensionality of the vectors. When omitted, the ``vector_size``
            attribute of the vector store is used if it is set, falling back
            to 50 (the value that used to be hard-coded).

    Returns:
        A float ``numpy.ndarray`` of shape ``(len(dataset), size)``. Rows with
        no known tokens are all zeros; an empty ``dataset`` gives a
        ``(0, size)`` array.
    """
    # Word2Vec models expose their vectors on `.wv`; indexing the model itself
    # is deprecated and removed in gensim 4.
    vectors = getattr(model, 'wv', model)
    if size is None:
        size = getattr(vectors, 'vector_size', None) or 50

    token_lists = dataset.str.split().values
    result = np.zeros((len(token_lists), size))
    # Each `row` is a view into `result`, so the in-place add fills the matrix.
    for row, tokens in zip(result, token_lists):
        for token in tokens:
            if token in vectors:
                row += vectors[token]
    return result

In [None]:
%%time

# Summed word-vector features for every training row.
train_vec = get_vector(train_data['normalized_text'], model)

In [None]:
# Size of the training feature matrix.
train_vec.shape

In [None]:
%%time

# Same features for the held-out fold, using the model trained on the
# training fold.
test_vec = get_vector(test_data['normalized_text'], model)

In [None]:
# Fresh classifier for this feature set.
lr = LogisticRegression(solver='lbfgs')

In [None]:
%%time
# Fit on the training fold.
lr.fit(train_vec, train_data.new_targ)

In [None]:
# Predict the held-out fold and report accuracy.
y_pred = lr.predict(test_vec)
accuracy_score(test_data.new_targ, y_pred)

> TODO сделать лучше и прислать ответ

##### fastText

чуть лучше, чем word2vec

In [None]:
# Character trigrams of a word: slide a 3-character window over it, one
# position at a time.
word = 'картина'
[word[start:start + 3] for start in range(len(word) - 2)]