# Loading libraries

In [200]:
import os
import string
import annoy
import numpy as np
import codecs
import matplotlib.pyplot as plt
import pandas as pd
import emoji
import pickle

from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from gensim.models import Word2Vec

from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from tqdm.notebook import tqdm


# Loading data

## Products

In [165]:
# Load the products table directly from the zipped CSV archive.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
products = pd.read_csv('/home/gor/Downloads/ProductsDataset.zip', compression='zip')

## Speaker data

In [None]:
# Convert the raw "Otvety.txt" dump into tab-separated "question<TAB>answer" lines.
# A line starting with "---" opens a new record. Inside a record the first line is
# taken as the question and the second line as its answer; any further lines of the
# record are skipped.
question = None
written = False

with codecs.open("prepared_answers.txt", "w", "utf-8") as fout:
    with codecs.open("Otvety.txt", "r", "utf-8") as fin:
        for line in tqdm(fin):
            if line.startswith("---"):
                # New record: also forget a question that never received an answer.
                # Without this reset the stale question would be paired with the
                # next record's question line, and that record's real answer lost.
                written = False
                question = None
                continue
            if written:
                # The pair for this record is already on disk; skip extra answers.
                continue
            if question is None:
                question = line.strip()
                continue
            # Tabs inside the texts are replaced so that they cannot break the TSV layout.
            fout.write(question.replace("\t", " ").strip() + "\t" + line.replace("\t", " "))
            written = True
            question = None

In [None]:
# Load the prepared question/answer pairs for the speaker into a pd.DataFrame
speaker_data = pd.read_csv(
    '/home/gor/Downloads/prepared_answers.txt',
    sep='\t',
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
    # 'warn' keeps the previous behaviour: malformed rows are skipped with a warning.
    on_bad_lines='warn',
    names=['question', 'answer']
    )
# Drop rows where the question or the answer is missing
speaker_data.dropna(inplace=True)

# First step - creating the "speaker / product recommendation" classifier

## Preparing data for the modeling

In [167]:
morph = MorphAnalyzer()
stop_words = get_stop_words('ru')
punc_s = list(string.punctuation)

# Lookup structures built once, so that the per-character and per-token checks in
# preprocessing_text do not scan the lists above for every single element.
_PUNCT_TABLE = str.maketrans('', '', ''.join(punc_s))
_STOP_WORD_SET = frozenset(stop_words)

def preprocessing_text(text):
    """Return the list of lemmas of the words in `text`.

    Emoji and the characters of `string.punctuation` are removed, the rest is
    split on whitespace, every token is lower-cased and replaced by the normal
    form of its first `morph.parse` result, and lemmas listed in `stop_words`
    are dropped.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        Lemmas in their original order.
    """
    text = emoji.replace_emoji(text, replace='')
    tokens = text.translate(_PUNCT_TABLE).split()
    lemmas = (morph.parse(token.lower())[0].normal_form for token in tokens)
    return [lemma for lemma in lemmas if lemma not in _STOP_WORD_SET]

In [178]:
# Speaker questions form class 0 of the upcoming classifier
questions_df = speaker_data['question'].to_frame().assign(label=0)

# Product titles and descriptions, stacked into one text column, form class 1
products_n_desc = (
    pd.concat([products['title'], products['descrirption']], ignore_index=True)
    .to_frame(name='question')
    .assign(label=1)
)

# Stack both classes and discard rows with missing values
common_data = pd.concat(
    [questions_df, products_n_desc],
    axis=0,
    join='outer',
    ignore_index=True,
).dropna()
# Replace every text with its list of lemmas
common_data['question'] = common_data['question'].apply(preprocessing_text)
# Token lists in row order; they serve as the Word2Vec training corpus
to_vectorize = common_data['question'].tolist()
# Word2Vec model used later to vectorize the texts
w2v_common_model = Word2Vec(sentences=to_vectorize, vector_size=100, min_count=1)

In [202]:
# Persist the preprocessed frame (written with a header row, without the index).
# NOTE(review): the `question` column holds lists here; CSV stores their string form.
common_data.to_csv('/home/gor/Desktop/Projects/common_data.csv', index=False)

In [11]:
# Reload the cached model and vectors instead of recomputing them.
# NOTE(review): `common_data` must already exist in the kernel at this point;
# its CSV load is commented out on the next line.
#common_data = pd.read_csv('/home/gor/Downloads/common_data_v2.csv', dtype=list)
w2v_common_model = Word2Vec.load('/home/gor/Downloads/common_model_v2.model')
to_vectorize =  common_data.question.tolist()
vectorized_df = pd.read_csv('/home/gor/Downloads/vectorized_v2.csv')
# Attach the labels by position. `vectorized_df` has a fresh 0..n-1 index, while
# `common_data` may carry a gapped index (rows removed by `dropna` without a
# reset); index-aligned assignment would then mislabel rows and insert NaN.
vectorized_df['label'] = common_data['label'].to_numpy()

In [None]:
# Load the cached common frame with explicit column names and dtypes.
# NOTE(review): `names=` without `header=0` treats the first line of the file as
# data; if the CSV was written with a header row, the 'label' -> int conversion
# will fail — confirm the layout of common_data_v2.csv.
# NOTE(review): 'question' is read as a plain string, not as a list of lemmas —
# confirm that downstream code expects that.
common_data = pd.read_csv(
    '/home/gor/Downloads/common_data_v2.csv',
    names=['question', 'label'],
    dtype={'question':'str', 'label':'int'}
    )

In [312]:
def get_vector(line, model=None, vector_len=100):
    """Return the mean word vector of the words in `line`.

    Parameters
    ----------
    line : iterable of str
        Tokens (lemmas) of one text.
    model : object with a `.wv` mapping, optional
        Word2Vec-like model used for the lookup. Defaults to the global
        `w2v_common_model`.
    vector_len : int, optional
        Dimensionality of the returned vector (100 by default).

    Returns
    -------
    numpy.ndarray
        Average of the vectors of all words known to the model, or a zero
        vector when no word of `line` is in the vocabulary.
    """
    if model is None:
        model = w2v_common_model

    n_known = 0
    vector = np.zeros(vector_len)

    for word in line:
        if word in model.wv:
            vector += model.wv[word]
            n_known += 1

    # Divide once, after the whole line has been summed. The division used to
    # sit inside the loop, which re-divided the running sum on every word and
    # did not produce the mean. Vectors cached from that version are different.
    if n_known > 0:
        vector = vector / n_known

    return vector

In [None]:
# Vectorize every tokenized text, keeping the row order of `to_vectorize`
vectorized_list = [get_vector(tokens) for tokens in to_vectorize]

vectorized_df = pd.DataFrame(vectorized_list)
# Attach the labels by position. `vectorized_df` has a fresh 0..n-1 index, while
# `common_data` may carry a gapped index (rows removed by `dropna` without a
# reset); index-aligned assignment would then mislabel rows and insert NaN.
vectorized_df['label'] = common_data['label'].to_numpy()

In [203]:
# Cache the feature matrix together with its `label` column (index is not written).
vectorized_df.to_csv('/home/gor/Desktop/Projects/vectorized.csv', index=False)

## Modeling the speaker / product recommendation classifier

In [13]:
# The target is the class label; every other column is a feature
y = vectorized_df['label']
X = vectorized_df.drop(columns='label')

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print('Done')
print(f'Train shape - {X_train.shape}')
print(f'Test shape - {X_test.shape}')

Done
Train shape - (986824, 100)
Test shape - (246706, 100)


In [32]:
# Elastic-net logistic regression (equal L1/L2 mix) fitted with the SAGA solver
lr = LogisticRegression(
    penalty='elasticnet',
    l1_ratio=0.5,
    solver='saga',
    warm_start=True,
)
lr.fit(X_train, y_train)

In [None]:
# Hard class predictions on the test split
predicted_test = lr.predict(X_test)
# A ROC curve needs a continuous score per sample. Feeding it the hard 0/1
# predictions yields a single operating point and a misleading AUC, so the
# predicted probability of class 1 is used instead.
positive_scores = lr.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, positive_scores)
roc_auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr, color='darkorange',
        label='ROC кривая (area = %0.2f)' % roc_auc)
# Diagonal reference line of a random classifier
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-кривая')
plt.legend(loc="lower right")
plt.show()

# Second step - creating models for the product recommendation system and the speaker

In [183]:
# Pull the already lemmatized speaker questions (label 0) out of the common frame
speaker_mask = common_data['label'] == 0
preprocessed_speaker = common_data.loc[speaker_mask, 'question'].reset_index(drop=True)
# Sanity check: one preprocessed question per row of `speaker_data`
len(preprocessed_speaker) == len(speaker_data)

In [292]:
# Build a product frame in which titles and descriptions share one text column
# (`title`): the frame is stacked twice, once per text source, and rows with any
# missing value are removed.

with_titles = products.drop(columns='descrirption')
with_desc = (
    products
    .drop(columns='title')
    .rename(columns={'descrirption': 'title'})
)
new_products = pd.concat([with_titles, with_desc]).dropna()
preprocessed_products = new_products['title'].apply(preprocessing_text)

In [301]:
# Preview the first rows of the combined title/description product frame
new_products.head()

Unnamed: 0,title,product_id,category_id,subcategory_id,properties,image_links
0,Юбка детская ORBY,58e3cfe6132ca50e053f5f82,22.0,2211,"{'detskie_razmer_rost': '81-86 (1,5 года)'}",http://cache3.youla.io/files/images/360_360/58...
1,Ботильоны,5667531b2b7f8d127d838c34,9.0,902,"{'zhenskaya_odezhda_tzvet': 'Зеленый', 'visota...",http://cache3.youla.io/files/images/360_360/5b...
2,Брюки,59534826aaab284cba337e06,9.0,906,{'zhenskaya_odezhda_dzhinsy_bryuki_tip': 'Брюк...,http://cache3.youla.io/files/images/360_360/59...
3,Продам детские шапки,57de544096ad842e26de8027,22.0,2217,"{'detskie_pol': 'Девочкам', 'detskaya_odezhda_...",http://cache3.youla.io/files/images/360_360/57...
4,Блузка,5ad4d2626c86cb168d212022,9.0,907,"{'zhenskaya_odezhda_tzvet': 'Синий', 'zhenskay...",http://cache3.youla.io/files/images/360_360/5a...


## Product and speaker indices

In [308]:
# One embedding model per corpus: product texts and speaker questions
product_model = Word2Vec(
    sentences=preprocessed_products,
    vector_size=100,
    min_count=1,
)
speaker_model = Word2Vec(
    sentences=preprocessed_speaker,
    vector_size=100,
    min_count=1,
)

In [294]:
def make_index(df, model, vector_len=100, par='angular', n_trees=10):
    """Build an Annoy index over the averaged word vectors of the rows of `df`.

    Every element of `df` is iterated as a sequence of tokens. The vectors of
    the tokens known to `model.wv` are averaged (a zero vector is used when
    none is known) and stored under consecutive item ids 0, 1, 2, ... in
    iteration order.

    Parameters
    ----------
    df : iterable of iterables of str
        Tokenized texts.
    model : object exposing `.wv`
        Word-vector model used for the lookup.
    vector_len : int
        Dimensionality of the vectors.
    par : str
        Metric passed to `annoy.AnnoyIndex`.
    n_trees : int
        Number of trees passed to `build`.

    Returns
    -------
    annoy.AnnoyIndex
        The built index.
    """
    index = annoy.AnnoyIndex(vector_len, par)

    for item_id, tokens in enumerate(tqdm(df)):
        total = np.zeros(vector_len)
        hits = 0
        for token in tokens:
            if token not in model.wv:
                continue
            total += model.wv[token]
            hits += 1

        # Average only when at least one token was found; otherwise keep zeros
        mean_vector = total / hits if hits > 0 else total
        index.add_item(item_id, mean_vector)

    index.build(n_trees)
    return index
    
# Build one nearest-neighbour index per corpus, each with its own embedding model.
# Item ids are row positions, so lookups must later use positional access.
product_index = make_index(df=preprocessed_products, model=product_model)
speaker_index = make_index(df=preprocessed_speaker, model=speaker_model)

  0%|          | 0/69064 [00:00<?, ?it/s]

In [295]:
#speaker_model.save('/home/gor/Desktop/Projects/Speaker_recommend_bot/speaker_model.model')
#product_model.save('/home/gor/Desktop/Projects/Speaker_recommend_bot/product_model.model')
#product_index.save('/home/gor/Desktop/Projects/Speaker_recommend_bot/product_index.ann')
#speaker_index.save('/home/gor/Desktop/Projects/Speaker_recommend_bot/speaker_index.ann')

True

## Function for answers

In [296]:
def classifier(inp_data):
    """Predict the class of a raw input text.

    The text is lemmatized with `preprocessing_text`, turned into a vector
    with `get_vector` and passed to the fitted `lr` model.

    Returns 0 for a speaker question and 1 for a product query.
    """
    tokens = preprocessing_text(inp_data)
    # `predict` expects a 2-D array, so the single vector becomes one row
    features = get_vector(tokens).reshape(1, -1)
    prediction = lr.predict(features)
    return prediction[0]

In [327]:
# Preview the product frame whose `product_id` column is used for the answers
new_products.head()

Unnamed: 0,title,product_id,category_id,subcategory_id,properties,image_links
0,Юбка детская ORBY,58e3cfe6132ca50e053f5f82,22.0,2211,"{'detskie_razmer_rost': '81-86 (1,5 года)'}",http://cache3.youla.io/files/images/360_360/58...
1,Ботильоны,5667531b2b7f8d127d838c34,9.0,902,"{'zhenskaya_odezhda_tzvet': 'Зеленый', 'visota...",http://cache3.youla.io/files/images/360_360/5b...
2,Брюки,59534826aaab284cba337e06,9.0,906,{'zhenskaya_odezhda_dzhinsy_bryuki_tip': 'Брюк...,http://cache3.youla.io/files/images/360_360/59...
3,Продам детские шапки,57de544096ad842e26de8027,22.0,2217,"{'detskie_pol': 'Девочкам', 'detskaya_odezhda_...",http://cache3.youla.io/files/images/360_360/57...
4,Блузка,5ad4d2626c86cb168d212022,9.0,907,"{'zhenskaya_odezhda_tzvet': 'Синий', 'zhenskay...",http://cache3.youla.io/files/images/360_360/5a...


In [331]:
def get_answer(question, model=product_model, index=product_index, path=new_products['product_id']):
    """Print the stored answer (or product id) closest to `question`.

    The question is first classified. For class 0 the speaker model, speaker
    index and speaker answers replace whatever was passed in; otherwise the
    given (by default product) resources are used. The question is then
    averaged into a single word vector and the nearest item of the index is
    looked up by position in `path`.

    Note: the default arguments are bound when the function is defined.
    The function prints the result and returns None.
    """
    if classifier(question) == 0:
        model, index, path = speaker_model, speaker_index, speaker_data['answer']

    tokens = preprocessing_text(question)
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]

    vector = np.zeros(100)
    for word_vector in known_vectors:
        vector += word_vector
    # Average only when at least one token is in the vocabulary
    if len(known_vectors) > 0:
        vector = vector / len(known_vectors)

    nearest = index.get_nns_by_vector(vector, 1)
    best_position = nearest[0]

    print(path.iloc[best_position])

In [334]:
# Example query
get_answer('юбка orby')

1
58cd08378ae74be99ef57dfd




In [335]:
# Example query
get_answer('где ключи от танка')

0
Як. 


