In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
sns.set_style('ticks')

In [2]:
from nltk import ngrams
from collections import defaultdict
from gensim.models import Word2Vec

In [3]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression

In [4]:
# data import
train = pd.read_excel('semifinal_data/train.xlsx')
test = pd.read_excel('semifinal_data/test.xlsx')
descr = pd.read_excel('semifinal_data/description.xlsx')

In [5]:
# word info gathering
train['Full_descr'] = train['Shrt_Desc'].values + ' ' +\
    train['GmWt_Desc1'].fillna('').values + ' ' +\
    train['GmWt_Desc2'].fillna('').values

train = train.drop(['Shrt_Desc', 'GmWt_Desc1', 'GmWt_Desc2'], axis = 1)

test['Full_descr'] = test['Shrt_Desc'].values + ' ' +\
    test['GmWt_Desc1'].fillna('').values + ' ' +\
    test['GmWt_Desc2'].fillna('').values

test = test.drop(['Shrt_Desc', 'GmWt_Desc1', 'GmWt_Desc2'], axis = 1)

In [6]:
def prepare_description(description):
    word_form = description.lower().split()
        
    res = []
    for i in word_form:
        res += i.split(',')
        
    for i in range(len(res)):
        try_to_find = res[i].find('w/')
        if (try_to_find != -1):
            res[i] = res[i][:try_to_find] + res[i][try_to_find + 2:]
        
    return res

# lowering and deleting 'w/' substrings
word_information = train['Full_descr'].apply(prepare_description)

word_information_test = test['Full_descr'].apply(prepare_description)

In [7]:
text = []
for i in range(1, 14):
    with open(".\semifinal_files\\book_{}.txt".format(i), 'r') as file:
        text += [eval(file.read())]

# lowering and getting rid of short useless words, 
# which sometimes appear in this dataset
for i in range(len(text)):
    j = 0
    while j < len(text[i]):
        text[i][j] = text[i][j].lower()
        
        if len(text[i][j]) == 1:
            text[i].pop(j)
            j -= 1
        j += 1

In [10]:
cook_book = Word2Vec(text, min_count=1, size=15, workers=5,
                 window=4, sg = 2, iter = 5)

In [11]:
all_words = set()
for item in cook_book.wv.vocab:
    all_words.add(item)

Каждому типу продукта сопоставим слова из доступного словаря:\
1) Хлебобулочные изделия : 'bakery', 'bread', 'loaf', 'bun'\
2) Жидкость : 'liquid', 'water', 'juice', 'wine', 'cocktail'\
3) Молочная продукция : 'milk', 'dairy', 'yogurt', 'buttermilk'\
4) Мясная продукция : 'meat', 'lamb', 'pork', 'mutton'\
5) Овощи / фрукты : 'fruit', 'vegetable', 'apple', 'banana', 'tomato', 'cucumber'\

In [12]:
def get_voting(description, key_words, min_confidence = 0.1):
    votes = 0
    sum_confidence = 0
    for word in description:
        if word in all_words:
            for key_word in key_words:
                sim = cook_book.wv.similarity(word, key_word)
                if sim >= min_confidence: 
                    sum_confidence += sim
                    votes += 1
    if not votes:
        return 0
    return sum_confidence / votes

In [14]:
label = [[]]*6
label[1] = ['bakery', 'bread', 'loaf', 'bun', 'sugar']
label[2] = ['liquid', 'water', 'juice', 'wine', 'cocktail']
label[3] = ['milk', 'dairy', 'yogurt', 'buttermilk']
label[4] = ['meat', 'lamb', 'pork', 'mutton', 'bone', 'raw']
label[5] = ['fruit', 'vegetable', 'salad', 'exotic']

for i in range(1, 6):
    train['label_{}'.format(i)] = word_information.apply(lambda x : get_voting(x, label[i], 0.2))
    
for i in range(1, 6):
    test['label_{}'.format(i)] = word_information_test.apply(lambda x : get_voting(x, label[i], 0.2))

In [15]:
label_columns = ['label_{}'.format(i) for i in range(1, 6)]

In [16]:
train_res = train[label_columns].apply(lambda x : x.argmax(), axis=1)

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  """Entry point for launching an IPython kernel.


In [17]:
test_res = test[label_columns].apply(lambda x : x.argmax(), axis=1)

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  """Entry point for launching an IPython kernel.


In [19]:
test_res.to_csv('Pred_extra_1.csv')

  """Entry point for launching an IPython kernel.
