In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import keras as K
from sklearn.base import BaseEstimator
import gc
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from scipy.spatial.distance import euclidean, cosine

%matplotlib inline  

Using TensorFlow backend.


### Define Constants

In [20]:
BATCH_SIZE = 1
VEC_SIMILARITY = 10

### Load Data

In [28]:
with open('resources/rusvectors/model.txt') as f:
    lines = f.readlines()
    embeddings = []
    word_to_index = {}
    index_to_word = {}
    for index, line in enumerate(lines[1:]):
        values = line.split(' ')
        word = values[0].split('_')[0]
        embedding = np.array(values[1:]).astype('float64')
        embeddings.append(embedding)
        word_to_index[word] = index
        index_to_word[index] = word
    embeddings = np.asarray(embeddings)
    

In [29]:
d = pd.DataFrame(word_to_index.values(), columns=['index'], index=word_to_index.keys())
d.to_csv('resources/word_to_index.csv')
d = pd.DataFrame(index_to_word.values(), columns=['word'], index=index_to_word.keys())
d.to_csv('resources/index_to_word.csv')
np.save('resources/embeddings.npy', embeddings)

In [3]:
embeddings = np.load('resources/embeddings.npy')
word_to_index = pd.read_csv('resources/word_to_index.csv', index_col=0)
index_to_word = pd.read_csv('resources/index_to_word.csv', index_col=0)

In [36]:
word_to_index.loc['филе']

index    42087
Name: филе, dtype: int64

In [4]:
def word2vec(word):
    try:
        index = word_to_index.loc[word]
        return embeddings[index]
    except:
        return np.random.randn(1, 300)

In [5]:
groceries = pd.read_csv('resources/groceries_rus.csv')
groceries.columns

Index(['Date', 'Food'], dtype='object')

In [6]:
stop_words = stopwords.words('russian')

In [21]:
def merge_vecs(vecs, threshold=VEC_SIMILARITY):
    merge_happened = False
    i = 0
    while i < len(vecs) - 1:
        j = i + 1
        while j < len(vecs):
            if euclidean(vecs[i], vecs[j]) < VEC_SIMILARITY:
                vecs[i] = np.mean(vecs[[i, j]], axis=0)
                vecs = np.delete(vecs, i, axis=0)
                merge_happened = True
            j += 1
        i += 1
                
    if merge_happened:
        vecs = merge_vecs(vecs, threshold)
    return vecs

In [25]:
def arg_max_word_match(vecs, new_vec, threshold=VEC_SIMILARITY):
    if len(vecs) == 0:
        return None
    distances = [euclidean(vec, new_vec) for vec in vecs]
    sorted = np.argsort(distances)
    if distances[sorted[0]] < threshold:
        return sorted[0]

In [31]:
def vec2word(vec, threshold=VEC_SIMILARITY):
    embedding = np.fromstring(vec)
    max_match_index = arg_max_word_match(embeddings, embedding, threshold)
    word = index_to_word.iloc[max_match_index]
    return word

In [27]:
groceries_encoded = {}
for index, row in groceries.iterrows():
    date = row.Date
    items = row.Food.lower().split(', ')
    def preprocess(item):
        words = item.split(' ')
        words = [i for i in words if (i not in stop_words)]
        vecs = np.asarray([word2vec(w) for w in words])
        vec = np.mean(vecs, axis=0)
        return vec
    vecs = np.asarray([preprocess(i) for i in items])
    vecs = np.squeeze(vecs, axis=1)
    vecs = merge_vecs(vecs)
    for vec in vecs:
        groceries_vecs = [np.fromstring(k) for k in groceries_encoded.keys()]
        max_match_index = arg_max_word_match(groceries_vecs, vec)
        if max_match_index is not None:
            max_match = groceries_vecs[max_match_index]
            t = groceries_encoded[max_match.tostring()]
            new_vec = np.mean([vec, max_match], axis=0)
            del groceries_encoded[max_match.tostring()]
            groceries_encoded[new_vec.tostring()] = t + [date]
        else:
            groceries_encoded[vec.tostring()] = [date]
                

  from ipykernel import kernelapp as app


In [39]:
embedding_str = list(groceries_encoded.keys())[9]
vec2word(embedding_str, 10)

  


None <class 'NoneType'>


TypeError: Cannot index by location index with a non-integer key

In [29]:
len(groceries_encoded)

70

In [None]:
groceries = pd.DataFrame(columns=total_groceries)
groceries_encoded = groceries.copy()
for row in groceries.iterrows():
    for line in row.items():
        words = line.split(' ')
        embedding = np.sum([word2vec(word) for word in words])
        groceries_encoded

### Generate Training data

In [62]:
valid_length = groceries_history_binary.shape[0] - (groceries_history_binary.shape[0] % BATCH_SIZE)

Y = groceries_history_binary.values[:valid_length]
X = np.random.uniform(1, 5, (Y.shape[0],)).astype('int8')
X = np.expand_dims(X, -1)
X = np.expand_dims(X, -1)

### Build Model

In [6]:
class RecommendingModel(BaseEstimator):
    
    def __init__(self, window=3):
        self.last_tendency = None
        self.window = window
    
    def fit(self, x, y=None):
        assert x.shape[1] == 1
        self.last_tendency = np.mean(x[-self.window:] if x.shape[0] > self.window else x)
        
    def predict(self, x):
        return x > self.last_tendency
        

In [86]:
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

In [88]:
model.fit(X, Y, BATCH_SIZE, 1, callbacks=[checkpoint])

Epoch 1/1
   1/9728 [..............................] - ETA: 2:04 - loss: 0.0963 - acc: 0.9765

  14/9728 [..............................] - ETA: 44s - loss: 0.0892 - acc: 0.9786 

  26/9728 [..............................] - ETA: 44s - loss: 0.0892 - acc: 0.9776

  38/9728 [..............................] - ETA: 43s - loss: 0.0977 - acc: 0.9757

  50/9728 [..............................] - ETA: 42s - loss: 0.0992 - acc: 0.9756

  63/9728 [..............................] - ETA: 41s - loss: 0.0956 - acc: 0.9764

  72/9728 [..............................] - ETA: 43s - loss: 0.0997 - acc: 0.9753

  81/9728 [..............................] - ETA: 44s - loss: 0.1022 - acc: 0.9746

  90/9728 [..............................] - ETA: 45s - loss: 0.1072 - acc: 0.9731

 104/9728 [..............................] - ETA: 43s - loss: 0.1090 - acc: 0.9723

 119/9728 [..............................] - ETA: 42s - loss: 0.1066 - acc: 0.9730

 128/9728 [..............................] - ETA: 43s - loss: 0.1035 - acc: 0.9740

 141/9728 [..............................] - ETA: 43s - loss: 0.1028 - acc: 0.9740

 154/9728 [..............................] - ETA: 42s - loss: 0.1021 - acc: 0.9743

 167/9728 [..............................] - ETA: 42s - loss: 0.1039 - acc: 0.9737

 179/9728 [..............................] - ETA: 42s - loss: 0.1044 - acc: 0.9735

 187/9728 [..............................] - ETA: 43s - loss: 0.1038 - acc: 0.9736

 199/9728 [..............................] - ETA: 42s - loss: 0.1030 - acc: 0.9738

 211/9728 [..............................] - ETA: 42s - loss: 0.1035 - acc: 0.9737

 223/9728 [..............................] - ETA: 42s - loss: 0.1037 - acc: 0.9736

 237/9728 [..............................] - ETA: 42s - loss: 0.1033 - acc: 0.9738

 249/9728 [..............................] - ETA: 41s - loss: 0.1042 - acc: 0.9736

 262/9728 [..............................] - ETA: 41s - loss: 0.1048 - acc: 0.9733

 274/9728 [..............................] - ETA: 41s - loss: 0.1049 - acc: 0.9733

 287/9728 [..............................] - ETA: 41s - loss: 0.1057 - acc: 0.9729

 300/9728 [..............................] - ETA: 41s - loss: 0.1066 - acc: 0.9727

 312/9728 [..............................] - ETA: 41s - loss: 0.1057 - acc: 0.9730

 325/9728 [>.............................] - ETA: 40s - loss: 0.1053 - acc: 0.9731

 337/9728 [>.............................] - ETA: 40s - loss: 0.1053 - acc: 0.9731

 350/9728 [>.............................] - ETA: 40s - loss: 0.1061 - acc: 0.9729

 365/9728 [>.............................] - ETA: 40s - loss: 0.1061 - acc: 0.9729

 378/9728 [>.............................] - ETA: 39s - loss: 0.1054 - acc: 0.9731

 392/9728 [>.............................] - ETA: 39s - loss: 0.1053 - acc: 0.9730

 407/9728 [>.............................] - ETA: 39s - loss: 0.1049 - acc: 0.9731

 421/9728 [>.............................] - ETA: 39s - loss: 0.1054 - acc: 0.9729

 435/9728 [>.............................] - ETA: 38s - loss: 0.1059 - acc: 0.9727

 450/9728 [>.............................] - ETA: 38s - loss: 0.1059 - acc: 0.9726

 467/9728 [>.............................] - ETA: 38s - loss: 0.1061 - acc: 0.9726

 483/9728 [>.............................] - ETA: 37s - loss: 0.1070 - acc: 0.9723

 500/9728 [>.............................] - ETA: 37s - loss: 0.1070 - acc: 0.9722

 517/9728 [>.............................] - ETA: 37s - loss: 0.1066 - acc: 0.9724

 534/9728 [>.............................] - ETA: 36s - loss: 0.1056 - acc: 0.9727

 551/9728 [>.............................] - ETA: 36s - loss: 0.1050 - acc: 0.9729

 567/9728 [>.............................] - ETA: 36s - loss: 0.1045 - acc: 0.9730

 580/9728 [>.............................]

 - ETA: 36s - loss: 0.1040 - acc: 0.9732



 594/9728 [>.............................] - ETA: 35s - loss: 0.1040 - acc: 0.9732

 608/9728 [>.............................] - ETA: 35s - loss: 0.1038 - acc: 0.9733

 625/9728 [>.............................] - ETA: 35s - loss: 0.1031 - acc: 0.9735

 641/9728 [>.............................] - ETA: 35s - loss: 0.1032 - acc: 0.9734

 658/9728 [=>............................] - ETA: 35s - loss: 0.1029 - acc: 0.9734

 674/9728 [=>............................] - ETA: 34s - loss: 0.1034 - acc: 0.9732

 689/9728 [=>............................] - ETA: 34s - loss: 0.1036 - acc: 0.9731

 706/9728 [=>............................] - ETA: 34s - loss: 0.1039 - acc: 0.9730



 723/9728 [=>............................] - ETA: 34s - loss: 0.1037 - acc: 0.9731

 738/9728 [=>............................] - ETA: 34s - loss: 0.1035 - acc: 0.9731

 754/9728 [=>............................] - ETA: 33s - loss: 0.1032 - acc: 0.9732

 769/9728 [=>............................] - ETA: 33s - loss: 0.1029 - acc: 0.9733

 784/9728 [=>............................] - ETA: 33s - loss: 0.1026 - acc: 0.9734

 800/9728 [=>............................] - ETA: 33s - loss: 0.1026 - acc: 0.9734

 815/9728 [=>............................] - ETA: 33s - loss: 0.1027 - acc: 0.9734

 831/9728 [=>............................] - ETA: 33s - loss: 0.1027 - acc: 0.9734

 847/9728 [=>............................] - ETA: 33s - loss: 0.1028 - acc: 0.9734

 862/9728 [=>............................] - ETA: 32s - loss: 0.1023 - acc: 0.9735

 879/9728 [=>............................] - ETA: 32s - loss: 0.1028 - acc: 0.9733

 896/9728 [=>............................] - ETA: 32s - loss: 0.1021 - acc: 0.9736

 914/9728 [=>............................] - ETA: 32s - loss: 0.1022 - acc: 0.9736

 930/9728 [=>............................] - ETA: 32s - loss: 0.1017 - acc: 0.9737

 946/9728 [=>............................] - ETA: 32s - loss: 0.1024 - acc: 0.9735

 964/9728 [=>............................] - ETA: 31s - loss: 0.1023 - acc: 0.9735

 981/9728 [==>...........................] - ETA: 31s - loss: 0.1023 - acc: 0.9735



 998/9728 [==>...........................] - ETA: 31s - loss: 0.1022 - acc: 0.9736

1014/9728 [==>...........................] - ETA: 31s - loss: 0.1019 - acc: 0.9737

1031/9728 [==>...........................] - ETA: 31s - loss: 0.1016 - acc: 0.9738

1048/9728 [==>...........................] - ETA: 31s - loss: 0.1013 - acc: 0.9739

1066/9728 [==>...........................] - ETA: 31s - loss: 0.1012 - acc: 0.9739

1084/9728 [==>...........................] - ETA: 30s - loss: 0.1013 - acc: 0.9739

1102/9728 [==>...........................] - ETA: 30s - loss: 0.1008 - acc: 0.9741

1120/9728 [==>...........................] - ETA: 30s - loss: 0.1008 - acc: 0.9741

1138/9728 [==>...........................] - ETA: 30s - loss: 0.1011 - acc: 0.9740

1156/9728 [==>...........................] - ETA: 30s - loss: 0.1013 - acc: 0.9739

1174/9728 [==>...........................] - ETA: 30s - loss: 0.1015 - acc: 0.9739

1192/9728 [==>...........................] - ETA: 29s - loss: 0.1018 - acc: 0.9738

1210/9728 [==>...........................] - ETA: 29s - loss: 0.1015 - acc: 0.9739

1228/9728 [==>...........................] - ETA: 29s - loss: 0.1015 - acc: 0.9739



1246/9728 [==>...........................] - ETA: 29s - loss: 0.1016 - acc: 0.9739

1263/9728 [==>...........................] - ETA: 29s - loss: 0.1015 - acc: 0.9739

1281/9728 [==>...........................] - ETA: 29s - loss: 0.1018 - acc: 0.9738

1300/9728 [===>..........................] - ETA: 29s - loss: 0.1015 - acc: 0.9739

1318/9728 [===>..........................] - ETA: 28s - loss: 0.1014 - acc: 0.9739

1336/9728 [===>..........................] - ETA: 28s - loss: 0.1013 - acc: 0.9739

1354/9728 [===>..........................] - ETA: 28s - loss: 0.1010 - acc: 0.9740

1372/9728 [===>..........................] - ETA: 28s - loss: 0.1009 - acc: 0.9740

1390/9728 [===>..........................] - ETA: 28s - loss: 0.1006 - acc: 0.9741

1408/9728 [===>..........................] - ETA: 28s - loss: 0.1005 - acc: 0.9742

1427/9728 [===>..........................] - ETA: 28s - loss: 0.1006 - acc: 0.9741

1445/9728 [===>..........................] - ETA: 28s - loss: 0.1006 - acc: 0.9741

1463/9728 [===>..........................] - ETA: 27s - loss: 0.1001 - acc: 0.9743

1481/9728 [===>..........................] - ETA: 27s - loss: 0.1000 - acc: 0.9743

1499/9728 [===>..........................] - ETA: 27s - loss: 0.1001 - acc: 0.9743

1517/9728 [===>..........................] - ETA: 27s - loss: 0.1004 - acc: 0.9742

1536/9728 [===>..........................] - ETA: 27s - loss: 0.1006 - acc: 0.9741

1554/9728 [===>..........................] - ETA: 27s - loss: 0.1006 - acc: 0.9741

1572/9728 [===>..........................] - ETA: 27s - loss: 0.1007 - acc: 0.9741

1590/9728 [===>..........................] - ETA: 27s - loss: 0.1009 - acc: 0.9741

1609/9728 [===>..........................] - ETA: 27s - loss: 0.1010 - acc: 0.9740

1627/9728 [====>.........................] - ETA: 26s - loss: 0.1012 - acc: 0.9740

1645/9728 [====>.........................] - ETA: 26s - loss: 0.1011 - acc: 0.9740

1663/9728 [====>.........................] - ETA: 26s - loss: 0.1010 - acc: 0.9740

1681/9728 [====>.........................] - ETA: 26s - loss: 0.1008 - acc: 0.9741

1699/9728 [====>.........................] - ETA: 26s - loss: 0.1007 - acc: 0.9741

1717/9728 [====>.........................] - ETA: 26s - loss: 0.1006 - acc: 0.9742

1736/9728 [====>.........................] - ETA: 26s - loss: 0.1004 - acc: 0.9742

1756/9728 [====>.........................] - ETA: 26s - loss: 0.1003 - acc: 0.9742

1775/9728 [====>.........................] - ETA: 26s - loss: 0.1002 - acc: 0.9743

1793/9728 [====>.........................] - ETA: 25s - loss: 0.1002 - acc: 0.9743

1812/9728 [====>.........................] - ETA: 25s - loss: 0.1000 - acc: 0.9743

1831/9728 [====>.........................] - ETA: 25s - loss: 0.1000 - acc: 0.9743

1850/9728 [====>.........................] - ETA: 25s - loss: 0.0999 - acc: 0.9744

1868/9728 [====>.........................] - ETA: 25s - loss: 0.0999 - acc: 0.9744

1886/9728 [====>.........................] - ETA: 25s - loss: 0.0997 - acc: 0.9744

1904/9728 [====>.........................] - ETA: 25s - loss: 0.0997 - acc: 0.9745

1922/9728 [====>.........................] - ETA: 25s - loss: 0.1000 - acc: 0.9744

1940/9728 [====>.........................] - ETA: 25s - loss: 0.0999 - acc: 0.9744

1958/9728 [=====>........................] - ETA: 25s - loss: 0.0998 - acc: 0.9744

1976/9728 [=====>........................] - ETA: 25s - loss: 0.0998 - acc: 0.9744

1995/9728 [=====>........................] - ETA: 24s - loss: 0.0997 - acc: 0.9745

2013/9728 [=====>........................] - ETA: 24s - loss: 0.0999 - acc: 0.9744

2031/9728 [=====>........................] - ETA: 24s - loss: 0.0998 - acc: 0.9744

2049/9728 [=====>........................] - ETA: 24s - loss: 0.0996 - acc: 0.9745

2067/9728 [=====>........................] - ETA: 24s - loss: 0.0996 - acc: 0.9745

2085/9728 [=====>........................] - ETA: 24s - loss: 0.0995 - acc: 0.9745

2103/9728 [=====>........................] - ETA: 24s - loss: 0.0993 - acc: 0.9746

2120/9728 [=====>........................] - ETA: 24s - loss: 0.0993 - acc: 0.9746

2138/9728 [=====>........................] - ETA: 24s - loss: 0.0994 - acc: 0.9745

2156/9728 [=====>........................] - ETA: 24s - loss: 0.0995 - acc: 0.9745

2174/9728 [=====>........................] - ETA: 24s - loss: 0.0993 - acc: 0.9746

2192/9728 [=====>........................] - ETA: 24s - loss: 0.0994 - acc: 0.9746

2210/9728 [=====>........................] - ETA: 23s - loss: 0.0992 - acc: 0.9746

2228/9728 [=====>........................] - ETA: 23s - loss: 0.0992 - acc: 0.9746

2246/9728 [=====>........................] - ETA: 23s - loss: 0.0994 - acc: 0.9746

2264/9728 [=====>........................] - ETA: 23s - loss: 0.0994 - acc: 0.9746





















































































































































































































































































































































































































































































































































































































































































































































































































































<tensorflow.python.keras.callbacks.History at 0x135a44668>

### Assessment

In [99]:
def generate_purchase_list(model, encoder, period=2):
    x = np.ones((BATCH_SIZE, 1, 1)) * period
    probs = model.predict(x)
    thresh = probs > 0.7
    thresh = np.squeeze(thresh)
    indices = np.argwhere(thresh == 1)
    decoded_groceries = encoder.inverse_transform(indices)
    return decoded_groceries

In [124]:
generate_purchase_list(model, grocery_encoder, 4)

array(['bottled beer', 'bottled water', 'brandy', 'chicken', 'chocolate',
       'coffee', 'condensed milk', 'cream cheese ', 'finished products',
       'frozen chicken', 'frozen dessert', 'frozen vegetables',
       'fruit/vegetable juice', 'hard cheese', 'hygiene articles',
       'ice cream', 'instant coffee', 'jam', 'liquor (appetizer)',
       'misc. beverages', 'onions', 'packaged fruit/vegetables', 'pasta',
       'pastry', 'pot plants', 'red/blush wine', 'root vegetables', 'rum',
       'softener', 'specialty bar', 'white bread', 'white wine',
       'whole milk', 'yogurt', 'zwieback'], dtype='<U25')