In [4]:
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import SGD

In [5]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import scipy as sc

In [6]:
df = pd.read_csv("./../../train_df.csv", index_col=0)


In [7]:
class BagOfWordCoder():
    vectorizer = TfidfVectorizer()
    def __init__(self, path_to_corpus):
        if not path_to_corpus.endswith(".vec"):
            raise Exception(f"Need to be *.vec file")
        with open(path_to_corpus, "rb") as f:
            self.vectorizer = pickle.load(f)
    
    def vectorize(self, text):
        return self.vectorizer.transform(text)
    
    def get_text(self, matrix):
        return self.vectorizer.inverse_transform(matrix)
    

In [8]:
coder = BagOfWordCoder("./../vectorizer.vec")

In [9]:
len(coder.vectorizer.vocabulary_.keys())

40397

In [10]:
dict(zip(df["position"].unique(), list(range(3))))

{'Официант': 0, 'Кладовщик': 1, 'Водитель погрузчика': 2}

In [11]:
df["position"] = df["position"].map(dict(zip(df["position"].unique(), list(range(3)))))

In [12]:
def non_shuffling_train_test_split(X, y, test_size=0.2):
    X = X.iloc[::-1]
    y = y[::-1]
    i = int((1 - test_size) * X.shape[0]) + 1
    X_train, X_test = np.split(X, [i])
    y_train, y_test = np.split(y, [i])
    return X_train, X_test, y_train, y_test

In [13]:
x_train, x_test, y_train, y_test = non_shuffling_train_test_split(df.drop(['position'], axis=1), df['position'])

In [14]:
x_test = coder.vectorize(x_test['text'].values)
x_train = coder.vectorize(x_train['text'].values)

In [15]:
type(x_test.indices)

numpy.ndarray

In [16]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [17]:
y_test = np.reshape(y_test, (-1,1))
y_train = np.reshape(y_train, (-1,1))


In [18]:
y_train = sc.sparse.csr_matrix(y_train, dtype='float64')
y_test = sc.sparse.csr_matrix(y_test, dtype='float64')

In [19]:
y_test = y_test.todense()
y_train = y_train.todense()
x_test = x_test.todense()
x_train = x_train.todense()

In [20]:
y_test

matrix([[2.],
        [2.],
        [2.],
        ...,
        [0.],
        [0.],
        [0.]])

In [21]:
x_test

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
y_train

matrix([[1.],
        [1.],
        [1.],
        ...,
        [2.],
        [2.],
        [2.]])

In [23]:
x_train

matrix([[0.03912517, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.03912517, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.03912517, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [31]:
model = Sequential()
model.add(Dense(256, input_shape=(len(coder.vectorizer.vocabulary_.keys()),), activation="sigmoid"))
model.add(Dense(128, activation="sigmoid"))
model.add(Dense(64, activation="sigmoid"))
model.add(Dense(df['position'].nunique(), activation="softmax"))

In [32]:
INIT_LR = 0.01
EPOCHS = 6

opt = SGD(lr=INIT_LR)

model.compile(loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

In [33]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 256)               10341888  
_________________________________________________________________
dense_9 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 195       
Total params: 10,383,235
Trainable params: 10,383,235
Non-trainable params: 0
_________________________________________________________________


In [34]:
model.fit(x_train, y_train, epochs=EPOCHS, batch_size=4)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x203e759a5c0>

In [35]:
model.save("model.h5")