# RNN model

> Train an RNN (LSTM) model on the sequence of event ids in each game session.

This notebook is forked from https://www.kaggle.com/nikitagrec/bowl-lstm-prediction.

In [1]:
import pandas as pd
import numpy as np
import json
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.ndimage.filters import gaussian_filter
import warnings
import random
import plotly.express as px
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment and library
# deprecation warnings raised by the cells below.
warnings.filterwarnings("ignore")
# seaborn styling for plots; my_pal is not referenced in the visible cells.
sns.set_style("whitegrid")
my_pal = sns.color_palette(n_colors=10)

from pathlib import Path
# Directory (relative to the working directory) that holds the input CSV files.
data_path = Path('data/')

## inputs

In [2]:
SEED = 42  # fixed seed so the same rows are sampled on every run
n = 11341042 #number of data records in train.csv (header line not counted)
s = 2000000 #desired sample size
# Line numbers of train.csv to skip so that exactly `s` data rows remain.
# Line 0 is the header and is never skipped; data rows are lines 1..n.
# Drawing from 1..n (instead of 0..n-1) keeps the header intact and gives the
# last row the same chance of being dropped as every other row.
# (Original author's note: this sampling step may be unnecessary.)
skip = np.sort(
    np.random.RandomState(SEED).choice(n, size=n - s, replace=False) + 1
).tolist()

In [3]:
# Column names applied after loading, so they do not depend on which physical
# line read_csv ended up treating as the header once `skip` has been applied.
train_column_names = [
    'event_id', 'game_session', 'timestamp', 'event_data',
    'installation_id', 'event_count', 'event_code', 'game_time',
    'title', 'type', 'world',
]
# `skip` is a list of 0-indexed line numbers that read_csv leaves out.
train = pd.read_csv(data_path / 'train.csv', skiprows=skip)
train.columns = train_column_names

In [4]:
# Load the full test events and the per-session training labels.
test = pd.read_csv(data_path.joinpath('test.csv'))
labels = pd.read_csv(data_path.joinpath('train_labels.csv'))

In [5]:
# Template of the expected submission file (installation_id, accuracy_group).
sample_submission = pd.read_csv(data_path.joinpath("sample_submission.csv"))

In [6]:
# Report the (rows, columns) of each loaded frame.
print(train.shape, test.shape, labels.shape)
# The Chinese suffix reads "is the size of the training set"; it is printed
# verbatim after the number of label rows.
print(f"{labels.shape[0]}是训练集的size")

(2000000, 11) (1156414, 11) (17690, 7)
17690是训练集的size


In [7]:
# Show the column index of every input frame, one after another.
for frame in (train, labels, test, sample_submission):
    print(frame.columns)

Index(['event_id', 'game_session', 'timestamp', 'event_data',
       'installation_id', 'event_count', 'event_code', 'game_time', 'title',
       'type', 'world'],
      dtype='object')
Index(['game_session', 'installation_id', 'title', 'num_correct',
       'num_incorrect', 'accuracy', 'accuracy_group'],
      dtype='object')
Index(['event_id', 'game_session', 'timestamp', 'event_data',
       'installation_id', 'event_count', 'event_code', 'game_time', 'title',
       'type', 'world'],
      dtype='object')
Index(['installation_id', 'accuracy_group'], dtype='object')


### LSTM Experiments

In [8]:
from more_itertools import sliced
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.callbacks.callbacks import EarlyStopping

Using TensorFlow backend.


In [9]:
# Attach the sampled training events to every labelled session.
# how='right' keeps every row of `labels`; each label row fans out to one row
# per matching event (one-to-many match).
full = train.merge(labels, how='right', on=['installation_id','game_session'])
# Keep only the session keys and the event id. The explicit copy makes the
# assignment below write to an independent frame instead of a slice of `full`
# (the slice assignment triggers pandas' SettingWithCopyWarning).
train_ls = full[['installation_id','game_session','event_id']].copy()
# Convert event ids to str.
# NOTE(review): label rows with no matching event in the sampled `train` get a
# missing event_id from the right merge, which becomes the literal string
# 'nan' here and later acts as a token — confirm this is intended.
train_ls['event_id'] = train_ls['event_id'].apply(str)

In [10]:
# Drop the name bound to the raw event frame; the visible cells below do not
# use it again. After this runs, the merge cell that reads `train` cannot be
# re-run without reloading the CSV.
del train

In [11]:
# export
def events_all(aa):
    """Join event-id strings into one space-separated string.

    Parameters
    ----------
    aa : iterable of str
        Event ids, in the order they should appear in the output.

    Returns
    -------
    str
        The ids separated by single spaces, with trailing whitespace
        removed. An empty iterable gives ``''``.

    Raises
    ------
    TypeError
        If any element is not a string.
    """
    # str.join builds the result in one pass instead of repeated `+=`;
    # rstrip keeps the original handling of trailing empty/blank items.
    return ' '.join(aa).rstrip()

In [13]:
# Collect the event ids of each (installation_id, game_session) into a list,
# in row order. Aggregating to a list directly avoids concatenating all ids
# into one string and cutting it back into 8-character pieces, which would
# silently produce wrong tokens for any id that is not exactly 8 characters.
result = (
    train_ls.groupby(['installation_id','game_session'])['event_id']
    .agg(list)
    .reset_index()
)
# One space-separated "sentence" of event ids per session, for the tokenizer.
result['new_event'] = result['event_id'].apply(' '.join)
# Re-attach the target; how='right' keeps one output row per label row.
result = result.merge(labels, how='right', on=['installation_id','game_session'])[['new_event','accuracy_group']]

In [14]:
result.head() # new_event is a sequence feature (space-separated event ids)

Unnamed: 0,new_event,accuracy_group
0,a1e4395d 7da34a02 fbaf3456 fbaf3456,3
1,1375ccb7 e37a2b78 4a4c3d21 a16a373e e37a2b78 4...,0
2,3bfd1a65 28ed704e c74f40cd c7128948 7da34a02 f...,3
3,3bfd1a65 a1e4395d 9d29771f 28ed704e a1e4395d a...,2
4,4a4c3d21 8fee50e2 8fee50e2 f6947f54,3


In [15]:
# (rows, columns) of the modelling table: new_event and accuracy_group.
result.shape

(17690, 2)

In [17]:
# Persist the per-session event strings. Create the target directory first so
# the cell also works on a fresh checkout where output/ does not exist yet.
Path("output").mkdir(parents=True, exist_ok=True)
result[['new_event']].to_csv("output/event_features.csv", index = False)

In [18]:
# Vocabulary cap passed to the Tokenizer (most frequent tokens are kept) and
# used as the Embedding input dimension.
MAX_NB_WORDS = 100
# Length every session's event sequence is padded/truncated to.
MAX_SEQUENCE_LENGTH = 500
# Size of each embedding vector.
EMBEDDING_DIM = 100
# Raw string: same filter characters as before, without the invalid '\]'
# escape sequence that a normal string literal would contain.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
# Learn the token -> integer index mapping from the event "sentences".
tokenizer.fit_on_texts(result['new_event'].values)
word_index = tokenizer.word_index
# NOTE(review): if this count ever reaches MAX_NB_WORDS, the least frequent
# tokens are dropped by texts_to_sequences — raise the cap in that case.
print('Found %s unique tokens.' % len(word_index))

Found 98 unique tokens.


In [19]:
# Map every session's event string to a list of integer token ids ...
event_sequences = tokenizer.texts_to_sequences(result['new_event'].values)
# ... and bring all of them to the common length MAX_SEQUENCE_LENGTH.
X = pad_sequences(event_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (17690, 500)


In [20]:
# One-hot encode the target: one indicator column per distinct accuracy_group
# value. The column position is later used as the predicted class.
accuracy_one_hot = pd.get_dummies(result['accuracy_group'])
Y = accuracy_one_hot.values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (17690, 4)


In [21]:
# Hold out 10% of the sessions for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42
)
for features, targets in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, targets.shape)

(15921, 500) (15921, 4)
(1769, 500) (1769, 4)


In [None]:
# Training hyper-parameters.
epochs = 5
batch_size = 64

# Embedding -> single LSTM layer -> 4-way softmax (one unit per column of Y).
model = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
    LSTM(50, dropout=0.2, recurrent_dropout=0.2),
    Dense(4, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop early when val_loss has not improved by at least min_delta for
# `patience` consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.001)
# The last 10% of the training data is held back as the validation split.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Train on 14328 samples, validate on 1593 samples
Epoch 1/5
  896/14328 [>.............................] - ETA: 1:53 - loss: 1.3001 - accuracy: 0.4922

In [None]:
# model.evaluate returns [loss, accuracy] for the compiled loss and metric.
# Metrics on the data the model was fitted on (previously mislabelled "Test set").
accr = model.evaluate(X_train,Y_train)
print('Train set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
# Metrics on the 10% hold-out split.
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

没有过拟合

In [None]:
# Loss per epoch as recorded by model.fit.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
# val_loss is measured on the validation_split held out inside fit(), not on
# X_test, so it is labelled as validation.
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

### Test prediction:

In [None]:
session_keys = ['installation_id', 'game_session']
# Final row of every installation in file order, reduced to its session keys.
# NOTE(review): this presumably relies on test.csv being ordered by time within
# each installation — confirm.
last_test = test.groupby(['installation_id']).tail(1)[session_keys]
# Keep all events that belong to that last session of each installation.
test_ = pd.merge(test, last_test, how='inner', on=session_keys)

In [None]:
# Session keys plus event id; the explicit copy makes the assignment below
# write to an independent frame instead of a slice of `test_`.
test_ls = test_[['installation_id','game_session','event_id']].copy()
test_ls['event_id'] = test_ls['event_id'].apply(str)
# Collect the event ids of each session into a list, in row order. Aggregating
# to a list directly avoids concatenating all ids and re-slicing them into
# 8-character pieces, which breaks for any id that is not exactly 8 characters.
res_test = (
    test_ls.groupby(['installation_id','game_session'])['event_id']
    .agg(list)
    .reset_index()
)
# Same space-separated format the tokenizer was fitted on.
res_test['new_event'] = res_test['event_id'].apply(' '.join)

In [None]:
# Encode the test sessions with the tokenizer fitted on the training text and
# bring them to the same length as the training sequences.
test_sequences = tokenizer.texts_to_sequences(res_test['new_event'].values)
X_ts = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_ts.shape)

In [None]:
# Model output for every row of X_ts; the final layer is a 4-unit softmax,
# so each row holds one score per class.
test_pred = model.predict(X_ts)

In [None]:
# Column label (0..3) of the highest score in each prediction row.
predicted_group = pd.DataFrame(test_pred).idxmax(axis=1)
# Pair each installation with its predicted class; both pieces carry the same
# default integer index, so concat lines them up row by row.
submission = pd.concat([res_test['installation_id'], predicted_group], axis=1)
submission.columns = ['installation_id','accuracy_group']

In [None]:
# Write the submission file without the index column, then preview it.
submission.to_csv('submission.csv', index=False)
submission.head()

In [None]:
# Histogram of the predicted accuracy_group values.
submission['accuracy_group'].hist();

## save model

In [None]:
# Display the class of the trained model object (inspection only).
model.__class__

In [None]:
# Save the trained model to an HDF5 file. Create the target directory first so
# the save does not fail when model/ does not exist yet.
# NOTE(review): the file name spells "ltsm" (not "lstm"); it is left unchanged
# because other code may load the model by this exact path.
Path("model").mkdir(parents=True, exist_ok=True)
model.save("model/keras-ltsm-1.0.1.h5")

## save embedding

In [None]:
# Shapes of the padded token matrix and the one-hot label matrix about to be saved.
print(X.shape,Y.shape)

In [None]:
# Peek at the first 5 rows and first 5 columns of X.
X[0:5,0:5]

In [None]:
# Peek at the first 5 one-hot label rows.
Y[0:5]

In [None]:
# Save the padded token-id matrix X and the one-hot labels Y; np.save appends
# the ".npy" extension. Create the target directory first so the cell also
# works when output/ does not exist yet.
Path("output").mkdir(parents=True, exist_ok=True)
np.save("output/embedding-train-x", X)
np.save("output/embedding-train-y", Y)