In [1]:
# Load Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,Input,BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping,ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Read the three input CSVs; the paths are relative, so they are resolved
# against the notebook's working directory.
train, test, challenges = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "challenge_data.csv")
)
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [3]:
# Prediction targets: every training row whose challenge_sequence is above 10,
# reduced to (user_id, challenge) with the challenge column renamed to 'target'.
# Built as one chained expression so no filtered slice is modified in place.
target = (
    train.loc[train['challenge_sequence'] > 10, ['user_id', 'challenge']]
    .rename(columns={'challenge': 'target'})
)
target.head()

Unnamed: 0,user_id,target
10,4576,CI24958
11,4576,CI23667
12,4576,CI23691
23,4580,CI24915
24,4580,CI25727


In [4]:
# One row per user: the challenges with challenge_sequence <= 10, joined into
# a single space-separated string (a "sentence" of challenge IDs).
# The rows are sorted by challenge_sequence first so that the token order
# follows the sequence number instead of whatever row order the CSV has.
data = (
    train[train['challenge_sequence'] <= 10]
    .sort_values(['user_id', 'challenge_sequence'])
    .groupby('user_id')['challenge']
    .agg(' '.join)
    .reset_index()
)
data.head()

Unnamed: 0,user_id,challenge
0,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...
1,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...
2,4581,CI26155 CI26156 CI26157 CI26158 CI26159 CI2616...
3,4582,CI23855 CI24915 CI24917 CI23933 CI23663 CI2495...
4,4585,CI23855 CI23975 CI24917 CI25135 CI23848 CI2371...


In [5]:
# Pair each user's history string with every one of that user's target rows.
# No `on=` is given, so pandas joins on the column(s) the two frames share
# (on a fresh run that is `user_id`); the result has one row per target.
data = pd.merge(data, target, how='inner')
data.head()

Unnamed: 0,user_id,challenge,target
0,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI24958
1,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI23667
2,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI23691
3,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...,CI24915
4,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...,CI25727


In [6]:
# Encode challenges: fit the label encoder on challenges['challenge_ID'] and
# use it to turn each target challenge ID into an integer class label.
encoder = LabelEncoder().fit(challenges['challenge_ID'])
# NOTE: the column name keeps its existing spelling ('challange_encoded')
# because a later cell reads the column under exactly this name.
data['challange_encoded'] = encoder.transform(data['target'])

In [7]:
# Build the token vocabulary from the space-separated history strings.
# The Tokenizer is created with its default settings.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['challenge'].tolist())

In [8]:
# Number of distinct tokens the tokenizer saw while fitting.
NB_WORDS = len(tokenizer.word_index)
# Fixed input length; matches the challenge_sequence <= 10 cut-off used to
# build the history strings.
MAX_SEQUENCE_LENGTH = 10
# Size of the output layer. Taken from the fitted label encoder rather than
# from the row count of `challenges`, so it always equals the number of
# distinct labels even if challenge_ID contains duplicate rows.
N_CATEGORIES = len(encoder.classes_)

# Create sequences
sequences_train = tokenizer.texts_to_sequences(data['challenge'])

In [9]:
# Bring every training sequence to exactly MAX_SEQUENCE_LENGTH entries:
# shorter ones are padded at the end, longer ones are cut at the end.
x_train = pad_sequences(
    sequences_train,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)

In [10]:
# Integer class labels as a plain NumPy array, aligned row-for-row with x_train.
y_train = np.asarray(data['challange_encoded'])

In [11]:
def get_model(path='', lr=0.001):
    """Build and compile the bidirectional-LSTM challenge classifier.

    Architecture: integer token ids of length ``MAX_SEQUENCE_LENGTH`` ->
    256-dim embedding -> batch normalisation -> bidirectional LSTM (256 units
    per direction) -> dropout -> softmax over ``N_CATEGORIES`` classes.

    The layer sizes are read from the module-level constants
    ``MAX_SEQUENCE_LENGTH``, ``NB_WORDS`` and ``N_CATEGORIES``, which must be
    defined before this function is called.

    Parameters
    ----------
    path : str, optional
        Weights file to load into the freshly built network. An empty value
        (the default) skips loading.
    lr : float, optional
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    keras.models.Model
        The model, compiled with sparse categorical cross-entropy loss and an
        accuracy metric.
    """
    inp = Input(shape=(MAX_SEQUENCE_LENGTH,))
    # NB_WORDS + 1 rows: one per token id plus one extra for index 0.
    # NOTE(review): inputs are zero-padded but the embedding is not created
    # with mask_zero=True, so padded positions are fed to the LSTM — confirm
    # this is intended.
    x = Embedding(NB_WORDS + 1, 256)(inp)
    x = BatchNormalization()(x)
    x = Bidirectional(LSTM(256, dropout=0.1, recurrent_dropout=0.1))(x)
    x = Dropout(0.4)(x)
    x = Dense(N_CATEGORIES, activation="softmax")(x)
    model = Model(inputs=inp, outputs=x)

    # Truthiness check so that both '' and None mean "no weights to load".
    if path:
        model.load_weights(path)

    # `learning_rate` is the current keyword; `lr` is a deprecated alias.
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate=lr),
        metrics=['accuracy'],
    )
    return model


# Initialize the model
model = get_model()

In [12]:
# Train on the full training arrays (no validation data is passed) for a
# fixed number of epochs. The History object returned by fit() is the cell's
# displayed output.
model.fit(x=x_train, y=y_train, batch_size=2048, epochs=12)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.callbacks.History at 0x7fcc56d56fd0>

In [13]:
def padding(text):
    """Convert texts to fixed-length integer sequences.

    Uses the module-level ``tokenizer`` to map each text to a sequence of
    token ids, then pads/truncates every sequence at the end to
    ``MAX_SEQUENCE_LENGTH`` entries.

    Parameters
    ----------
    text : iterable of str
        Space-separated challenge-ID strings.

    Returns
    -------
    The padded array produced by ``pad_sequences``, one row per input text.
    """
    sequences = tokenizer.texts_to_sequences(text)
    return pad_sequences(
        sequences,
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
        truncating='post',
    )


# One row per test user: the challenges with challenge_sequence <= 10 joined
# into a space-separated string. Rows are sorted by challenge_sequence first
# so the token order follows the sequence number, not the CSV row order.
test_text = (
    test[test['challenge_sequence'] <= 10]
    .sort_values(['user_id', 'challenge_sequence'])
    .groupby('user_id')['challenge']
    .agg(' '.join)
    .reset_index()
)
x_test = padding(test_text['challenge'])

In [14]:
# For every test row, keep the indices of the three highest-scoring classes,
# best first: argsort is ascending, so the slice walks the last three columns
# backwards.
pred = np.argsort(model.predict(x_test, batch_size=2048), axis=1)[:, :-4:-1]

In [15]:
# Build the submission file. Column `rank` of `pred` (0, 1, 2) becomes the
# prediction for sequence position 11, 12, 13 of every test user.
# Each frame is constructed directly instead of assigning new columns onto a
# column-slice of `test_text`, which is the pattern that makes pandas emit
# SettingWithCopyWarning.
user_ids = test_text['user_id'].astype(str)
pred_list = [
    pd.DataFrame({
        'user_sequence': user_ids + '_' + str(rank + 11),
        'challenge': encoder.inverse_transform(pred[:, rank]),
    })
    for rank in range(3)
]

# Stack the three rank-frames one after another and write them without the index.
pd.concat(pred_list).to_csv('sub16.csv', index=False)