In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.model_selection import cross_val_score, train_test_split
import datetime
from sklearn.metrics import mean_squared_error

In [None]:
# Read the three CSV inputs with identical parser settings, in the order
# training data, submission template, test data.
train, final_out, test = (
    pd.read_csv(csv_path, low_memory=False, encoding='latin1')
    for csv_path in (
        'train_clean.csv',
        'Devex_submission_format.csv',
        'test_clean.csv',
    )
)

In [None]:
# Text-model hyper-parameters:
#   embed_size   - length of each learned word vector
#   max_features - vocabulary cap (number of rows in the embedding matrix)
#   maxlen       - number of tokens kept per comment after padding/truncation
embed_size, max_features, maxlen = 50, 30000, 200

In [None]:
print('Remove NaNs')

# Names of the target columns pulled from `train`; their order fixes the
# column order of the label matrix `y`.
list_classes = [
    '3.1.1', '3.1.2',
    '3.2.1', '3.2.2',
    '3.3.1', '3.3.2', '3.3.3', '3.3.4', '3.3.5',
    '3.4.1', '3.4.2',
    '3.5.1', '3.5.2',
    '3.6.1',
    '3.7.1', '3.7.2',
    '3.8.1', '3.8.2',
    '3.9.1', '3.9.2', '3.9.3',
    '3.a.1',
    '3.b.1', '3.b.2', '3.b.3',
    '3.c.1',
    '3.d.1',
]

# Missing texts are replaced by the placeholder "_na_" before tokenization.
list_sentences_train = train["Text"].fillna("_na_").values
list_sentences_test = test["Text"].fillna("_na_").values

# Label matrix: one row per training text, one column per entry of list_classes.
y = train[list_classes].values

In [None]:
print('Tokenizing')

# The vocabulary is fitted on the training texts only; the test texts are
# encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train, list_tokenized_test = [
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
]

In [None]:
print('Padding')
# Bring every token sequence to exactly `maxlen` entries so both sets can be
# fed to the network as fixed-width integer matrices.
X_tra, X_tes = (
    pad_sequences(list_tokenized_train, maxlen=maxlen),
    pad_sequences(list_tokenized_test, maxlen=maxlen),
)

In [None]:
print('Retrieving model')

# Size the output layer from the label matrix instead of hard-coding 27, so the
# network always has exactly one output unit per target column fed to fit().
n_outputs = int(y.shape[1])

# Architecture: token ids -> trainable embedding -> bidirectional LSTM that
# returns the full sequence -> max over the time axis -> small dense
# bottleneck with dropout -> one sigmoid unit per target column.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(LSTM(8, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(8, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(n_outputs, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)

# Sigmoid outputs with binary cross-entropy score each target column
# independently (NOTE(review): assumes the label columns are 0/1 - confirm).
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Display the dimensions of the padded training matrix produced by pad_sequences.
X_tra.shape

In [None]:
# Hold out 30% of the padded training rows as a validation set; the fixed
# random_state keeps the partition identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_tra,
    y,
    random_state=0,
    test_size=0.3,
)

In [None]:
# Show the shapes of the four arrays returned by the split, in the order
# X_train, X_val, y_train, y_val.
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

In [None]:

# Train on the training split. Passing the held-out split as validation_data
# makes Keras report validation loss/metrics after every epoch, so overfitting
# across the 10 epochs is visible; it does not influence the weight updates.
model.fit(
    X_train,
    y_train,
    batch_size=8,
    epochs=10,
    validation_data=(X_val, y_val),
)

# Predictions of the trained model on the held-out split.
y_val_pred = model.predict(X_val)

 

In [None]:
# Run the model on the padded test sequences. Despite its name, y_test holds
# model predictions, not ground-truth labels.
y_test = model.predict(X_tes)