In [25]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.models import save_model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Dropout
from keras import regularizers
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [26]:
import pandas as pd

df = pd.read_csv('Downloads/train.csv')

In [27]:
X = df['text']
y = df['stars']

In [28]:
X = list(X)
y = list(y)

In [29]:
Y = []

for q in y:
    if(q==1 or q==2):
        Y.append(0)
    elif(q==4 or q==5):
        Y.append(2)
    else:
        Y.append(1)

'Y = []\n\nfor q in y:\n    if(q==1 or q==2):\n        Y.append(0)\n    elif(q==4 or q==5):\n        Y.append(2)\n    else:\n        Y.append(1)'

In [30]:
from keras.utils import to_categorical

Y = to_categorical(y, num_classes=3)
y = array(y)

In [31]:
from nltk.corpus import stopwords
import string
import re


def clean_doc(X):
# split into tokens by white space
    i = 0
    for x in X:
        tokens = x.split()
# prepare regex for char filtering
        re_punc = re.compile( ' [%s] ' % re.escape(string.punctuation))
# remove punctuation from each word
        tokens = [re_punc.sub( '' , w) for w in tokens]
# remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
        tokens = [word for word in tokens if len(word) > 1]
        tok = (' ').join(tokens)
        X[i] = tok
        i += 1
        
    return X

In [32]:
def update_vocab(X , vocab):
    for x in X:
        tokens = x.split()
        vocab.update(tokens)

In [33]:
from collections import Counter 
vocab = Counter()

X = clean_doc(X)

In [34]:
update_vocab(X, vocab)

In [35]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [36]:
def encode_docs(tokenizer, max_length, docs):
    # integer encode
    encoded = tokenizer.texts_to_sequences(docs)
    # pad sequences
    padded = pad_sequences(encoded, maxlen=max_length, padding= 'post' )
    return padded

In [37]:
def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 500, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(20, activation= 'relu',kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation= 'relu',kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dropout(0.25))
    model.add(Dense(3, activation='sigmoid'))
    # compile network
    model.compile(loss= 'binary_crossentropy' , optimizer= 'adam' , metrics=['accuracy'])
    # summarize defined model
    model.summary()
    return model

In [38]:
tokenizer = create_tokenizer(X)
vocab_size = len(tokenizer.word_index) + 1

In [39]:
max_length = max([len(s.split()) for s in X])

In [40]:
X = encode_docs(tokenizer, max_length, X)

In [41]:
model = define_model(vocab_size, max_length)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 414, 500)          10837000  
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 407, 64)           256064    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 203, 64)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 196, 32)           16416     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 98, 32)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 3136)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 20)                62740     
__________

In [43]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True)
skf.get_n_splits(X, y)

i=1
for train, test in skf.split(X, y):
    print("Running Fold", i, "/", 5)
    model1 = model
    model1.fit(X[train], Y[train], epochs=5, verbose=2)
    history = model1.evaluate(X[test], Y[test])
    print(history)
    i += 1
'''model.fit(X,Y, epochs=10, verbose=1, validation_split=0.2)'''

Running Fold 1 / 5
Epoch 1/5
 - 117s - loss: 1.3674 - acc: 0.7832
Epoch 2/5
 - 117s - loss: 0.6209 - acc: 0.7963
Epoch 3/5
 - 118s - loss: 0.4913 - acc: 0.8025
Epoch 4/5
 - 118s - loss: 0.3973 - acc: 0.8500
Epoch 5/5
 - 124s - loss: 0.2899 - acc: 0.8949
[0.7983173184740966, 0.7296351821704962]
Running Fold 2 / 5
Epoch 1/5
 - 127s - loss: 0.3562 - acc: 0.8746
Epoch 2/5
 - 124s - loss: 0.2617 - acc: 0.9026
Epoch 3/5
 - 126s - loss: 0.1956 - acc: 0.9205
Epoch 4/5
 - 119s - loss: 0.1575 - acc: 0.9298
Epoch 5/5
 - 117s - loss: 0.1273 - acc: 0.9342
[0.26821817135227016, 0.939530234971921]
Running Fold 3 / 5
Epoch 1/5
 - 119s - loss: 0.1954 - acc: 0.9284
Epoch 2/5
 - 119s - loss: 0.1401 - acc: 0.9694
Epoch 3/5
 - 121s - loss: 0.1060 - acc: 0.9807
Epoch 4/5
 - 117s - loss: 0.1118 - acc: 0.9816
Epoch 5/5
 - 118s - loss: 0.1125 - acc: 0.9827
[0.11808378368616104, 0.98175]
Running Fold 4 / 5
Epoch 1/5


KeyboardInterrupt: 

In [44]:
model1.save('Funny_Model.h5')

In [45]:
import pickle

with open('TOKENIZER.pickle', 'wb') as f:
    pickle.dump(tokenizer, f)