In [0]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tqdm import tqdm
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")


In [0]:
# Number of token positions every article is truncated or padded to
# (also the input length of the embedding layer).
LONGEST_ARTICLE = 190
# Number of target classes; sets the width of the softmax output layer.
NUM_CLASSES = 4
# Learning rate handed to the Adam optimizer.
lr = 1e-4

In [0]:
#preprocess the data 
def pre_data(filename, stem=None):
    """Read a quoted ``"label","title","body"`` file into stemmed token lists.

    Each non-blank line is split on the ``","`` separator into a label, a
    title and a body. Title and body are joined with a space, split on single
    spaces, and every piece is whitespace-stripped and stemmed.

    Args:
        filename: Path of the file to read.
        stem: Optional callable applied to each stripped token. Defaults to
            the notebook-level ``stemmer.stem``.

    Returns:
        A ``(sentences, labels)`` tuple: ``sentences`` is a list of token
        lists, ``labels`` a list of zero-based integer class ids (the label
        in the file minus one).

    Raises:
        ValueError: If a non-blank line has fewer than three fields or its
            label is not an integer.
    """
    if stem is None:
        stem = stemmer.stem
    train_sentences = []
    train_y = []
    with open(filename, "r") as data:
        for line in data:
            # A blank (e.g. trailing) line carries no record; skip it instead
            # of failing on the three-way unpack below.
            if not line.strip():
                continue
            # maxsplit=2 keeps any further '","' inside the text as part of
            # the body rather than raising on "too many values to unpack".
            y, title, body = line.split('","', 2)
            # The label field still carries the record's opening quote.
            y = int(y[1:]) - 1
            words = "{} {}".format(title, body).split(" ")
            # NOTE(review): only whitespace is stripped, so when a line ends
            # with a closing quote the last token keeps that trailing '"'.
            # Left as-is so train and test preprocessing stay identical.
            train_sentences.append([stem(word.strip()) for word in words])
            train_y.append(y)
    return train_sentences, train_y
                
# Note: the longest article is 197 words, and there are 141,206 words.

In [0]:
# Read the training file into stemmed token lists and zero-based integer labels.
train_sentences, y = pre_data("train.csv")

In [0]:
# The token lists have different lengths, so they must be stored as an object
# array; without dtype=object, NumPy >= 1.24 raises ValueError on ragged input.
train_sentencesNP = np.array(train_sentences, dtype=object)
# Fix the one-hot width explicitly so it always matches the model's output layer.
yC = keras.utils.to_categorical(y, num_classes=NUM_CLASSES)

# Seeded shuffle so the train/validation split survives Restart & Run All.
VALIDATION_SIZE = 20000
SPLIT_SEED = 0
randos = np.random.RandomState(SPLIT_SEED).permutation(len(train_sentencesNP))
validation_randos = randos[:VALIDATION_SIZE]
train_randos = randos[VALIDATION_SIZE:]

train_x = train_sentencesNP[train_randos]
train_y = yC[train_randos]
val_x = train_sentencesNP[validation_randos]
val_y = yC[validation_randos]

In [0]:
# Append a marker string as one extra "document" so the tokenizer assigns it an
# id; that id is later used as the padding value. The guard keeps the cell
# idempotent: re-running it does not append the marker a second time.
if not (len(train_x) > 0 and isinstance(train_x[-1], str) and train_x[-1] == "ENDOFSENTENCETOKEN"):
    train_x = np.append(train_x, "ENDOFSENTENCETOKEN")

In [0]:
# Build the token -> integer-id vocabulary from train_x only (which at this
# point also holds the appended end-of-sentence marker string).
t = keras.preprocessing.text.Tokenizer()
t.fit_on_texts(train_x)

In [0]:
# Convert token lists to lists of integer ids using the fitted tokenizer.
# NOTE(review): tokens in val_x that never occurred in train_x have no id;
# with this default Tokenizer they are presumably dropped — confirm.
train_sequences = t.texts_to_sequences(train_x)
validation_sequences = t.texts_to_sequences(val_x)

In [0]:
# Look the marker's id up through the tokenizer instead of relying on pop():
# a bare pop() removes a real training sequence (and yields a wrong id) if this
# cell is ever executed twice.
EOS_TOKEN = t.texts_to_sequences(["ENDOFSENTENCETOKEN"])[0][0]
# Drop the marker's own one-element sequence so train_sequences lines up with
# train_y again; checking first keeps the cell safe to re-run.
if train_sequences and train_sequences[-1] == [EOS_TOKEN]:
    train_sequences.pop()

In [0]:
def add_padding(sequences, EOS_TOKEN, MAX_LENGTH):
    """Force every sequence to exactly ``MAX_LENGTH`` entries.

    Sequences longer than ``MAX_LENGTH`` are cut off at the end; shorter ones
    are extended at the end with ``EOS_TOKEN``. A tqdm progress bar is shown
    while the rows are filled.

    Args:
        sequences: Indexable collection of lists of token ids.
        EOS_TOKEN: Value used to fill the missing trailing positions.
        MAX_LENGTH: Number of columns in the result.

    Returns:
        A NumPy array of shape ``(len(sequences), MAX_LENGTH)`` created with
        ``np.zeros`` (default float dtype).
    """
    total = len(sequences)
    out = np.zeros((total, MAX_LENGTH))
    for row in tqdm(range(total)):
        tokens = sequences[row]
        shortfall = MAX_LENGTH - len(tokens)
        if shortfall < 0:
            # Too long: keep only the leading MAX_LENGTH ids.
            out[row] = tokens[:MAX_LENGTH]
        else:
            # Short (or exact): fill the tail with the padding id.
            out[row] = tokens + shortfall * [EOS_TOKEN]
    return out

In [47]:
# Truncate/pad every training sequence to LONGEST_ARTICLE ids.
padded_sequences = add_padding(train_sequences,EOS_TOKEN,LONGEST_ARTICLE)

100%|██████████| 100000/100000 [00:01<00:00, 53531.04it/s]


In [48]:
# Same fixed-length treatment for the validation sequences.
padded_val_sequences = add_padding(validation_sequences, EOS_TOKEN,LONGEST_ARTICLE)

100%|██████████| 20000/20000 [00:00<00:00, 51193.41it/s]


In [0]:
## THIS IS A LIST OF OTHER MODELS THAT DID JUST A TAD BETTER THAN THE SMALL MODEL, BUT WITH MANY MORE PARAMETERS
# Without any convolutions we get some good results: after 6 epochs, 91.6% on the val set.
# model = keras.Sequential()
# model.add(keras.layers.Embedding(len(t.word_index)+1,512, input_length=LONGEST_ARTICLE))
# model.add(keras.layers.Flatten())
# model.add(keras.layers.Dense(NUM_CLASSES,activation="softmax"))
# model.compile(optimizer=keras.optimizers.Adam(lr),
#               loss='categorical_crossentropy',
#               metrics=['accuracy'])

In [0]:
# This is also pretty good: after 6 epochs it gets 91%.
# model = keras.Sequential()
# model.add(keras.layers.Embedding(len(t.word_index)+1,256, input_length=LONGEST_ARTICLE))
# model.add(keras.layers.Conv1D(filters=32,kernel_size=4, dilation_rate=8, padding='valid',activation='elu'))
# model.add(keras.layers.Flatten())
# model.add(keras.layers.Dense(NUM_CLASSES,activation="softmax"))
# model.compile(optimizer=keras.optimizers.Adam(lr),
#               loss='categorical_crossentropy',
#               metrics=['accuracy'])

In [0]:
# Deliberately tiny model: 4-dimensional embeddings, one dilated 1-D convolution,
# then a softmax classifier. Reported result: 90.8% on val after 32 epochs.
model = keras.Sequential([
    keras.layers.Embedding(len(t.word_index)+1, 4, input_length=LONGEST_ARTICLE),
    keras.layers.Conv1D(
        filters=32,
        kernel_size=2,
        dilation_rate=2,
        padding='valid',
        activation='relu',
    ),
    keras.layers.Flatten(),
    keras.layers.Dense(NUM_CLASSES, activation="softmax"),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(lr),
    metrics=['accuracy'],
)

In [50]:
# Print the layer/parameter table, then train while tracking the validation split.
model.summary()
model.fit(
    padded_sequences,
    train_y,
    batch_size=512,
    epochs=32,
    verbose=1,
    validation_data=(padded_val_sequences, val_y),
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 190, 4)            510492    
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 188, 32)           288       
_________________________________________________________________
flatten_9 (Flatten)          (None, 6016)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 4)                 24068     
Total params: 534,848
Trainable params: 534,848
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 100000 samples, validate on 20000 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<tensorflow.python.keras.callbacks.History at 0x7f1543d126a0>

In [0]:
def pre_data_test(filename):
    """Load a test file with exactly the same preprocessing as ``pre_data``.

    This used to be a line-for-line copy of ``pre_data``. Delegating to it
    guarantees that train and test tokenization cannot drift apart.

    Args:
        filename: Path of the quoted ``"label","title","body"`` file to read.

    Returns:
        A ``(test_sentences, test_y)`` tuple: a list of stemmed token lists
        and a list of zero-based integer class ids.
    """
    return pre_data(filename)

In [54]:
# Run the test file through the same pipeline as the training data:
# stem -> token ids (training vocabulary) -> fixed-length padding.
test_x, test_y = pre_data_test("test.csv")
test_sequences = t.texts_to_sequences(test_x)
# Pin the one-hot width to NUM_CLASSES so the labels always match the model's
# output layer, even if the highest class id were missing from this file.
test_yC = keras.utils.to_categorical(test_y, num_classes=NUM_CLASSES)
padded_test = add_padding(test_sequences, EOS_TOKEN, LONGEST_ARTICLE)

100%|██████████| 7600/7600 [00:00<00:00, 52505.23it/s]


In [55]:
# Evaluate on the test set; the result is [loss, accuracy] as configured in compile().
model.evaluate(padded_test, test_yC)



[0.25492076643987704, 0.9153947368421053]

Test accuracy of the small model: 91.5%
