In [219]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tqdm import tqdm
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")


In [237]:
LONGEST_ARTICLE = 190
NUM_CLASSES = 4
lr = 1e-4

In [238]:
#preprocess the data 
def pre_data(filename):
    train_sentences = []
    train_y = []
    with open(filename, "r") as data: 
        line = data.readline()
        while line:
            y,title,body = line.split("\",\"")
            y = int(y[1:])-1
            text ="{} {}".format(title,body).split(" ")
            text = [stemmer.stem(t.strip()) for t in text] 
            train_sentences.append(text)
            train_y.append(y)
            line = data.readline()
    return train_sentences, train_y
                
#know longest article is 197 words and that there are 141,206 words

In [257]:
def pre_data_test(filename):
    test_sentences = []
    test_y = []
    with open(filename, "r") as data: 
        line = data.readline()
        while line:
            y,title,body = line.split("\",\"")
            y = int(y[1:])-1
            text ="{} {}".format(title,body).split(" ")
            text = [stemmer.stem(t.strip()) for t in text] 
            test_sentences.append(text)
            test_y.append(y)
            line = data.readline()
    return test_sentences, test_y

In [240]:
train_sentences, y = pre_data("/home/jonny/Documents/CurroML/HW5/ag-news-csv/ag_news_csv/train.csv")

In [241]:
train_sentencesNP = np.array(train_sentences)
yC = keras.utils.to_categorical(y)
randos = np.random.choice(len(train_sentencesNP),len(train_sentencesNP),replace=False)
validation_randos = randos[:20000]
train_randos = randos[20000:]
train_x = train_sentencesNP[train_randos]
train_y = yC[train_randos]
val_x = train_sentencesNP[validation_randos]
val_y = yC[validation_randos]

In [242]:
train_x = np.append(train_x, "ENDOFSENTENCETOKEN")

In [243]:
t = keras.preprocessing.text.Tokenizer()
t.fit_on_texts(train_x)

In [244]:
train_sequences = t.texts_to_sequences(train_x)
validation_sequences = t.texts_to_sequences(val_x)

In [245]:
EOS_TOKEN = train_sequences.pop()[0]

In [247]:
def add_padding(sequences, EOS_TOKEN, MAX_LENGTH):
    padded_sequences = np.zeros((len(sequences),MAX_LENGTH))
    for seq in tqdm(range(len(sequences))):
        len_seq = len(sequences[seq])
        if len_seq>MAX_LENGTH:
            padded_sequences[seq] =  sequences[seq][:MAX_LENGTH]
        else:
            padded_sequences[seq] =  sequences[seq] + (MAX_LENGTH - len_seq )*[EOS_TOKEN]
    return padded_sequences

In [248]:
padded_sequences = add_padding(train_sequences,EOS_TOKEN,LONGEST_ARTICLE)

100%|██████████| 100000/100000 [00:01<00:00, 52629.87it/s]


In [249]:
padded_val_sequences = add_padding(validation_sequences, EOS_TOKEN,LONGEST_ARTICLE)

100%|██████████| 20000/20000 [00:00<00:00, 60200.11it/s]


In [None]:
##THT IS A LIST OF OTHER MODELS THAT DID JUST A TAD BETTER THAN th

In [250]:
#this is an attempt to make a really small model, after 32 epochs get 90.8% on val.
model = keras.Sequential()
model.add(keras.layers.Embedding(len(t.word_index)+1,4, input_length=LONGEST_ARTICLE))
model.add(keras.layers.Conv1D(filters=32,kernel_size=2, dilation_rate=2, padding='valid',activation='elu'))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(NUM_CLASSES,activation="softmax"))
model.compile(optimizer=keras.optimizers.Adam(lr),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model.summary()
model.fit(padded_sequences,train_y, epochs = 16, batch_size = 512, validation_data=(padded_val_sequences,val_y),verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_32 (Embedding)     (None, 190, 4)            510900    
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 188, 32)           288       
_________________________________________________________________
flatten_28 (Flatten)         (None, 6016)              0         
_________________________________________________________________
dense_28 (Dense)             (None, 4)                 24068     
Total params: 535,256
Trainable params: 535,256
Non-trainable params: 0
_________________________________________________________________
Train on 100000 samples, validate on 20000 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16

In [259]:
test_x,test_y = pre_data_test("/home/jonny/Documents/CurroML/HW5/ag-news-csv/ag_news_csv/test.csv")
test_sequences = t.texts_to_sequences(test_x)
test_yC = keras.utils.to_categorical(test_y)
padded_test = add_padding(test_sequences,EOS_TOKEN,LONGEST_ARTICLE)

100%|██████████| 7600/7600 [00:00<00:00, 55178.66it/s]


In [260]:
#This final accuracy is on the SMALL MODEL 
model.evaluate(padded_test, test_yC)



[0.5904176346879256, 0.7734210526315789]