In [108]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Connect Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [133]:
# Read in file
dataset = pd.read_csv('/content/drive/MyDrive/large_dataset.csv', header=None, names=['text', 'label'])

In [134]:
# Split data for training and testing randomly
train_dataset, test_dataset = train_test_split(dataset, test_size=0.15, random_state=42)

In [135]:
# Splitting training and testing data into data and classification
X_train = train_dataset['text'].values
y_train = train_dataset['label'].values
X_test = test_dataset['text'].values
y_test = test_dataset['label'].values

In [136]:
# Use Tokenizer to identify words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate((X_train, X_test)))

# Replace words with matched int
X_train_int = tokenizer.texts_to_sequences(X_train)
X_test_int = tokenizer.texts_to_sequences(X_test)

In [137]:
# Use label encoder to change text labels into ints [0, 1, 2, 3, 4, 5]
label_encoder = LabelEncoder()

y_train_int = label_encoder.fit_transform(y_train)
y_test_int = label_encoder.transform(y_test)

In [138]:
# Get longest article
max_len = max([len(x) for x in X_train_int + X_test_int])
print(max_len)

1777


In [139]:
# Get the maximum number of words in dataset
max_features = len(tokenizer.word_index) + 1
print(max_features)

8999


In [140]:
# Pad text to have same length
x_train_input = tf.keras.preprocessing.sequence.pad_sequences(X_train_int, maxlen=max_len)
x_test_input = tf.keras.preprocessing.sequence.pad_sequences(X_test_int, maxlen=max_len)

In [141]:
# Define Model
NEWS_Model = keras.Sequential([
    keras.layers.Embedding(input_dim=max_features, output_dim=32, input_length=max_len),
    keras.layers.LSTM(32, return_sequences=True, activation='tanh', recurrent_activation='sigmoid'),
    keras.layers.LSTM(32, return_sequences=False, activation='tanh', recurrent_activation='sigmoid'),
    keras.layers.Dense(6, activation='softmax')
])

In [142]:
# Compile Model
NEWS_Model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
NEWS_Model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 1777, 32)          287968    
                                                                 
 lstm_18 (LSTM)              (None, 1777, 32)          8320      
                                                                 
 lstm_19 (LSTM)              (None, 32)                8320      
                                                                 
 dense_9 (Dense)             (None, 6)                 198       
                                                                 
Total params: 304,806
Trainable params: 304,806
Non-trainable params: 0
_________________________________________________________________


In [155]:
# Train Model (2x 10 epochs 1x 5 epochs)
history = NEWS_Model.fit(x_train_input, y_train_int, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [156]:
test_loss, test_accuracy = NEWS_Model.evaluate(x_test_input, y_test_int)



In [157]:
NEWS_Model.save("/content/drive/MyDrive/Saved_News_AI")

