In [1]:
# Colab cell 1 — check GPU
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("GPU available:", tf.config.list_physical_devices('GPU'))


TensorFlow version: 2.19.0
GPU available: []


In [2]:
# Colab cell 2 — install extras (Colab already has TF/Pandas, but these ensure versions)
!pip install -q kaggle gensim

# then imports and nltk downloads
import re
import numpy as np
import pandas as pd
import nltk
from google.colab import files, drive
nltk.download('punkt')
nltk.download('stopwords')


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.2/38.2 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
tsfresh 0.21.1 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 wh

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Colab cell 3A
from google.colab import files
uploaded = files.upload()  # click choose files and upload bbc-text.csv
# after upload, read it (replace filename if different)
import io
df = pd.read_csv(io.BytesIO(list(uploaded.values())[0]))
df.head()


Saving bbc.csv to bbc.csv


Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [5]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Colab cell 4 — cleaning + small checks
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def clean_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)           # remove non-alpha
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 1]
    return ' '.join(tokens)

# apply
df['cleaned_text'] = df['text'].astype(str).apply(clean_text)
df[['category','cleaned_text']].head()


Unnamed: 0,category,cleaned_text
0,tech,tv future hands viewers home theatre systems p...
1,business,worldcom boss left books alone former worldcom...
2,sport,tigers wary farrell gamble leicester say rushe...
3,sport,yeading face newcastle fa cup premiership side...
4,entertainment,ocean twelve raids box office ocean twelve cri...


In [7]:
# Colab cell 5 — tokenize & pad
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_NUM_WORDS = 20000   # top words to keep
MAX_SEQUENCE_LENGTH = 500  # trunc/pad length (try 500; reduce if memory issues)

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token=None)
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])

X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
word_index = tokenizer.word_index
print("X shape:", X.shape, "Vocab size:", len(word_index))


X shape: (2225, 500) Vocab size: 27738


In [8]:
# Colab cell 6 — labels
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

le = LabelEncoder()
y = le.fit_transform(df['category'].astype(str))
y = to_categorical(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(1780, 500) (445, 500) (1780, 5) (445, 5)


In [9]:
# Colab cell 7 — download GloVe 100d and prepare embedding matrix
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glove

EMBEDDING_DIM = 100
emb_path = 'glove/glove.6B.100d.txt'

# load embeddings
emb_index = {}
with open(emb_path, 'r', encoding='utf-8') as f:
    for line in f:
        vals = line.split()
        word = vals[0]
        vec = np.asarray(vals[1:], dtype='float32')
        emb_index[word] = vec

# build embedding matrix (only for top MAX_NUM_WORDS)
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    vec = emb_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec

print("Embedding matrix shape:", embedding_matrix.shape)


Embedding matrix shape: (20000, 100)


In [10]:
# Colab cell 8 — model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

num_words = embedding_matrix.shape[0]
model = Sequential()
model.add(Embedding(input_dim=num_words,
                    output_dim=EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))          # set True to fine-tune
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()




In [11]:
# Colab cell 9 — train
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint('best_lstm.h5', save_best_only=True, monitor='val_loss')
]

history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    callbacks=callbacks
)


Epoch 1/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5320 - loss: 1.2314



[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 2s/step - accuracy: 0.5352 - loss: 1.2248 - val_accuracy: 0.8708 - val_loss: 0.3526
Epoch 2/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.8878 - loss: 0.3656



[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 2s/step - accuracy: 0.8881 - loss: 0.3646 - val_accuracy: 0.9438 - val_loss: 0.1944
Epoch 3/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.9091 - loss: 0.2849



[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 2s/step - accuracy: 0.9091 - loss: 0.2846 - val_accuracy: 0.9551 - val_loss: 0.1312
Epoch 4/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.9491 - loss: 0.1766



[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 2s/step - accuracy: 0.9490 - loss: 0.1769 - val_accuracy: 0.9719 - val_loss: 0.0808
Epoch 5/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 2s/step - accuracy: 0.9524 - loss: 0.1634 - val_accuracy: 0.9663 - val_loss: 0.0927
Epoch 6/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 2s/step - accuracy: 0.9514 - loss: 0.1445 - val_accuracy: 0.9719 - val_loss: 0.0942
Epoch 7/10
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 2s/step - accuracy: 0.9530 - loss: 0.1298 - val_accuracy: 0.9494 - val_loss: 0.1179


In [12]:
# Colab cell 10 — evaluate and save to Drive
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print("Test Loss:", test_loss, "Test Acc:", test_acc)

# save model to Drive (if mounted)
model.save('/content/best_lstm_model.h5')
# to copy to drive:
# from google.colab import drive
# drive.mount('/content/drive')
# model.save('/content/drive/MyDrive/best_lstm_model.h5')


14/14 - 7s - 504ms/step - accuracy: 0.9573 - loss: 0.1553




Test Loss: 0.15531978011131287 Test Acc: 0.9573033452033997
