In [6]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Load dataset from Google Drive
dataset_path = '/content/drive/MyDrive/Machine Learning/BBC-Sports-News-Classification/bbcsports.csv'
data = pd.read_csv(dataset_path)
data.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,text,label
0,0,Sharapova overcomes tough Molik\n\nWimbledon c...,tennis
1,1,GB players warned over security\n\nBritain's D...,tennis
2,2,Federer wins title in Rotterdam\n\nWorld numbe...,tennis
3,3,Mauresmo fights back to win title\n\nWorld num...,tennis
4,4,Agassi into second round in Dubai\n\nFourth se...,tennis


In [10]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

texts = data['text']
labels = data['label']

tokenizer = Tokenizer(num_words=15000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

X = pad_sequences(sequences, maxlen=200)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
y = tf.keras.utils.to_categorical(y)

# Split into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)


In [24]:
# Count the number of instances for each class label
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
football,265
rugby,147
cricket,124
athletics,101
tennis,100


In [11]:
len(tokenizer.word_index)

14225

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten

# Build the model
model = Sequential()
model.add(Embedding(input_dim=15000, output_dim=64, input_length=X_train.shape[1]))  # Adjust input_length and input_dim as per your tokenizer
model.add(Flatten())
model.add(Dense(128, activation='relu'))  # First hidden layer
model.add(Dense(64, activation='relu'))   # Second hidden layer
model.add(Dense(32, activation='relu'))   # Third hidden layer
model.add(Dense(5, activation='softmax'))  # Output layer with 5 neurons (one for each class)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=8, batch_size=32, validation_data=(X_test, y_test))  # Use the test set for validation


Epoch 1/8




[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 65ms/step - accuracy: 0.2924 - loss: 1.5873 - val_accuracy: 0.3581 - val_loss: 1.4885
Epoch 2/8
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.4627 - loss: 1.1441 - val_accuracy: 0.3986 - val_loss: 1.3545
Epoch 3/8
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.9729 - loss: 0.2716 - val_accuracy: 0.4730 - val_loss: 1.4077
Epoch 4/8
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 1.0000 - loss: 0.0074 - val_accuracy: 0.5000 - val_loss: 1.7126
Epoch 5/8
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 1.0000 - loss: 7.7132e-04 - val_accuracy: 0.5405 - val_loss: 1.6794
Epoch 6/8
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - accuracy: 1.0000 - loss: 4.6775e-04 - val_accuracy: 0.5338 - val_loss: 1.6235
Epoch 7/8
[1m19/19[0m [32m━━━━━━━━━━━━━

In [19]:
# Evaluate on training data
train_loss, train_accuracy = model.evaluate(X_train, y_train)
print(f'Training Accuracy: {train_accuracy}')

# Evaluate on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy}')


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 1.7568e-04
Training Accuracy: 1.0
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5346 - loss: 1.5755 
Test Accuracy: 0.5270270109176636


In [22]:
from sklearn.metrics import classification_report
import numpy as np

# Get model predictions
y_pred = model.predict(X_test)

# Convert predictions from one-hot encoded format to class labels
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Display classification report
report = classification_report(y_test_classes, y_pred_classes, target_names=data['label'].unique())

print(report)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
              precision    recall  f1-score   support

      tennis       0.78      0.35      0.48        20
       rugby       0.85      0.44      0.58        25
    football       0.46      0.94      0.62        53
     cricket       0.50      0.27      0.35        30
   athletics       1.00      0.10      0.18        20

    accuracy                           0.53       148
   macro avg       0.72      0.42      0.44       148
weighted avg       0.65      0.53      0.48       148

