In [1]:
# The first module: data loading module
import pandas as pd

# Step 1: Load the dataset
file_path = "bbc-text.csv"
# NOTE(review): latin-1 accepts any byte sequence, so a UTF-8 encoded file would
# load without error but with mis-decoded characters -- confirm the file's encoding.
bbc_data = pd.read_csv(file_path, encoding='latin-1')

# Step 2: Basic overview of the dataset
print("Dataset Information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
bbc_data.info()

print("\nFirst 5 Rows of the Dataset:")
print(bbc_data.head())  # Look at the first 5 rows

# Step 3: Check for missing values
print("\nMissing Values in Each Column:")
print(bbc_data.isnull().sum())  # Per-column count of null cells

# Step 4: Analyze the category distribution
print("\nCategory Distribution:")
print(bbc_data['category'].value_counts())  # Number of articles in each category


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB
None

First 5 Rows of the Dataset:
        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...

Missing Values in Each Column:
category    0
text        0
dtype: int64

Category Distribution:
category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64


In [2]:
# Second module: data preprocessing module
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Step 1: Define the preprocessing function
# Cache for the NLTK resources; filled lazily on the first preprocess_text() call
# so they are built once instead of once per document.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _NLP_RESOURCES:
        # Both entries are added in a single update so the cache is never left
        # half-populated if loading the stop-word list raises.
        _NLP_RESOURCES.update(
            stop_words=frozenset(stopwords.words('english')),
            lemmatizer=WordNetLemmatizer(),
        )
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one raw document into a space-separated string of lemmas.

    Args:
        text: Raw document text (a ``str``).

    Returns:
        The lower-cased text with every character other than ``a-z`` and
        whitespace removed, English stop words dropped, and the remaining
        tokens lemmatized and joined with single spaces.
    """
    stop_words, lemmatizer = _get_nlp_resources()

    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove special characters and numbers
    # NOTE(review): characters are deleted rather than replaced by a space, so
    # hyphenated words are fused (e.g. "set-top" -> "settop"); confirm intended.
    text = re.sub(r'[^a-z\s]', '', text)

    # 3. Tokenize (split text into words)
    words = word_tokenize(text)

    # 4. Remove stop words, 5. lemmatize the survivors, 6. re-join into one string
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

# Step 2: Run every article through the cleaning pipeline and keep the result
# alongside the raw text
bbc_data['clean_text'] = bbc_data['text'].apply(preprocess_text)

# Step 3: Sanity check -- show the first article before and after cleaning
first_raw = bbc_data.loc[0, 'text']
first_clean = bbc_data.loc[0, 'clean_text']

print("Original Text Example:")
print(first_raw)
print("\nCleaned Text Example:")
print(first_clean)


Original Text Example:
tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are also 

In [3]:
# The third module: feature extraction module
# I will implement three feature extraction methods:

#1. Relative word frequency (TF-IDF).
#2. Text length: the total number of words in a news article.
#3. Average length of words.

from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: TF-IDF Feature Extraction
# Cap the vocabulary at 5000 terms to bound the feature dimensionality.
# NOTE(review): the vectorizer is fitted on the whole corpus, before the
# train/dev/test split performed later in the notebook, so the IDF weights see
# held-out documents -- consider fitting on the training split only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(bbc_data['clean_text'])

# Convert the sparse matrix to a DataFrame. bbc_data's index is reused so the
# column-wise concat below (which aligns on index labels) pairs each TF-IDF row
# with its own article even if bbc_data does not carry a default RangeIndex.
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=bbc_data.index,
)

# Tokenise each cleaned document once and reuse it for both hand-crafted features
token_lists = bbc_data['clean_text'].apply(str.split)

# Step 2: Text Length Feature
# Number of words in each document
bbc_data['text_length'] = token_lists.apply(len)

# Step 3: Average Word Length Feature
# Mean token length per document; 0 for a document with no tokens left
bbc_data['avg_word_length'] = token_lists.apply(
    lambda tokens: sum(len(word) for word in tokens) / len(tokens) if tokens else 0
)

# Combine all features into a single DataFrame
features_df = pd.concat([tfidf_df, bbc_data[['text_length', 'avg_word_length']]], axis=1)

# Step 4: Inspect the final feature set
print("Features Shape:", features_df.shape)
print("Sample Features:")
print(features_df.head())


Features Shape: (2225, 5002)
Sample Features:
   aaa  abandoned  abc   ability  able  abn  abortion  abroad  absence  \
0  0.0        0.0  0.0  0.000000   0.0  0.0       0.0     0.0      0.0   
1  0.0        0.0  0.0  0.043504   0.0  0.0       0.0     0.0      0.0   
2  0.0        0.0  0.0  0.000000   0.0  0.0       0.0     0.0      0.0   
3  0.0        0.0  0.0  0.000000   0.0  0.0       0.0     0.0      0.0   
4  0.0        0.0  0.0  0.000000   0.0  0.0       0.0     0.0      0.0   

   absolute  ...  yuganskneftegas  yukos  yushchenko  zealand  zero  zombie  \
0       0.0  ...              0.0    0.0         0.0      0.0   0.0     0.0   
1       0.0  ...              0.0    0.0         0.0      0.0   0.0     0.0   
2       0.0  ...              0.0    0.0         0.0      0.0   0.0     0.0   
3       0.0  ...              0.0    0.0         0.0      0.0   0.0     0.0   
4       0.0  ...              0.0    0.0         0.0      0.0   0.0     0.0   

   zone  zurich  text_length  avg_

In [4]:
# The fourth module: feature selection module

# I used principal component analysis (PCA) to reduce the dimensionality of the features.

#PCA can effectively reduce the feature dimension while preserving the main information of the data as much as possible (i.e. maximizing the variance).

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Step 1: Standardize every feature column before PCA
# NOTE(review): the scaler and PCA are fitted on all rows, before the
# train/dev/test split done later in the notebook -- confirm this is acceptable
# or fit them on the training split only.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_df)

# Step 2: Apply PCA
# A fractional n_components asks for as many components as are needed to
# explain that share of the variance (95% here).
pca = PCA(n_components=0.95)
reduced_features = pca.fit_transform(scaled_features)

# Step 3: Keep the reduced feature matrix as a DataFrame for the modelling cell
reduced_features_df = pd.DataFrame(data=reduced_features)

# Step 4: Report dimensionality before and after the projection
n_features_before = features_df.shape[1]
n_features_after = reduced_features.shape[1]
print("Original Feature Dimensions:", n_features_before)
print("Reduced Feature Dimensions:", n_features_after)

# Step 5: Share of variance captured by each retained component
print("Explained Variance Ratio by PCA Components:")
print(pca.explained_variance_ratio_)



Original Feature Dimensions: 5002
Reduced Feature Dimensions: 1578
Explained Variance Ratio by PCA Components:
[0.00610784 0.0049164  0.0046719  ... 0.000167   0.00016688 0.00016643]


In [9]:
# Fifth module: Model training and evaluation module
# I will do this in two parts:
#1. Model training: Support vector machine (SVM) is used for training.
#2. Model evaluation: The data is divided into an 80% training set, a 10% development (validation) set and a 10% test set to evaluate the model performance.

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

# Step 1: Split the dataset
RANDOM_STATE = 42  # single seed shared by both splits and the classifier

X = reduced_features_df   # PCA-reduced feature matrix
y = bbc_data['category']  # original category labels

# First split: 80% training, 20% held out (stratified by category)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Second split: halve the held-out part into development and test sets
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=RANDOM_STATE
)

for caption, subset in (
    ("Training set size:", X_train),
    ("Development set size:", X_dev),
    ("Test set size:", X_test),
):
    print(caption, len(subset))

# Step 2: Train the SVM model (linear kernel) on the training split only
svm_model = SVC(kernel='linear', random_state=RANDOM_STATE)
svm_model.fit(X_train, y_train)




Training set size: 1780
Development set size: 222
Test set size: 223


In [11]:
# Sixth module: Evaluation module
def report_split(heading, y_true, y_pred):
    """Print the heading, accuracy and per-class report for one data split."""
    print(heading)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))


# Step 1: Evaluate on the development set
y_dev_pred = svm_model.predict(X_dev)
report_split("\nDevelopment Set Evaluation:", y_dev, y_dev_pred)

# Step 2: Final evaluation on the test set
y_test_pred = svm_model.predict(X_test)
report_split("\nTest Set Evaluation:", y_test, y_test_pred)



Development Set Evaluation:
Accuracy: 0.972972972972973

Classification Report:
               precision    recall  f1-score   support

     business       1.00      0.98      0.99        51
entertainment       0.95      0.97      0.96        38
     politics       0.95      0.93      0.94        42
        sport       1.00      1.00      1.00        51
         tech       0.95      0.97      0.96        40

     accuracy                           0.97       222
    macro avg       0.97      0.97      0.97       222
 weighted avg       0.97      0.97      0.97       222


Test Set Evaluation:
Accuracy: 0.9730941704035875

Classification Report:
               precision    recall  f1-score   support

     business       0.96      0.96      0.96        51
entertainment       0.93      1.00      0.96        39
     politics       0.98      0.98      0.98        42
        sport       1.00      1.00      1.00        51
         tech       1.00      0.93      0.96        40

     accuracy 