 1/ Load the Dataset

In [10]:
!pip install datasets  # if not installed

from datasets import load_dataset

# Load the 'ag_news' dataset from Hugging Face
dataset = load_dataset("wangrongsheng/ag_news")

# The dataset typically has 'train' and 'test' splits
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Convert them to pandas DataFrames
import pandas as pd

df_train = pd.DataFrame(train_dataset)
df_test = pd.DataFrame(test_dataset)

print("Training set shape:", df_train.shape)
print("Testing set shape:", df_test.shape)

# Peek at the data
print(df_train.head())


Training set shape: (120000, 2)
Testing set shape: (7600, 2)
                                                text  label
0  Wall St. Bears Claw Back Into the Black (Reute...      2
1  Carlyle Looks Toward Commercial Aerospace (Reu...      2
2  Oil and Economy Cloud Stocks' Outlook (Reuters...      2
3  Iraq Halts Oil Exports from Main Southern Pipe...      2
4  Oil prices soar to all-time record, posing new...      2


In [11]:
# Rich-display the first five training rows (5 is `head`'s default, spelled out).
df_train.head(n=5)

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [12]:
# Summary statistics of the training frame; the captured output shows only the
# numeric `label` column being summarised.
train_summary = df_train.describe()
train_summary

Unnamed: 0,label
count,120000.0
mean,1.5
std,1.118039
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


2/ Clean the Dataset (check for missing values and class balance)


In [13]:
# Count missing values per column to confirm nothing needs imputing/dropping.
missing_per_column = df_train.isnull().sum()
print(missing_per_column)

text     0
label    0
dtype: int64


In [14]:
# Class balance check: number of training rows per label value.
label_counts = df_train['label'].value_counts()
print(label_counts)


label
2    30000
3    30000
1    30000
0    30000
Name: count, dtype: int64


3/ Preprocess Text (TF-IDF vectorization)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Targets: the integer class ids stored next to each article.
y_train, y_test = df_train['label'], df_test['label']

# TF-IDF features with English stop words removed and the vocabulary capped
# at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Learn the vocabulary / IDF weights from the training texts only, then reuse
# the fitted vectorizer on the test texts so no test information leaks in.
train_texts = df_train['text']
test_texts = df_test['text']
X_train_tfidf = vectorizer.fit_transform(train_texts)
X_test_tfidf = vectorizer.transform(test_texts)

print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Test TF-IDF shape:", X_test_tfidf.shape)


Train TF-IDF shape: (120000, 5000)
Test TF-IDF shape: (7600, 5000)


**4/ Train Multiple Classifiers**

**- Logistic Regression**

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic regression on the TF-IDF features. `random_state` fixes the seed and
# `max_iter=200` bounds the number of solver iterations.
logreg = LogisticRegression(random_state=42, max_iter=200)
logreg.fit(X_train_tfidf, y_train)

# Predict & evaluate on the held-out test split
y_pred_logreg = logreg.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
# sep="\n" starts the report on its own line. With print's default " "
# separator a stray space lands in front of the report's first (header) row,
# shifting it one column right of the data rows.
print("Classification Report:", classification_report(y_test, y_pred_logreg), sep="\n")


Logistic Regression Accuracy: 0.9039473684210526
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91      1900
           1       0.95      0.98      0.96      1900
           2       0.87      0.87      0.87      1900
           3       0.88      0.88      0.88      1900

    accuracy                           0.90      7600
   macro avg       0.90      0.90      0.90      7600
weighted avg       0.90      0.90      0.90      7600



**- Decision Tree**

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the TF-IDF features; `random_state` fixes the seed so the
# fitted tree is reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_tfidf, y_train)

# Predict & evaluate on the held-out test split
y_pred_dt = dt.predict(X_test_tfidf)
print("\nDecision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
# sep="\n" starts the report on its own line. With print's default " "
# separator a stray space lands in front of the report's first (header) row,
# shifting it one column right of the data rows.
print("Classification Report:", classification_report(y_test, y_pred_dt), sep="\n")



Decision Tree Accuracy: 0.810921052631579
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.83      0.83      1900
           1       0.87      0.88      0.87      1900
           2       0.77      0.76      0.76      1900
           3       0.78      0.78      0.78      1900

    accuracy                           0.81      7600
   macro avg       0.81      0.81      0.81      7600
weighted avg       0.81      0.81      0.81      7600



**- XGBoost (Gradient Boosting)**

In [18]:
!pip install xgboost  # if needed

from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_tfidf, y_train)

y_pred_xgb = xgb.predict(X_test_tfidf)
print("\nXGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))



XGBoost Accuracy: 0.8907894736842106
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.88      0.90      1900
           1       0.92      0.96      0.94      1900
           2       0.87      0.85      0.86      1900
           3       0.86      0.87      0.86      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600



**- K-Nearest Neighbors**

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours on the TF-IDF features, voting among the 5 closest
# training documents.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_tfidf, y_train)

# Predict & evaluate on the held-out test split
y_pred_knn = knn.predict(X_test_tfidf)
print("\nKNN Accuracy:", accuracy_score(y_test, y_pred_knn))
# sep="\n" starts the report on its own line. With print's default " "
# separator a stray space lands in front of the report's first (header) row,
# shifting it one column right of the data rows.
print("Classification Report:", classification_report(y_test, y_pred_knn), sep="\n")



KNN Accuracy: 0.8902631578947369
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.90      0.90      1900
           1       0.93      0.95      0.94      1900
           2       0.86      0.86      0.86      1900
           3       0.87      0.85      0.86      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600



**5/ Compare Results**

In [20]:
# Test-set predictions keyed by the display name used in the summary below.
predictions_by_model = {
    "LogReg": y_pred_logreg,
    "DecisionTree": y_pred_dt,
    "XGBoost": y_pred_xgb,
    "KNN": y_pred_knn,
}

# Accuracy of every model against the same test labels, in insertion order.
models_scores = {
    name: accuracy_score(y_test, preds)
    for name, preds in predictions_by_model.items()
}

print("Summary of Accuracy Scores:")
for model_name, score in models_scores.items():
    print(f"{model_name}: {score:.4f}")


Summary of Accuracy Scores:
LogReg: 0.9039
DecisionTree: 0.8109
XGBoost: 0.8908
KNN: 0.8903


**6/ Evaluate Models using Precision, Recall, F1**

In [26]:
from sklearn.metrics import precision_score, recall_score, f1_score


def macro_metrics(y_true, y_pred):
    """Compute macro-averaged precision, recall and F1 for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        A 3-tuple ``(precision, recall, f1)``, each computed with
        ``average='macro'``.
    """
    return (
        precision_score(y_true, y_pred, average='macro'),
        recall_score(y_true, y_pred, average='macro'),
        f1_score(y_true, y_pred, average='macro'),
    )


def print_macro_metrics(title, precision, recall, f1):
    """Print one model's macro-averaged metrics, rounded to four decimals.

    Args:
        title: Model name shown in the section heading.
        precision: Macro-averaged precision.
        recall: Macro-averaged recall.
        f1: Macro-averaged F1-score.
    """
    print(f"\n{title} Metrics (Macro-Averaged):")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")


# The per-model names below are kept as individual variables because the
# comparison-table cell that follows reads them directly.

# Decision Tree
precision_dt, recall_dt, f1_dt = macro_metrics(y_test, y_pred_dt)
print_macro_metrics("Decision Tree", precision_dt, recall_dt, f1_dt)

# Logistic Regression
precision_lr, recall_lr, f1_lr = macro_metrics(y_test, y_pred_logreg)
print_macro_metrics("Logistic Regression", precision_lr, recall_lr, f1_lr)

# XGBoost
precision_xgb, recall_xgb, f1_xgb = macro_metrics(y_test, y_pred_xgb)
print_macro_metrics("XGBoost", precision_xgb, recall_xgb, f1_xgb)

# K-Nearest Neighbors
precision_knn, recall_knn, f1_knn = macro_metrics(y_test, y_pred_knn)
print_macro_metrics("KNN", precision_knn, recall_knn, f1_knn)



Decision Tree Metrics (Macro-Averaged):
Precision: 0.8106
Recall:    0.8109
F1-score:  0.8108

Logistic Regression Metrics (Macro-Averaged):
Precision: 0.9036
Recall:    0.9039
F1-score:  0.9037

XGBoost Metrics (Macro-Averaged):
Precision: 0.8907
Recall:    0.8908
F1-score:  0.8906

KNN Metrics (Macro-Averaged):
Precision: 0.8899
Recall:    0.8903
F1-score:  0.8900


In [27]:
import pandas as pd

# Macro-averaged scores per model, each list ordered [precision, recall, f1].
summary = {
    "Decision Tree": [precision_dt, recall_dt, f1_dt],
    "Logistic Reg.": [precision_lr, recall_lr, f1_lr],
    "XGBoost":       [precision_xgb, recall_xgb, f1_xgb],
    "KNN":           [precision_knn, recall_knn, f1_knn]
}

# Built with the metrics as rows, then transposed so each model is one row.
results_df = pd.DataFrame(summary, index=["Precision", "Recall", "F1-score"]).T
# sep="\n" starts the table on its own line. With print's default " "
# separator a stray space lands in front of the table's header row, shifting
# the column names one character right of their values.
print("\nComparison of Models (Macro-Average):", results_df, sep="\n")


Comparison of Models (Macro-Average):
                Precision    Recall  F1-score
Decision Tree   0.810628  0.810921  0.810761
Logistic Reg.   0.903634  0.903947  0.903709
XGBoost         0.890722  0.890789  0.890584
KNN             0.889900  0.890263  0.890000
