In [1]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import re
import string 
import nltk
from collections import Counter

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

import xgboost as xgb

from config import *

# 1. Read dataset

In [2]:
# Relative location of the processed BBC news CSV.
PATH_PROCESSED_FILE = os.path.join("data", "bbc_news_data_processed.csv")

# Load the articles, report the frame's dimensions, then preview the first rows.
df = pd.read_csv(filepath_or_buffer=PATH_PROCESSED_FILE)
print(f"Shape of df: {df.shape}")
df.head(5)

Shape of df: (1490, 3)


Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom launch defenc lawyer defend former wo...,business
1,154,german busi confid slide german busi confid fe...,business
2,1101,bbc poll indic econom citizen major nation sur...,business
3,1976,lifestyl govern mobil choic faster better hard...,tech
4,917,enron boss payout eighteen former enron direct...,business


# 2. Extract data

In [3]:
# Fraction of the dataset held out for the test split (passed to train_test_split).
TEST_SIZE = 0.2

In [4]:
# Features: the article text column, used as-is (vectorization happens in the model pipeline).
X = df['Text']

# Targets: encode the string categories as integer class ids. The fitted encoder
# is kept so predicted ids can be mapped back to category names later on.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(np.array(df['Category']))

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (1490,)
Shape of y: (1490,)


In [5]:
# Hold out TEST_SIZE of the samples for evaluation; the fixed random_state keeps
# the split reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Sanity-check the sizes of the four resulting pieces.
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (1192,)
Shape of y_train: (1192,)
Shape of X_test: (298,)
Shape of y_test: (298,)


In [6]:
# Persist the fitted label encoder so that other code can map predicted class
# ids back to category names (via `classes_` / `inverse_transform`).
#
# joblib.dump does not create missing parent directories, so make sure the
# destination directory exists first; otherwise a fresh checkout fails here.
label_encoder_dir = os.path.dirname(LABEL_ENCODER_NAME)
if label_encoder_dir:  # empty when the path has no directory component
    os.makedirs(label_encoder_dir, exist_ok=True)

# To reload later: label_encoder = joblib.load(LABEL_ENCODER_NAME)
joblib.dump(label_encoder, LABEL_ENCODER_NAME)

['models/label_encoder.joblib']

# 3. Train model

In [7]:
# Number of target classes, taken from the project config.
num_classes = int(NUM_CLASSES)

# Two-step pipeline: TF-IDF features from raw text, then a multi-class XGBoost classifier.
tfidf_step = TfidfVectorizer()
xgb_step = xgb.XGBClassifier(objective='multi:softmax', num_class=num_classes)
model = make_pipeline(tfidf_step, xgb_step)

In [8]:
# Fit the TF-IDF + XGBoost pipeline on the training split.
model.fit(X_train, y_train)

# 4. Evaluation

## 4.1. Evaluation on test set

In [9]:
# Predict encoded class ids for the held-out test articles.
predictions = model.predict(X_test)

In [10]:
# Overall fraction of test articles whose predicted class matches the true class.
# `accuracy` is reused further down when logging metrics.
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1; rows are labelled with the encoded class ids.
print("Classification Report:")
report_text = classification_report(y_test, predictions)
print(report_text)

Accuracy: 0.9630872483221476
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95        75
           1       0.96      0.98      0.97        46
           2       0.95      0.98      0.96        56
           3       1.00      1.00      1.00        63
           4       0.98      0.88      0.93        58

    accuracy                           0.96       298
   macro avg       0.96      0.96      0.96       298
weighted avg       0.96      0.96      0.96       298



## 4.2. Test on new sample

In [11]:
# Free-form sentence used as a quick smoke test of the trained pipeline.
# NOTE(review): the training text previewed by df.head() looks lowercased and stemmed,
# while this sample is raw English — confirm whether the same preprocessing should be
# applied to new samples before prediction.
test_sample = r"This is the information about CR7. He is an excellent football player"

In [12]:
# The pipeline expects an iterable of documents, so wrap the single sample in a list
# and take the first (only) prediction, which is an encoded class id.
pred_index = model.predict([test_sample])[0]

# Map the encoded id back to its original category name.
pred_label = label_encoder.classes_[pred_index]
print(f"Predicted: {pred_label}")

Predicted: sport


# 5. Save model and log it to Comet

In [13]:
# Save the fitted pipeline (vectorizer + classifier) to disk.
#
# joblib.dump does not create missing parent directories, so make sure the
# destination directory exists first; otherwise a fresh checkout fails here.
trained_model_dir = os.path.dirname(PATH_TRAINED_XGB)
if trained_model_dir:  # empty when the path has no directory component
    os.makedirs(trained_model_dir, exist_ok=True)

joblib.dump(model, PATH_TRAINED_XGB)

['models/trained_xgb.joblib']

In [15]:
from comet_ml import Experiment

# SECURITY: never hardcode the Comet API key in the notebook — it ends up in
# version control and in every shared copy. Read it from the COMET_API_KEY
# environment variable instead. If the variable is unset, None is passed and
# Comet falls back to its own configuration lookup.
# NOTE(review): the key that was previously committed here should be treated as
# leaked and rotated in the Comet account settings.
comet_api_key = os.environ.get("COMET_API_KEY")

# Initialize Comet experiment
experiment = Experiment(api_key=comet_api_key)


[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/home/thaiv7/Desktop/project_news_classify' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/thaiv7/general/bc7ffef487dc44e888e3f909eb4fdfc2



In [16]:
# Upload the serialized pipeline stored at PATH_TRAINED_XGB (written by the
# "Save model" cell) to the Comet experiment under the model name "xgb".
experiment.log_model("xgb", PATH_TRAINED_XGB)

{'web': 'https://www.comet.com/api/asset/download?assetId=a95b3fc23a554df98f05b136aa6a007b&experimentKey=bc7ffef487dc44e888e3f909eb4fdfc2',
 'api': 'https://www.comet.com/api/rest/v2/experiment/asset/get-asset?assetId=a95b3fc23a554df98f05b136aa6a007b&experimentKey=bc7ffef487dc44e888e3f909eb4fdfc2',
 'assetId': 'a95b3fc23a554df98f05b136aa6a007b'}

In [17]:
# Log the test-set accuracy computed in the evaluation cell as a Comet metric.
experiment.log_metric("accuracy", accuracy)

In [18]:
# Finish the Comet experiment; Comet prints its run summary when the experiment ends.
experiment.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/thaiv7/general/bc7ffef487dc44e888e3f909eb4fdfc2
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy : 0.9630872483221476
[1;38;5;39mCOMET INFO:[0m   Uploads:
[1;38;5;39mCOMET INFO:[0m     conda-environment-definition : 1
[1;38;5;39mCOMET INFO:[0m     conda-info                   : 1
[1;38;5;39mCOMET INFO:[0m     conda-specification          : 1
[1;38;5;39mCOMET INFO:[0m     environment details          : 1
[1;38;5;39mCOMET INFO:[0m     filename                     : 1
[1;38;5;39mCOMET INFO:[0m