In [1]:
import pandas as pd
import numpy as np

In [3]:
df_body = pd.read_csv('news body.csv')
df_heading = pd.read_csv('news heading.csv')

In [4]:
df_body.head()

Unnamed: 0,Body,Class
0,hnbs managing director ceo jonathan alles capa...,business
1,emirate announced resumption colombo male serv...,business
2,financial servicesheavy conglomerate lolc rega...,business
3,investor confidence colombo stock exchange rec...,business
4,sunshine holding diversified holding company i...,business


In [5]:
df_heading.head()

Unnamed: 0,Heading,Class
0,rebuilding future depends bank respond crisis ...,business
1,emirate resume colombo male flight st september,business
2,lolc reclaims crown valuable stock,business
3,cse total market turnover hit r billion within...,business
4,sunshine holding drive business value transfor...,business


In [6]:
df_heading["Heading"][0]

'rebuilding future depends bank respond crisis jonathan alles'

In [8]:
df_body["Body"][0]

'hnbs managing director ceo jonathan alles capacity chairman prestigious asian banker association aba premier platform asian bank inaugurated aba th general meeting conference initially planned hosted sri lanka year event taking place virtually august theme asian bank achieving sustainable growth new normal attended important figure banking across asian region addressing virtual gathering alles highlighted critical role bank played supporting economic recovery postcovid today bank across globe working de facto delivery system delivering cash need however clear end sight virus still rampant bank acutely aware respond crisis determine rebuild future providing support government customer crucial crisis require bank deploy tool developed since global financial crisis lead coherent extensive response unchartered territory one thing certain strong banking sector fundamental strong recovery alles also emphasized importance innovation technological improvement sector addressing broader market 

In [None]:
#Merge datasets using the positions since ID isn't given
dataset = pd.DataFrame({
    'heading': df_heading.iloc[:, 0],  
    'body': df_body.iloc[:, 0],         
    'category': df_heading.iloc[:, 1]   
})

In [11]:
dataset.head()

Unnamed: 0,heading,body,category
0,rebuilding future depends bank respond crisis ...,hnbs managing director ceo jonathan alles capa...,business
1,emirate resume colombo male flight st september,emirate announced resumption colombo male serv...,business
2,lolc reclaims crown valuable stock,financial servicesheavy conglomerate lolc rega...,business
3,cse total market turnover hit r billion within...,investor confidence colombo stock exchange rec...,business
4,sunshine holding drive business value transfor...,sunshine holding diversified holding company i...,business


In [14]:
dataset.shape

(800, 3)

Regex removal

In [16]:
import re

def clean_news_text(text):
    if pd.isna(text): return ""
    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    return text.lower().strip()

dataset['clean_heading'] = dataset['heading'].fillna('').apply(clean_news_text)
dataset['clean_body'] = dataset['body'].fillna('').apply(clean_news_text)
dataset['full_context'] = dataset['clean_heading'] + ' ' + dataset['clean_body']

dataset = dataset[dataset['full_context'].str.len() > 50].reset_index(drop=True)
print(f"After filtering : {len(dataset)} clean articles")


After filtering : 799 clean articles


Vectorization


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=3000,          
    min_df=2,             # to avoid typo and words that appear < 2 articles          
    max_df=0.95,          # to remove constantly appearing words
    ngram_range=(1,2),    # allow 2 word phrases
    stop_words='english'        
)


In [20]:
X = vectorizer.fit_transform(dataset['full_context'])

Label Encoding

In [21]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(dataset['category'])

Model Training and Evaluation

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression(multi_class='multinomial', max_iter=300).fit(X_train, y_train)

print(f"Accuracy: {clf.score(X_test, y_test):.1%}") 


Accuracy: 92.5%




Saving the model

In [25]:
import pickle

with open('classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)
    
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
    
with open('categories.pkl', 'wb') as f:
    pickle.dump(dataset['category'].unique(), f)

print("Models saved with pickle!")


Models saved with pickle!
