In [13]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib
from nltk.corpus import stopwords

In [14]:
# Step 2: Download necessary NLTK data
import nltk

# Resources fetched for the tokenizer, stop-word list and lemmatizer used in
# the preprocessing step of this notebook.
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4')

for resource in NLTK_RESOURCES:
    # nltk.download() signals failure by returning False instead of raising,
    # so check it here rather than hitting a missing-resource error later.
    if not nltk.download(resource):
        raise RuntimeError(f"Failed to download NLTK resource: {resource}")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
# Step 3: Load dataset
# NOTE(review): absolute, machine-specific path -- adjust DATA_PATH when running elsewhere.
DATA_PATH = "E:/Python/news_classifier_project/Data/bbc-text.csv"
df = pd.read_csv(DATA_PATH)

# Normalise the header names (lower-case, surrounding whitespace removed), then
# map the alternative names 'article' / 'category' onto 'text' / 'label'.
# Headers that already use the target names are left untouched by rename().
df.columns = [c.lower().strip() for c in df.columns]
df = df.rename(columns={'article': 'text', 'category': 'label'})

# Fail fast with a clear message if the columns the later cells rely on are absent
missing_columns = {'text', 'label'} - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

# Display sample data
df.head()


Unnamed: 0,label,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [16]:
# Step 4: Clean and preprocess text
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Module-level helpers used by clean_text(); built once here instead of on every call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set -> cheap membership tests

def clean_text(text):
    """Normalise one raw article into a space-separated string of lemmas.

    Steps: lower-case the text, replace every character outside ``a-z``,
    ``0-9`` and whitespace with a space, tokenize with ``nltk.word_tokenize``,
    drop tokens that are in the module-level ``stop_words`` set or are a single
    character long, and lemmatize the remaining tokens.

    Args:
        text: Raw article text. Non-string values are converted with ``str()``;
            missing scalar values (``None``, ``NaN``, ``pd.NA``) are treated as
            empty text.

    Returns:
        str: The cleaned tokens joined by single spaces ("" if none survive).
    """
    # str(None) / str(nan) would otherwise inject the literal tokens
    # "none" / "nan" into the corpus.
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    normalized = re.sub(r'[^a-z0-9\s]', ' ', str(text).lower())  # remove special characters
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(normalized)
        if len(token) > 1 and token not in stop_words
    ]
    return " ".join(lemmas)

# Run every article through clean_text() and store the result next to the raw text
df['clean_text'] = df['text'].map(clean_text)

# Preview raw text, cleaned text and label side by side
preview_columns = ['text', 'clean_text', 'label']
df[preview_columns].head()


Unnamed: 0,text,clean_text,label
0,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...,tech
1,worldcom boss left books alone former worldc...,worldcom bos left book alone former worldcom b...,business
2,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester say rushed...,sport
3,yeading face newcastle in fa cup premiership s...,yeading face newcastle fa cup premiership side...,sport
4,ocean s twelve raids box office ocean s twelve...,ocean twelve raid box office ocean twelve crim...,entertainment


In [17]:
# Step 5: Split dataset
# 30% of the rows are held out for testing; stratifying on the label keeps the
# class proportions in both parts, and the fixed seed makes the split repeatable.
features = df['clean_text']
labels = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 1557
Testing samples: 668


In [18]:
# Step 6: Build ML pipeline
# Feature extraction: TF-IDF over unigrams and bigrams, at most 15,000 features
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), max_features=15000)

# Classifier: logistic regression with balanced class weights
logreg_step = LogisticRegression(
    class_weight='balanced',
    solver='lbfgs',
    max_iter=2000,
    C=2.0,
)

pipe = Pipeline(steps=[('tfidf', tfidf_step), ('logreg', logreg_step)])

# Train model (the fitted pipeline is the value displayed by this cell)
pipe.fit(X_train, y_train)


0,1,2
,steps,"[('tfidf', ...), ('logreg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,2.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [19]:
# Step 7: Evaluate performance
y_pred = pipe.predict(X_test)

# Overall accuracy as a percentage, rounded to two decimals
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print("Accuracy:", accuracy_pct, "%")

# Per-class precision / recall / F1 on the held-out split
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n")
print(report)


Accuracy: 97.6 %

Classification Report:

               precision    recall  f1-score   support

     business       0.97      0.97      0.97       153
entertainment       0.97      0.98      0.97       116
     politics       0.97      0.96      0.96       125
        sport       0.99      1.00      1.00       154
         tech       0.97      0.97      0.97       120

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



In [20]:
# Step 8: Try a manual test
sample = ["The government passed a new law related to tax reforms."]

# The pipeline was fitted on clean_text() output, so raw input has to go
# through the same preprocessing before it is handed to predict().
sample_clean = [clean_text(s) for s in sample]
print("Predicted Topic:", pipe.predict(sample_clean)[0])


Predicted Topic: politics


In [21]:
# Step 9: Save the trained pipeline
# Persist the fitted TF-IDF + logistic-regression pipeline to the working directory.
joblib.dump(value=pipe, filename="bbc_tfidf_pipeline.pkl")
print(" Model saved as bbc_tfidf_pipeline.pkl")


 Model saved as bbc_tfidf_pipeline.pkl


In [22]:
# Step 10: again to verify
# NOTE: joblib.load unpickles the file -- only load artifacts from a trusted source.
model = joblib.load("bbc_tfidf_pipeline.pkl")
sample2 = ["India won the cricket world cup."]

# The saved pipeline holds only the TF-IDF and classifier steps, so the
# clean_text() preprocessing used for training must be applied here first.
sample2_clean = [clean_text(s) for s in sample2]
print("Predicted Topic:", model.predict(sample2_clean)[0])


Predicted Topic: sport


In [23]:
samples = [
    "The company announced a major merger deal worth $5 billion.",
    "Stock markets saw a sharp increase after the interest rate cut.",
    "The CEO said the new investment will boost economic growth."
]

# Preprocess the inputs the same way as the training data, then predict them
# in a single batch instead of one predict() call per sentence.
sample_predictions = pipe.predict([clean_text(s) for s in samples])
for s, predicted in zip(samples, sample_predictions):
    print(s, "->", predicted)


The company announced a major merger deal worth $5 billion. -> business
Stock markets saw a sharp increase after the interest rate cut. -> business
The CEO said the new investment will boost economic growth. -> business


In [24]:
mixed_samples = [
    "The government passed a new law to reduce corporate taxes.",
    "The company announced record profits in the last quarter.",
    "The football team won the national championship after a dramatic penalty shootout.",
    "A new smartphone with foldable screen technology was unveiled today.",
    "The actor received an award for his outstanding performance in the movie.",
    "The central bank decided to raise interest rates to control inflation.",
    "The parliament voted in favor of the new education reform bill.",
    "The music festival attracted thousands of fans from around the world.",
    "The tennis player advanced to the final after a straight-sets victory.",
    "Experts warn that AI-driven cyberattacks are becoming more sophisticated."
]

# The pipeline was fitted on clean_text() output; feed it identically
# preprocessed text so inference sees the same kind of tokens as training.
predictions = pipe.predict([clean_text(text) for text in mixed_samples])

# Print the original (uncleaned) sentence next to its predicted label
for text, label in zip(mixed_samples, predictions):
    print(f"{label.upper():<13} | {text}")


BUSINESS      | The government passed a new law to reduce corporate taxes.
BUSINESS      | The company announced record profits in the last quarter.
SPORT         | The football team won the national championship after a dramatic penalty shootout.
TECH          | A new smartphone with foldable screen technology was unveiled today.
ENTERTAINMENT | The actor received an award for his outstanding performance in the movie.
BUSINESS      | The central bank decided to raise interest rates to control inflation.
POLITICS      | The parliament voted in favor of the new education reform bill.
ENTERTAINMENT | The music festival attracted thousands of fans from around the world.
SPORT         | The tennis player advanced to the final after a straight-sets victory.
BUSINESS      | Experts warn that AI-driven cyberattacks are becoming more sophisticated.
