# News Article Classification

This project builds a simple machine learning model that classifies news articles into five categories: business, education, entertainment, sports, and technology.
We will load the dataset, check and prepare it, train three models, compare their results, and save the best model for future use.

In [26]:
# Install and import the libraries we need
!pip install pandas scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle




In [27]:
# Location of the train and test CSV files in the project's GitHub repository
train_url = 'https://raw.githubusercontent.com/DareSandtech/2501PTDS_Classification_Project/main/Data/processed/train.csv'
test_url = 'https://raw.githubusercontent.com/DareSandtech/2501PTDS_Classification_Project/main/Data/processed/test.csv'

# Read both files (train first, then test). `df` is the training frame used by the
# following cells.
# NOTE(review): `test_df` is not referenced by any later cell visible here — confirm
# whether it is still needed.
df, test_df = (pd.read_csv(url) for url in (train_url, test_url))

# Show the first rows and the (rows, columns) size of the training frame
print(df.head())
print("Shape of dataset:", df.shape)


                                           headlines  \
0  RBI revises definition of politically-exposed ...   
1  NDTV Q2 net profit falls 57.4% to Rs 5.55 cror...   
2  Akasa Air ‘well capitalised’, can grow much fa...   
3  India’s current account deficit declines sharp...   
4  States borrowing cost soars to 7.68%, highest ...   

                                         description  \
0  The central bank has also asked chairpersons a...   
1  NDTV's consolidated revenue from operations wa...   
2  The initial share sale will be open for public...   
3  The current account deficit (CAD) was 3.8 per ...   
4  The prices shot up reflecting the overall high...   

                                             content  \
0  The Reserve Bank of India (RBI) has changed th...   
1  Broadcaster New Delhi Television Ltd on Monday...   
2  Homegrown server maker Netweb Technologies Ind...   
3  India’s current account deficit declined sharp...   
4  States have been forced to pay through thei

Next, we check which categories are available and whether there are any missing values.

In [28]:
# Print the number of articles per category, followed by the number of
# missing values in each column
for summary in (df['category'].value_counts(), df.isnull().sum()):
    print(summary)


category
education        1520
technology       1280
business         1120
entertainment     960
sports            640
Name: count, dtype: int64
headlines      0
description    0
content        0
url            0
category       0
dtype: int64


# Step 4: Combine and Clean the Text


We will combine the headlines and content columns into a single text column.
This helps the model learn from both short summaries and the full article text.

In [29]:
# Combine headline and content into one text column.
# No fillna is needed: the missing-value check reported zero nulls in every column.
df['text'] = df['headlines'] + ' ' + df['content']

# The source columns are deliberately left in `df`. Dropping them here would make
# this cell fail with a KeyError on 'headlines' if it is ever run a second time.
# Only 'text' and 'category' are used for modeling.

# Quick preview of the two modeling columns
df[['text', 'category']].head()


Unnamed: 0,text,category
0,RBI revises definition of politically-exposed ...,business
1,NDTV Q2 net profit falls 57.4% to Rs 5.55 cror...,business
2,"Akasa Air ‘well capitalised’, can grow much fa...",business
3,India’s current account deficit declines sharp...,business
4,"States borrowing cost soars to 7.68%, highest ...",business


In [30]:
 # Step 5: Split the data into training and validation sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
features, targets = df['text'], df['category']
X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# Report how many articles ended up in each split
for message, split in (("Training samples:", X_train), ("Validation samples:", X_valid)):
    print(message, len(split))


Training samples: 4416
Validation samples: 1104


# Step 6: Convert Text to Numbers (TF-IDF)

Now we will convert our text data into numbers using TF-IDF. This transforms the text articles into numerical features that can be fed into machine learning models.


In [31]:
# Turn the article text into TF-IDF feature matrices
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings: a 5000-feature limit and the built-in English stop-word list
tfidf_options = {'max_features': 5000, 'stop_words': 'english'}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vectorizer is fitted on the training split only; the validation split is
# transformed with that already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_valid_vec = vectorizer.transform(X_valid)

print("Vectorized shape:", X_train_vec.shape)


Vectorized shape: (4416, 5000)


# Step 7: Train the Models

Now we will train three different models:
- Logistic Regression
- Naive Bayes
- Random Forest

We will compare their accuracy to see which one performs best.


In [32]:
# Train three models and check their performance
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create the models.
# Random Forest is stochastic, so it gets a fixed seed (the same value used for the
# train/validation split) to make its scores reproducible from run to run.
log_reg = LogisticRegression(max_iter=1000)
nb = MultinomialNB()
rf = RandomForestClassifier(random_state=42)

# Train each model on the TF-IDF features of the training split
for model in (log_reg, nb, rf):
    model.fit(X_train_vec, y_train)

# Make predictions on the validation split
log_reg_pred = log_reg.predict(X_valid_vec)
nb_pred = nb.predict(X_valid_vec)
rf_pred = rf.predict(X_valid_vec)

# Print the accuracy and the per-class classification report for every model.
# Headings are spelled out in full so the printed text stays exactly as before.
reports = (
    ("Logistic Regression Results:", log_reg_pred),
    ("\nNaive Bayes Results:", nb_pred),
    ("\nRandom Forest Results:", rf_pred),
)
for heading, predictions in reports:
    print(heading)
    print("Accuracy:", accuracy_score(y_valid, predictions))
    print(classification_report(y_valid, predictions))


Logistic Regression Results:
Accuracy: 0.9827898550724637
               precision    recall  f1-score   support

     business       0.97      0.98      0.97       245
    education       1.00      0.99      0.99       274
entertainment       1.00      0.99      0.99       178
       sports       0.98      0.99      0.98       137
   technology       0.97      0.97      0.97       270

     accuracy                           0.98      1104
    macro avg       0.98      0.98      0.98      1104
 weighted avg       0.98      0.98      0.98      1104


Naive Bayes Results:
Accuracy: 0.9827898550724637
               precision    recall  f1-score   support

     business       0.98      0.96      0.97       245
    education       0.99      1.00      0.99       274
entertainment       0.98      0.99      0.99       178
       sports       0.98      0.99      0.99       137
   technology       0.98      0.97      0.98       270

     accuracy                           0.98      1104
    ma

# Understanding the Results

From the results above, we can see that **Logistic Regression** and **Naive Bayes**
achieved the same validation accuracy of about **98%**.  
This means both models classify the news articles correctly almost all of the time.

**Random Forest** had a slightly lower accuracy (around 96%),
which means it was not as strong for this type of text data.

Since Logistic Regression and Naive Bayes give very similar results,
we can choose either of them as our final model.  



# Step 8: Save the Best Model

We will use **Logistic Regression** as the final model (it tied with Naive Bayes on validation accuracy), and save it so we can use it later.  
We will also save the TF-IDF vectorizer because it is needed to change new text
into numbers in the same way as our training data.


In [35]:
# Persist the chosen model together with the TF-IDF vectorizer

import pickle

# Logistic Regression and Naive Bayes reached the same validation accuracy;
# Logistic Regression is the one kept as the final model.
best_model = log_reg

# Each object is written to its own pickle file in the current working directory.
# The model is saved first, then the vectorizer.
artifacts = (
    ('best_model.pkl', best_model),
    ('vectorizer.pkl', vectorizer),
)
for filename, artifact in artifacts:
    with open(filename, 'wb') as out_file:
        pickle.dump(artifact, out_file)

print("Best model and vectorizer have been saved successfully!")


Best model and vectorizer have been saved successfully!


# Step 9: Conclusion

In this project, we built a simple text classification model for news articles.

We trained three models:
- Logistic Regression  
- Naive Bayes  
- Random Forest  

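After testing and comparing the results, we found that **Logistic Regression** and **Naive Bayes** tied for the best validation accuracy (about 98%), and we selected **Logistic Regression** as the final model.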

We then saved the best model and the TF-IDF vectorizer so that we can use them later in a Streamlit web app
to make predictions on new news articles.


# Step 10: Test the Saved Model

Now we will load the saved model and TF-IDF vectorizer to test them.  
We will give the model a new example of a news article and see which category it predicts.


In [34]:
# Reload the pickled model and vectorizer, then classify one new sentence
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code while loading; only use it on
    # files you created yourself, such as the two written by this notebook.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


loaded_model = _load_pickle('best_model.pkl')
loaded_vectorizer = _load_pickle('vectorizer.pkl')

# A single example article, wrapped in a list
new_text = ["The stock market saw record growth as investors gained confidence in the economy."]

# The text must go through the saved vectorizer before the model can score it
new_text_vec = loaded_vectorizer.transform(new_text)
prediction = loaded_model.predict(new_text_vec)

# `prediction` holds one label per input text; show the first (and only) one
print("Predicted category:", prediction[0])


Predicted category: business
