In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Fetch the NLTK stop-word corpus that stopwords.words() reads from later on.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the training split from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer="train.csv")
data.head(n=5)

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [None]:
# English stop words held in a set, since preprocess_text tests membership per token.
stop_words = set(stopwords.words("english"))
# A single stemmer instance shared by every preprocess_text call.
ps = PorterStemmer()

def preprocess_text(text):
    """Lower-case, drop English stop words and Porter-stem a piece of text.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the float NaN that
        pandas uses for a missing cell, or None) is treated as empty text.

    Returns
    -------
    str
        The remaining stemmed tokens joined by single spaces; "" when the
        input is empty, not a string, or contains only stop words.

    Notes
    -----
    Tokens are produced by whitespace splitting only, so punctuation stays
    attached to words (e.g. "the," is not recognised as the stop word "the").
    """
    # Missing cells reach us as float NaN / None, which have no .lower();
    # return an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ""

    tokens = text.lower().split()
    # The stop-word check runs on the raw lower-cased token, before stemming.
    return " ".join(ps.stem(token) for token in tokens if token not in stop_words)

# Fill missing descriptions with "" before cleaning, mirroring how the test
# file is handled further down; a NaN would otherwise reach text.lower().
data['clean_text'] = data['Description'].fillna('').apply(preprocess_text)

In [None]:

# Inputs are the cleaned descriptions; targets are the class labels.
X = data['clean_text']
y = data['Class Index']

# Split the raw text BEFORE vectorizing. Fitting TF-IDF on the full corpus
# would let the held-out rows influence the vocabulary and IDF weights
# (data leakage) and make the hold-out score optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn vocabulary / IDF from the training rows only, then reuse that fitted
# vectorizer for the held-out rows (and for any later unseen text).
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Kept so any later code that refers to X_tfidf keeps working; it is now the
# whole corpus projected with the training-only vocabulary.
X_tfidf = tfidf.transform(X)

X_train.shape, X_test.shape

((96000, 55973), (24000, 55973))

In [None]:
# Multinomial Naive Bayes with default settings, trained on the TF-IDF training split.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [None]:

# Score the classifier on the held-out part of train.csv.
y_pred = model.predict(X_test)

holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)

print("Accuracy:", holdout_accuracy)
print("\nClassification Report:\n")
print(holdout_report)


Accuracy: 0.895625

Classification Report:

              precision    recall  f1-score   support

           1       0.91      0.89      0.90      5956
           2       0.94      0.97      0.96      6058
           3       0.85      0.86      0.86      5911
           4       0.88      0.86      0.87      6075

    accuracy                           0.90     24000
   macro avg       0.90      0.90      0.90     24000
weighted avg       0.90      0.90      0.90     24000



In [None]:

# Smoke test on one unseen headline: clean it with the same function and
# project it with the already-fitted vectorizer before predicting.
new_text = ["AOL Properties Sign Girafa For Thumbnail Search Images"]
new_text_clean = [preprocess_text(headline) for headline in new_text]
new_text_tfidf = tfidf.transform(new_text_clean)

prediction = model.predict(new_text_tfidf)
predicted_label = prediction[0]
print("Predicted Category:", predicted_label)


Predicted Category: 4


In [None]:
# Load the separate evaluation file and preview the first rows.
test_data = pd.read_csv(filepath_or_buffer="test.csv")
test_data.head(n=5)

Unnamed: 0,Class Index,Title,Description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [None]:
# Replace missing descriptions with "" so preprocess_text always gets a string.
test_data['Description'] = test_data['Description'].fillna(value='')
test_data['clean_text'] = test_data['Description'].apply(preprocess_text)

# transform (not fit_transform): reuse the vocabulary learned during training.
X_test_tfidf = tfidf.transform(test_data['clean_text'])

print(
    "Shape of preprocessed test data (TF-IDF vectors):",
    X_test_tfidf.shape,
    sep="\n",
)

Shape of preprocessed test data (TF-IDF vectors):
(4319, 55973)


In [None]:
# Compare the predictions for test.csv against its labels.
y_test_true = test_data['Class Index']
y_pred_test = model.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test_true, y_pred_test)
test_report = classification_report(y_test_true, y_pred_test)

print("Accuracy on test data:", test_accuracy)
print("\nClassification Report on test data:\n")
print(test_report)

Accuracy on test data: 0.8879370224589025

Classification Report on test data:

              precision    recall  f1-score   support

           1       0.90      0.88      0.89      1107
           2       0.94      0.97      0.95      1082
           3       0.84      0.86      0.85      1041
           4       0.87      0.85      0.86      1089

    accuracy                           0.89      4319
   macro avg       0.89      0.89      0.89      4319
weighted avg       0.89      0.89      0.89      4319

