<a href="https://colab.research.google.com/github/Jeo-Jeo/project-News-classification/blob/main/News_Article_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openpyxl




In [3]:
import pandas as pd

true_df = pd.read_excel("True.csv.xlsx", engine='openpyxl')
fake_df = pd.read_excel("Fake.csv.xlsx", engine='openpyxl')

true_df['label'] = 1
fake_df['label'] = 0

true_df = true_df[['text', 'label']]
fake_df = fake_df[['text', 'label']]

df = pd.concat([true_df, fake_df], ignore_index=True)
df.dropna(inplace=True)
df = df.sample(frac=1).reset_index(drop=True)

df.head()


Unnamed: 0,text,label
0,,0
1,The Democrats are fighting against closing our...,0
2,DUBLIN (Reuters) - Brexit negotiators in Bruss...,1
3,NEW YORK (Reuters) - A U.S. judge on Thursday ...,1
4,Stop allowing the progressives to drive God o...,0


In [4]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    text = str(text).lower()
    text = ''.join([c for c in text if c not in string.punctuation])
    tokens = text.split()
    tokens = [stemmer.stem(w) for w in tokens if w not in stop_words]
    return ' '.join(tokens)

df['cleaned'] = df['text'].apply(preprocess)
df[['text', 'cleaned']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,text,cleaned
0,,
1,The Democrats are fighting against closing our...,democrat fight close border mexico everi step ...
2,DUBLIN (Reuters) - Brexit negotiators in Bruss...,dublin reuter brexit negoti brussel dublin lon...
3,NEW YORK (Reuters) - A U.S. judge on Thursday ...,new york reuter us judg thursday repeatedli pr...
4,Stop allowing the progressives to drive God o...,stop allow progress drive god land ben carson ...


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned'])

y = df['label']


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
X.shape  # (rows, features)


(44919, 5000)

In [8]:
X_train.shape, X_test.shape


((35935, 5000), (8984, 5000))

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9896482635796973
Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4736
           1       0.99      0.99      0.99      4248

    accuracy                           0.99      8984
   macro avg       0.99      0.99      0.99      8984
weighted avg       0.99      0.99      0.99      8984



In [10]:
text = "Government announces new employment policy."
cleaned = preprocess(text)
vec = tfidf.transform([cleaned])
prediction = model.predict(vec)[0]
print("Prediction:", "REAL" if prediction == 1 else "FAKE")


Prediction: FAKE


In [11]:
import pickle

pickle.dump(model, open("news_model.pkl", "wb"))
pickle.dump(tfidf, open("vectorizer.pkl", "wb"))

from google.colab import files
files.download("news_model.pkl")
files.download("vectorizer.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>