In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [3]:
true_news = pd.read_csv("True.csv")
fake_news = pd.read_csv("Fake.csv")

In [4]:
true_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
true_news.shape

(21417, 4)

In [6]:
fake_news.tail()

Unnamed: 0,title,text,subject,date
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"
23480,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016"


In [7]:
fake_news.shape

(23481, 4)

In [8]:
fake_news["label"]="1"
true_news["label"]="0"

In [9]:
# Concat the two datasets into single dataset
news = pd.concat([true_news,fake_news],ignore_index=True)

In [10]:
news.sample(n=5,random_state=28)

Unnamed: 0,title,text,subject,date,label
34765,WHY IS LORETTA LYNCH OPENING For One Of The Bi...,Loretta Lynch is head of the DOJ and is openin...,politics,"Aug 4, 2016",1
26939,NYPD SLAPS Trump Down Cold For Trying To Poli...,"There is nothing that Donald Trump, our reside...",News,"July 10, 2016",1
1963,U.S. states hit back at EPA chief over climate...,WASHINGTON (Reuters) - Democratic state offici...,politicsNews,"August 31, 2017",0
43842,"Boiler Room EP #80 – Heads They Win, Tails You...",Tune in to the Alternate Current Radio Network...,US_News,"October 26, 2016",1
33911,HYSTERICAL! TUCKER CARLSON Slams Geraldo For P...,,politics,"Nov 5, 2016",1


In [11]:
news.shape

(44898, 5)

In [12]:
news["subject"].unique()

array(['politicsNews', 'worldnews', 'News', 'politics', 'Government News',
       'left-news', 'US_News', 'Middle-east'], dtype=object)

In [13]:
# Train the dataset using text and title 
# Combine text and title into single column as content
news['content'] = news['title'].fillna('') + news['text'].fillna('')

In [14]:
news.head()

Unnamed: 0,title,text,subject,date,label,content
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0,"As U.S. budget fight looms, Republicans flip t..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0,U.S. military to accept transgender recruits o...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0,Senior U.S. Republican senator: 'Let Mr. Muell...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0,FBI Russia probe helped by Australian diplomat...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0,Trump wants Postal Service to charge 'much mor...


In [15]:
# Text Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

tfidf = TfidfVectorizer(stop_words='english',max_df=0.7)
x_text = tfidf.fit_transform(news['content'])
y = news['label']

In [16]:
# Split the data into train and test
x_train,x_test,y_train,y_test = train_test_split(x_text,y,test_size=0.8,random_state=42)

In [17]:
# Train model using logistic regression 
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression()
logistic_model.fit(x_train,y_train)
y_pred = logistic_model.predict(x_test)


In [18]:
# ClassificationReport
from sklearn.metrics import classification_report,accuracy_score

print("ClassificationReport:",classification_report(y_test,y_pred))
print("AccuracyScore:",accuracy_score(y_test,y_pred))

ClassificationReport:               precision    recall  f1-score   support

           0       0.96      0.97      0.97     17073
           1       0.97      0.97      0.97     18846

    accuracy                           0.97     35919
   macro avg       0.97      0.97      0.97     35919
weighted avg       0.97      0.97      0.97     35919

AccuracyScore: 0.9697931456889112


In [19]:
# Train using RandomForest
from sklearn.ensemble import RandomForestClassifier

x_train_r,x_test_r,y_train_r,y_test_r = train_test_split(x_text,y,test_size=0.2,random_state=42)

In [20]:
# Train model

rf_model = RandomForestClassifier()
rf_model.fit(x_train_r,y_train_r)
y_pred_r = rf_model.predict(x_test_r)

In [21]:
print("ClassificationReport_RFC:",classification_report(y_test_r,y_pred_r))
print("AccuracyScore_RFC:",accuracy_score(y_test_r,y_pred_r))

ClassificationReport_RFC:               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4330
           1       0.99      0.99      0.99      4650

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

AccuracyScore_RFC: 0.9928730512249443


In [22]:
# Train using SVM classifier
from sklearn.svm import SVC

# split the data 
x_train_s,x_test_s,y_train_s,y_test_s = train_test_split(x_text,y,test_size=0.2,random_state=42)

# Train model
svc_model = SVC(kernel='linear')
svc_model.fit(x_train_s,y_train_s)
y_pred = svc_model.predict(x_test_s)

In [26]:
print("ClassificationReport_SVC:",classification_report(y_test_s,y_pred))
print("AccuracyScore_SVC:",accuracy_score(y_test_s,y_pred))

ClassificationReport_SVC:               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4330
           1       1.00      0.99      0.99      4650

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

AccuracyScore_SVC: 0.9942093541202672


In [38]:
# Deep Learning training use LSTM or BERT
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, to_categorical
from keras.models import Sequential
from keras.layers import LSTM,Dense,Embedding

x = news['content']
y = news['label']

# Tokenization
token = Tokenizer(num_words=5000)
token.fit_on_texts(x)
x_seq = token.texts_to_sequences(x)
x_pad = pad_sequences(x_seq, maxlen=300)

In [48]:
# Split the data
x_train_ls,x_test_ls,y_train_ls,y_test_ls = train_test_split(x_pad,y,test_size=0.2,random_state=42)

In [54]:
print(set(y_train_ls))

{'1', '0'}


In [56]:
y_train_ls = y_train_ls.astype(int)
y_test_ls = y_test_ls.astype(int)

In [58]:
x_train_ls = np.array(x_train_ls)  # Must be numeric (e.g., padded sequences)
y_train_ls = np.array(y_train_ls,dtype='float32')  # Must be numeric (e.g., 0s and 1s)
y_test_ls = np.array(y_test_ls,dtype='float32') 

In [60]:
# LSTM Model
ls_model = Sequential()
ls_model.add(Embedding(input_dim=5000,output_dim=64,input_length=300))
ls_model.add(LSTM(64))
ls_model.add(Dense(1,activation='sigmoid'))

ls_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
ls_model.summary()

# Train model 
ls_model.fit(x_train_ls,y_train_ls,epochs=5,batch_size=64,validation_split=0.1)



Epoch 1/5
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 142ms/step - accuracy: 0.8821 - loss: 0.2647 - val_accuracy: 0.9819 - val_loss: 0.0671
Epoch 2/5
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 147ms/step - accuracy: 0.9858 - loss: 0.0506 - val_accuracy: 0.9880 - val_loss: 0.0532
Epoch 3/5
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 151ms/step - accuracy: 0.9912 - loss: 0.0311 - val_accuracy: 0.9894 - val_loss: 0.0407
Epoch 4/5
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 149ms/step - accuracy: 0.9868 - loss: 0.0383 - val_accuracy: 0.9861 - val_loss: 0.0468
Epoch 5/5
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 146ms/step - accuracy: 0.9823 - loss: 0.0508 - val_accuracy: 0.9680 - val_loss: 0.0921


<keras.src.callbacks.history.History at 0x21b14a29700>

In [64]:
# Evaluation
ls_loss,ls_accuracy = ls_model.evaluate(x_test_ls,y_test_ls)
print("LSTM accuracy:",ls_accuracy)

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 34ms/step - accuracy: 0.9682 - loss: 0.0900
LSTM accuracy: 0.9658129215240479
