In [61]:
import numpy as np
import pandas as pd

In [62]:
true = pd.read_csv('truenewsdataset.csv')
fake = pd.read_csv('fakenewsdataset.csv')

In [63]:
true['text'].head()

0    Donald Trump just couldn t wish all Americans ...
1    House Intelligence Committee Chairman Devin Nu...
2    On Friday, it was revealed that former Milwauk...
3    On Christmas day, Donald Trump announced that ...
4    Pope Francis used his annual Christmas Day mes...
Name: text, dtype: object

In [64]:
fake.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [65]:
true['label'] = 1
fake['label'] = 0

In [66]:
news = pd.concat([fake, true], axis=0)

In [67]:
news.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [68]:
news = news.drop(['title','subject','date'], axis=1)

In [69]:
news = news.sample(frac=1)

In [70]:
news.reset_index(inplace=True)

In [71]:
news.drop(['index'], axis=1, inplace= True)
news.head()

Unnamed: 0,text,label
0,That s what we re talking about! Another campa...,1
1,WASHINGTON (Reuters) - Supporters of Obamacare...,0
2,Using state employees to fill a room for polit...,1
3,Every American should know the corrupt UN has ...,1
4,British comic Peter Serafinowicz has made ever...,1


In [72]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [73]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sudee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sudee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\sudee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sudee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [74]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [77]:
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespaces
    text = text.strip()
    return text

In [78]:
def preprocess_text(text):
    # Step 1: Clean text
    text = clean_text(text)
    # Step 2: Tokenize
    tokens = word_tokenize(text)
    # Step 3: Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [79]:
news['text'] = news['text'].apply(preprocess_text)

In [80]:
news['text'].head()

0    talking another campaign promise kept wonder d...
1    washington reuters supporter obamacare staged ...
2    using state employee fill room political purpo...
3    every american know corrupt un continue one or...
4    british comic peter serafinowicz made everybod...
Name: text, dtype: object

In [81]:
x = news['text']
y = news['label']

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [83]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [84]:
vectorizer = TfidfVectorizer(max_features=5000)
xv_train = vectorizer.fit_transform(x_train)
xv_test = vectorizer.transform(x_test)

# RANDOM FOREST MODEL

In [85]:
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(xv_train, y_train)

In [86]:
# Predictions and accuracy
y_pred = model.predict(xv_test)
print("Fake News Detection Accuracy:", accuracy_score(y_test, y_pred))

Fake News Detection Accuracy: 0.9988121752041574


In [87]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6366
           1       1.00      1.00      1.00      7104

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



# EXPORTING THE MODEL

In [88]:
import pickle

In [89]:
with open('fakenewsmodel.pkl', 'wb') as file:
    pickle.dump(model, file)

# SYNTAX FOR IMPORTING THE NODEL

In [90]:
import pickle

# Load the model from a file
with open('fakenewsmodel.pkl', 'rb') as file:
    loaded_model = pickle.load(file)


In [93]:
import joblib

In [94]:
joblib.dump(model,"Newsdetection_joblib")

['Newsdetection_joblib']

In [95]:
news.head()

Unnamed: 0,text,label
0,talking another campaign promise kept wonder d...,1
1,washington reuters supporter obamacare staged ...,0
2,using state employee fill room political purpo...,1
3,every american know corrupt un continue one or...,1
4,british comic peter serafinowicz made everybod...,1


# MANUAL TESTING

In [96]:
def output_label(n):
    if n==0:
        return "news is fake"
    elif n==1:
        return "news is true"
    

In [97]:
def manual_testing(news):
    test_news = {"text":[news]}
    new_def_test = pd.DataFrame(test_news)
    new_def_test["text"] = new_def_test["text"].apply(preprocess_text) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorizer.transform(new_x_test)
    pred_rfm = loaded_model.predict(new_xv_test)
    return "Model prediction : {}".format(output_label(pred_rfm[0]))

In [98]:
news_art = str(input())

In [99]:
manual_testing(news_art)

'Model prediction : news is true'