In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [2]:
# Import the dataset for fake and true news
data_fake = pd.read_csv("Fake.csv")
data_true = pd.read_csv("True.csv")

In [3]:
# Display the first few rows of the fake news dataset
data_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
# Set class labels: Fake as 0 and True as 1
data_fake["class"] = 0
data_true["class"] = 1

In [5]:
# Check the shape (number of rows and columns) of the datasets
data_fake.shape, data_true.shape

((23481, 5), (21417, 5))

In [6]:
#Removing last 10 rows from the dataset
data_fake_manual_testing = data_fake.tail (10)
for i in range (23470, 23460, -1):
    data_fake.drop([i], axis = 0, inplace = True)
    
data_true_manual_testing = data_true.tail (10)
for i in range (21406, 21396, -1):
    data_true.drop([i], axis = 0, inplace = True)

In [7]:
# Check the updated shape of both datasets
data_fake.shape, data_true.shape

((23471, 5), (21407, 5))

In [8]:
# Assign class labels to the manual testing data
data_fake_manual_testing["class"] = 0
data_true_manual_testing["class"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_fake_manual_testing["class"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_true_manual_testing["class"] = 1


In [9]:
# Concatenate the fake and true datasets into one
data_merge = pd.concat([data_fake, data_true], axis = 0)
data_merge.tail(10)

Unnamed: 0,title,text,subject,date,class
21407,"Mata Pires, owner of embattled Brazil builder ...","SAO PAULO (Reuters) - Cesar Mata Pires, the ow...",worldnews,"August 22, 2017",1
21408,"U.S., North Korea clash at U.N. forum over nuc...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",1
21409,"U.S., North Korea clash at U.N. arms forum on ...",GENEVA (Reuters) - North Korea and the United ...,worldnews,"August 22, 2017",1
21410,Headless torso could belong to submarine journ...,COPENHAGEN (Reuters) - Danish police said on T...,worldnews,"August 22, 2017",1
21411,North Korea shipments to Syria chemical arms a...,UNITED NATIONS (Reuters) - Two North Korean sh...,worldnews,"August 21, 2017",1
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


In [10]:
# Check the shape of the merged dataset
data_merge.shape

(44878, 5)

In [11]:
# Count the number of samples in each class
data_merge['class'].value_counts()

0    23471
1    21407
Name: class, dtype: int64

In [12]:
# List the columns in the merged dataset
data_merge.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [13]:
# Remove unnecessary columns like 'title', 'subject', and 'date'
data = data_merge.drop(['title','subject','date'], axis = 1)

In [14]:
# Check for missing values in the dataset
data.isnull().sum()

text     0
class    0
dtype: int64

In [15]:
# Shuffle the dataset to randomize the order
data = data.sample(frac = 1)

In [16]:
data.head()

Unnamed: 0,text,class
19725,NEW YORK (Reuters) - U.S. President Donald Tru...,1
291,(Reuters) - The tax overhaul legislation passe...,1
4560,"In a perfect democracy, there would be multipl...",0
20251,"COX S BAZAR, Bangladesh (Reuters) - Pressure m...",1
15714,Rate your mom for Mother s Day sounds like the...,0


In [17]:
data.isnull().sum()

text     0
class    0
dtype: int64

In [18]:
# Reset the index of the dataset and remove the old index column
data.reset_index(inplace = True)
data.drop(['index'],axis = 1, inplace = True)

In [19]:
data.head()

Unnamed: 0,text,class
0,NEW YORK (Reuters) - U.S. President Donald Tru...,1
1,(Reuters) - The tax overhaul legislation passe...,1
2,"In a perfect democracy, there would be multipl...",0
3,"COX S BAZAR, Bangladesh (Reuters) - Pressure m...",1
4,Rate your mom for Mother s Day sounds like the...,0


In [23]:
# Define a function to clean the text data
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [26]:
# Measure the time it takes to clean the text data in the 'text' column
%%time
data['text_cleaned'] = data['text'].apply(wordopt)

CPU times: total: 16.5 s
Wall time: 33.7 s


In [27]:
data.head()

Unnamed: 0,text,class,text_cleaned
0,NEW YORK (Reuters) - U.S. President Donald Tru...,1,new york reuters u s president donald tru...
1,(Reuters) - The tax overhaul legislation passe...,1,reuters the tax overhaul legislation passe...
2,"In a perfect democracy, there would be multipl...",0,in a perfect democracy there would be multipl...
3,"COX S BAZAR, Bangladesh (Reuters) - Pressure m...",1,cox s bazar bangladesh reuters pressure m...
4,Rate your mom for Mother s Day sounds like the...,0,rate your mom for mother s day sounds like the...


In [28]:
# Define the input (x) and target (y) variables
x = data['text']
y = data['class']

In [29]:
x.head(10)

0    NEW YORK (Reuters) - U.S. President Donald Tru...
1    (Reuters) - The tax overhaul legislation passe...
2    In a perfect democracy, there would be multipl...
3    COX S BAZAR, Bangladesh (Reuters) - Pressure m...
4    Rate your mom for Mother s Day sounds like the...
5    An Asian student reporter on assignment for ES...
6    This is just too much! Protesting the flag tha...
7    The mayor of Charlottesville, Virginia confirm...
8    LONDON (Reuters) - Nine months after Prime Min...
9    President Trump thanked supporters who had gat...
Name: text, dtype: object

In [30]:
y.head()

0    1
1    1
2    0
3    1
4    0
Name: class, dtype: int64

In [32]:
# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x,y,  test_size = 0.25)

In [33]:
# Import the TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorization = TfidfVectorizer()

# Transform the training and testing text data into TF-IDF vectors
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [34]:
# Import the Logistic Regression model
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
LR = LogisticRegression()

# Train the Logistic Regression model on the training data
LR.fit(xv_train, y_train)

CPU times: total: 3.86 s
Wall time: 2.84 s


In [35]:
# Make predictions using the trained Logistic Regression model on the test data
pred_lr = LR.predict(xv_test)

In [36]:
# Calculate and print the accuracy score of the Logistic Regression model
LR.score(xv_test, y_test)

0.9867201426024955

In [37]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5853
           1       0.98      0.99      0.99      5367

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [38]:
# Import the Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

In [39]:
pred_dt = DT.predict(xv_test)

In [40]:
DT.score(xv_test, y_test)

0.9954545454545455

In [41]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5853
           1       0.99      1.00      1.00      5367

    accuracy                           1.00     11220
   macro avg       1.00      1.00      1.00     11220
weighted avg       1.00      1.00      1.00     11220



In [42]:
# Import the Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

GB = GradientBoostingClassifier(random_state = 0)
GB.fit(xv_train, y_train)

In [43]:
pred_gb = GB.predict(xv_test)

In [44]:
GB.score(xv_test, y_test)

0.9942959001782531

In [45]:
print(classification_report(y_test, pred_gb))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      5853
           1       0.99      1.00      0.99      5367

    accuracy                           0.99     11220
   macro avg       0.99      0.99      0.99     11220
weighted avg       0.99      0.99      0.99     11220



In [None]:
# Import the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(random_state = 0)
RF.fit(xv_train, y_train)

In [None]:
pred_rf = RF.predict(xv_test)

In [None]:
RF.score(xv_test, y_test)

In [None]:
print(classification_report(y_test, pred_rf))

In [None]:
# Define a function for manual testing of news articles
import requests
from bs4 import BeautifulSoup

def manual_testing(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        article_text = ""
        for paragraph in soup.find_all('p'):
            article_text += paragraph.get_text()
        article_text = wordopt(article_text)
        new_xv_text = vectorization.transform([article_text])
        pred_LR = LR.predict(new_xv_text)
        pred_DT = DT.predict(new_xv_text)
        pred_GB = GB.predict(new_xv_text)
        pred_RF = RF.predict(new_xv_text)
        
        return print("\n\nLR Prediction: {} \nDT Prediction: {} \nGB Prediction: {} \nRF Prediction: {}".format(output_label(pred_LR[0]),
                                                                                                                output_label(pred_DT[0]),
                                                                                                                output_label(pred_GB[0]),
                                                                                                                output_label(pred_RF[0])))
    else:
        return "Failed to fetch the article. Check the URL."

In [None]:
# Input a URL for manual testing
url = input("Enter the URL of the news article: ")
manual_testing(url)

In [None]:
# Define a function for manual testing of news text
def manual_testing(news):
    testing_news = {"text": [news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt)
    new_x_test = new_def_test["text"]
    new_xv_text = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_text)
    pred_DT = DT.predict(new_xv_text)
    pred_GB = GB.predict(new_xv_text)
    pred_RF = RF.predict(new_xv_text)
    
    return print("\n\nLR Prediction: {} \nDT Prediction: {} \nGB Prediction: {} \nRF Prediction: {}" .format(output_label(pred_LR[0]),
                                                                                                            output_label(pred_DT[0]),
                                                                                                            output_label(pred_GB[0]),
                                                                                                            output_label(pred_RF[0])))

In [None]:
# Input news text for manual testing
news = str(input())
manual_testing(news)