### Importing the required libraries

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### Importing the datasets

In [2]:
df_true=pd.read_csv('./data/True.csv',encoding='ISO-8859-1')
df_fake=pd.read_csv('./data/Fake.csv',encoding='ISO-8859-1')

### Showing the top 5 rows of the datasets

In [3]:
#Top 5 rows of the true news dataset
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
#Top 5 rows of the fake news dataset
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Yearâ...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obamaâs Na...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


### Classifying the true news as 1 and fake news as 0

In [6]:
df_fake['class'] = 0
df_true['class'] = 1

In [7]:
#new true news dataset
df_true.head()

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [8]:
#new fake news dataset
df_fake.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Yearâ...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obamaâs Na...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


### Finding out the size of both the datasets

In [9]:
#Finding out the no of rows and columns of the true news dataset
df_true.shape

(21417, 5)

#### The true news dataset has actually 5 diffrent columns referred as title of the news,text,subject date of publishing and class(objectified as 0 ) and 21417 rows referred as diffrent news observations.

In [10]:
df_fake.shape

(23481, 5)

#### The fake news dataset has actually 5 diffrent columns referred as title of the news,text,subject date of publishing and class(objectified as 1) and 23481 rows referred as diffrent news observations.

### Merging true and fake dataframes

In [11]:
df=pd.concat([df_true,df_fake])
df.head()

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


#### We have merged the two datasets in a single dataset  

### Dropping the columns which are not required

In [12]:
df=df.drop(['title','subject','date'],axis=1)

#### We have drooped the 'Title','Subject' and 'Date' column from the dataset as they are iirelevant while classfying any news article.

### Finding out if there are any null values in the dataset

In [13]:
df.isnull().sum()

text     0
class    0
dtype: int64

#### The dataset does not contain any null values 

### Random Shuffling of the data

In [14]:
df=df.sample(frac=1)
df.head()

Unnamed: 0,text,class
5259,WASHINGTON (Reuters) - The White House on Sund...,1
10248,That s what we re talking about! Another campa...,0
13475,A protester at the RNC in Cleveland attempted ...,0
5289,WASHINGTON (Reuters) - The top Republican at t...,1
8192,"As of this writing, there are 4 heavily armed ...",0


In [15]:
#Indexing the rows of the dataset
df.reset_index(inplace = True)
df.head()

Unnamed: 0,index,text,class
0,5259,WASHINGTON (Reuters) - The White House on Sund...,1
1,10248,That s what we re talking about! Another campa...,0
2,13475,A protester at the RNC in Cleveland attempted ...,0
3,5289,WASHINGTON (Reuters) - The top Republican at t...,1
4,8192,"As of this writing, there are 4 heavily armed ...",0


In [16]:
#Renaming the columns
df = df.rename(columns={"index": "Index","text":"Text","class":"Class"})
df.head()



Unnamed: 0,Index,Text,Class
0,5259,WASHINGTON (Reuters) - The White House on Sund...,1
1,10248,That s what we re talking about! Another campa...,0
2,13475,A protester at the RNC in Cleveland attempted ...,0
3,5289,WASHINGTON (Reuters) - The top Republican at t...,1
4,8192,"As of this writing, there are 4 heavily armed ...",0


### Cleaning the text
* We will  convert the text to lowercase.
* We will remove the non word characters.
* Remove URL and HTML tags.
* We will remove all punctuation marks.
* Will removenewline characters are removed.
*  We will remove all words that contain numbers .

In [17]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]','',text)
    text = re.sub("\\W"," ",text)
    text = re.sub('https://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '',text)
    text = re.sub('[%s]' % re.escape(string.punctuation),'',text)
    text = re.sub('\n','',text)
    text = re.sub('\w*\d\w','',text)
    return text

In [18]:
#Applying the function the the text
df['Text'] = df['Text'].apply(wordopt)
df.head()

Unnamed: 0,Index,Text,Class
0,5259,washington reuters the white house on sund...,1
1,10248,that s what we re talking about another campa...,0
2,13475,a protester at the rnc in cleveland attempted ...,0
3,5289,washington reuters the top republican at t...,1
4,8192,as of this writing there are 4 heavily armed ...,0


### Defining the dependent and independent variable

In [20]:
x = df['Text']
y = df['Class']

### Randomly spliting the dataset into training and testing datset

In [21]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)

### Converting text to vector

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

### Model Building

#### Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(xv_train,y_train)

In [24]:
pred_lr = LR.predict(xv_test)
LR.score(xv_test, y_test)

0.9898440979955456

In [25]:
#finding out the R square value
score_LR=LR.score(xv_test,y_test)
print(score_LR)

0.9898440979955456


Here we can see that the coefficient of determination or the R square value of the model is 0.99 which is close to 1.So we can say that the model almost explains 99% variation in the data.

In [26]:
#report of classification
print(classification_report(y_test,pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5882
           1       0.99      0.99      0.99      5343

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



### Decision tree classifier

In [27]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(xv_train,y_train)

In [28]:
predicted_DT=DT.predict(xv_test)

In [29]:
#Finding out R square value
score_DT=DT.score(xv_test,y_test)
print(score_DT)

0.9974164810690423


Here also we can find the R square value using decision tree is 0.99 which is close to 1.This means the model explains 99% variation in the data.

In [30]:
#printing the classification report
print(classification_report(predicted_DT,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5891
           1       1.00      1.00      1.00      5334

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225



#### Gradient Boosting Classifier

In [31]:
from sklearn.ensemble import GradientBoostingClassifier
GBC=GradientBoostingClassifier()
GBC.fit(xv_train,y_train)

In [32]:
predicted_GBC=GBC.predict(xv_test)

In [33]:
# Finding out the R square value of the model
score=GBC.score(xv_test,y_test)
print(score)

0.9957238307349666


Here also we can find the R square value using decision tree is 0.99 which is close to 1.This means the model explains 99% variation in the data.

In [34]:
#Printing the classification report
print(classification_report(predicted_GBC,y_test))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5848
           1       1.00      0.99      1.00      5377

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225



### Random forest classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier
RFC=RandomForestClassifier()
RFC.fit(xv_train,y_train)

In [36]:
predicted_RFC=RFC.predict(xv_test)

In [37]:
# Finding out the R square value of the model
score=RFC.score(xv_test,y_test)
print(score)

0.9942984409799555


Here also we can find the R square value using decision tree is 0.99 which is close to 1.This means the model explains 99% variation in the data.

In [38]:
#Printing the classification report
print(classification_report(predicted_RFC,y_test))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5856
           1       1.00      0.99      0.99      5369

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



#### Comparing diffrent models we can say that the performance of the Gradient Boosting Classifier classifies better that the rest of the models as it gives 100 % accuracy on the test data.

### Testing the model

In [39]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not a Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt)
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GB = GBC.predict(new_xv_test)
    pred_RF = RFC.predict(new_xv_test)

    return print("\n\nLR Prediction: {} \nDT Prediction : {} \nGB Prediction : {} \nRF Prediction : {}".format(output_lable(pred_LR[0]),
                                                                                                               output_lable(pred_DT[0]),
                                                                                                               output_lable(pred_GB[0]),
                                                                                                               output_lable(pred_RF[0])))

Input News Article: JAKARTA (Reuters) - Indonesia will buy 11 Sukhoi fighter jets worth $1.14 billion from Russia in exchange for cash and Indonesian commodities, two cabinet ministers said on Tuesday. The Southeast Asian country has pledged to ship up to $570 million worth of commodities in addition to cash to pay for the Suhkoi SU-35 fighter jets, which are expected to be delivered in stages starting in two years

In [40]:
news = input(str())
manual_testing(news)



LR Prediction: Not a Fake News 
DT Prediction : Not a Fake News 
GB Prediction : Not a Fake News 
RF Prediction : Not a Fake News


 Input News Article: 21st Century Wire says As 21WIRE predicted in its new year s look ahead, we have a new  hostage  crisis underway.Today, Iranian military forces report that two small riverine U.S. Navy boats were seized in Iranian waters, and are currently being held on Iran s Farsi Island in the Persian Gulf. A total of 10 U.S. Navy personnel, nine men and one woman, have been detained by Iranian authorities.

In [41]:
news = input(str())
manual_testing(news)



LR Prediction: Fake News 
DT Prediction : Fake News 
GB Prediction : Fake News 
RF Prediction : Not a Fake News
