In [54]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib 



In [55]:
# Load the two source files; each CSV holds the articles of one class.
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

# Attach the target: 1 marks rows from True.csv, 0 marks rows from Fake.csv.
true['label'] = 1
fake['label'] = 0

# Stack both classes row-wise, drop the title/subject/date columns, then shuffle.
# random_state pins the shuffle so Restart & Run All yields the same row order, and
# reset_index(drop=True) renumbers the rows without keeping the old index as a column.
news = (
    pd.concat([fake, true], axis=0)
    .drop(['title', 'subject', 'date'], axis=1)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [56]:
#Text-cleaning helper that normalizes raw article text before it is vectorized for the ML models

def wordopt(text):
    """Normalise a raw article string for vectorization.

    The text is lowercased and then passed through a fixed sequence of regex
    substitutions, applied in this order:

    1. remove URLs (``http://``/``https://`` links and ``www.`` hosts),
    2. remove HTML-like tags (``<...>``),
    3. remove every character that is neither a word character nor whitespace,
    4. remove digits,
    5. replace each newline with a single space.

    Returns the cleaned string.
    """
    cleaned = text.lower()
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),  # URLs
        (r'<.*?>', ''),                  # HTML tags
        (r'[^\w\s]', ''),                # punctuation
        (r'\d', ''),                     # digits
        (r'\n', ' '),                    # newlines -> space
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean every article body with wordopt, then pull out the feature and target columns.
news['text'] = news['text'].apply(wordopt)

x, y = news['text'], news['label']

In [57]:
# Split the data into training and test sets, then convert the text to TF-IDF features.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 30% of the articles for evaluation. random_state pins the split so the
# metrics reported below are reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# The vectorizer is fitted on the training split only and then reused on the test
# split, so the held-out articles do not influence the learned vocabulary/weights.
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [58]:
# ML model: logistic regression trained on the TF-IDF features.
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(xv_train, y_train)
pred_lr = LR.predict(xv_test)

# No separate LR.score(...) call: a bare expression in the middle of a cell is never
# displayed, it costs a second prediction pass, and the report below already
# includes the overall accuracy.
print(classification_report(y_test, pred_lr))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7109
           1       0.98      0.99      0.99      6361

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [59]:
# ML model: random forest trained on the TF-IDF features.
from sklearn.ensemble import RandomForestClassifier

# Pin random_state so the fitted forest and its metrics are reproducible across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(xv_train, y_train)
predict_rfc = rfc.predict(xv_test)

# No separate rfc.score(...) call: a bare expression in the middle of a cell is never
# displayed, it costs a second prediction pass, and the report below already
# includes the overall accuracy.
print(classification_report(y_test, predict_rfc))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      7109
           1       0.98      0.99      0.98      6361

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [60]:
# ML model: gradient boosting trained on the TF-IDF features.

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Pin random_state so repeated runs fit the same model and report the same metrics.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(xv_train, y_train)
pred_gbc = gbc.predict(xv_test)


In [61]:
# Inspect the labels the gradient-boosting model predicted for the test split.
pred_gbc

array([0, 1, 1, ..., 0, 1, 1], dtype=int64)

In [62]:
# Fraction of test articles the gradient-boosting model labelled correctly.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=pred_gbc)

0.9945805493689681

In [63]:
# Confusion matrix for the gradient-boosting predictions on the test split.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=pred_gbc)

array([[7057,   52],
       [  21, 6340]], dtype=int64)

In [64]:
# Per-class precision, recall and F1 for the gradient-boosting model.
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=pred_gbc))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      7109
           1       0.99      1.00      0.99      6361

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [65]:
def output_label(n):
    """Translate a numeric class label into its user-facing verdict.

    Returns the fake-news message when ``n == 0``, the genuine-news message
    when ``n == 1``, and ``None`` for any other value.
    """
    verdicts = (
        (0, "It is a Fake News"),
        (1, "It is a Genuine News"),
    )
    for code, verdict in verdicts:
        if n == code:
            return verdict
    return None

In [66]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import joblib 
def manual_testing(news: str):
    """Classify a single article with all three trained models.

    The text is cleaned with ``wordopt``, vectorized with the already-fitted
    ``vectorization`` object, and passed to the ``LR``, ``gbc`` and ``rfc``
    models defined earlier in the notebook.

    Args:
        news: Raw article text to classify.

    Returns:
        A multi-line string holding one verdict per model, as produced by
        ``output_label``.
    """
    # transform() takes an iterable of documents, so wrap the single article in a list.
    features = vectorization.transform([wordopt(news)])

    # Use the predictions made for *this* article. Indexing the notebook-level
    # ``pred_lr`` (the test-set predictions) here would report the same LR verdict
    # for every input, regardless of its content.
    lr_label = LR.predict(features)[0]
    gbc_label = gbc.predict(features)[0]
    rfc_label = rfc.predict(features)[0]

    # The output text (including the "Predtion" spelling) is kept unchanged so the
    # printed result format stays the same.
    result = "\n\nLR Prediction: {} \nGBC Prediction: {} \nRFC Predtion: {}".format(
        output_label(lr_label), output_label(gbc_label), output_label(rfc_label)
    )
    return result


In [67]:
# Read the article to classify. input() already returns a str, so no cast is needed;
# the prompt tells the reader what the otherwise blank input box is waiting for.
news_article = input("Paste the news article text to classify: ")

In [68]:
# Run the entered article through all three models and print one verdict per model.
result=manual_testing(news_article)
print(result)



LR Prediction: It is a Fake News 
GBC Prediction: It is a Fake News 
RFC Predtion: It is a Fake News


In [69]:
import pickle

# Persist the fitted gradient-boosting model. The context manager closes the file
# handle (flushing the data to disk) even if pickling raises.
with open('gbc_model.pkl', 'wb') as model_file:
    pickle.dump(gbc, model_file)

In [70]:
import pickle

# Persist the fitted logistic-regression model. The context manager closes the file
# handle (flushing the data to disk) even if pickling raises.
with open('LR_model.pkl', 'wb') as model_file:
    pickle.dump(LR, model_file)

In [71]:
import pickle

# Persist the fitted random-forest model. The context manager closes the file
# handle (flushing the data to disk) even if pickling raises.
with open('rfc_model.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)

In [72]:
import pickle

# Persist the fitted TF-IDF vectorizer alongside the models: new text must be
# transformed with this same object before any saved model can score it.
# The context manager closes the file handle even if pickling raises.
with open('vectorization_model.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorization, vectorizer_file)

In [73]:
# Display the gradient-boosting estimator as the cell output.
gbc

In [74]:
# Display the logistic-regression estimator as the cell output.
LR

In [75]:
# Display the random-forest estimator as the cell output.
rfc