In [79]:
#All imported librairies for the Project
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.ensemble import GradientBoostingClassifier



In [10]:
#Importation of Dataset and Read with pandas
fake= pd.read_csv("Fake.csv")
true= pd.read_csv("True.csv")

In [11]:
#Copy the dataset to not lose them in case of an error  
data_fake= fake.copy()
data_true= true.copy()

In [15]:
#Verify if there is a missing values on the fake dataset
fake.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [16]:
#Display the dimension of fake dataset
fake.shape

(23481, 4)

In [17]:
#Adding a new col to the fake dataset and assign the value of 1 
fake['Result']= 1

In [19]:
#Drop the columns that are not required on the fake dataset
fake.drop(["title","subject","date"],axis=1,inplace=True)

In [21]:
#Verify if there is a missing values on the true dataset
true.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [22]:
#Display the dimension of true dataset
true.shape

(21417, 4)

In [23]:
#Adding a new col to the true dataset and assign the value of 0 
true['Result']=0

In [25]:
#Drop the columns that are not required on the true dataset
true.drop(['title','subject','date'],axis=1,inplace=True)

In [27]:
#Merging the fake and true dataset
overall= pd.concat([fake,true],axis=0)

In [29]:
#Add the new index 
overall.reset_index(inplace=True)

In [31]:
#Drop the old index 
overall.drop(['index'],axis=1,inplace=True)

In [33]:
#Random Shuffling the dataframe 
overall= overall.sample(frac=1)

In [72]:
overall.isnull().sum()

text      0
Result    0
dtype: int64

In [35]:
#Creating a function to process the texts
def wordopt(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r"\W", " ", text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w\d\w*', '', text)
    return text

In [36]:
#Application of the function on the variable text
overall['text']=overall['text'].apply(wordopt)

In [37]:
#Defining the features and targets variables
target= overall['Result']
feature= overall['text']

In [38]:
#Count the values
target.value_counts()

1    23481
0    21417
Name: Result, dtype: int64

In [39]:
#Splitting Training and Testing
x_train ,x_test, y_train, y_test = train_test_split(feature, target,test_size=0.2,random_state= 42)

In [40]:
#Creation of the vector that convert text to vectors
vecteur= TfidfVectorizer()

In [41]:
#Convert text to vectors
xv_train= vecteur.fit_transform(x_train)
xv_test= vecteur.transform(x_test)

In [42]:
#Creation of Logistic Regression Model
model= LogisticRegression()

In [43]:
#Training the model
model.fit(xv_train,y_train)

In [44]:
#Prediction of the test
predL= model.predict(xv_test)

In [45]:
#Displaying the score of the test
model.score(xv_test,y_test)

0.9867483296213808

In [46]:
#Comparing the prediction to the original values
print(classification_report(y_test,predL))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4329
           1       0.99      0.98      0.99      4651

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [47]:
#Creation of a second model DecisionTreeClassifier
model_1= DecisionTreeClassifier() 

In [48]:
#Training the second model
model_1.fit(xv_train,y_train)

In [49]:
#Prediction of the second test
predL_1= model_1.predict(xv_test)

In [50]:
#Displaying the score of the second test
model_1.score(xv_test,y_test)

0.9952115812917595

In [51]:
#Comparing the prediction of the second test to their original values
print(classification_report(y_test,predL_1))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4329
           1       0.99      1.00      1.00      4651

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [60]:
#Creation of an thrid model GradientBoostingClassifier and Trainig it 
model_2= GradientBoostingClassifier(random_state=0)
model_2.fit(xv_train,y_train)

In [61]:
#Prediction of the third test
predL_2= model_2.predict(xv_test)

In [62]:
#Displaying the score of the third test
model_2.score(xv_test,y_test)

0.9955456570155902

In [64]:
#Comparing the prediction of the third test with the original values 
print(classification_report(y_test,predL_2))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4329
           1       1.00      0.99      1.00      4651

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [65]:
#Fontion for testing the models
def sortie(t):
    if t==1:
        return "This is a Fake News"
    elif t==0:
        return "This isn't a Fake News"

def manuel(textes):
    dico= {"text":[textes]}
    df= pd.DataFrame(dico)
    df["text"]= df["text"].apply(wordopt)
    xv= df["text"]
    xv1_test= vecteur.transform(xv)
    predL= model.predict(xv1_test)
    predL_1= model_1.predict(xv1_test)
    predL_2= model_2.predict(xv1_test)
    return print("Regression Logistique: {}\nDecisionTreeClassifier: {}\nGradientBoostingClassifier: {}".format(sortie(predL[0]),sortie(predL_1[0]),sortie(predL_2[0])))

In [71]:
#Execution of the model and result
k= str(input())
manuel(k)

ALGIERS (Reuters) - French President Emmanuel 
Regression Logistique: This isn't a Fake News
DecisionTreeClassifier: This isn't a Fake News
GradientBoostingClassifier: This isn't a Fake News
