# Fake News Detection Using Machine Learning: A Comparative Study of Classification Models


In [7]:
import pandas as pd

In [8]:
truedf = pd.read_csv("True.csv")

In [9]:
fakedf = pd.read_csv("Fake.csv")

In [10]:
truedf["label"] = 1
fakedf["label"] = 0

In [11]:
newsdf = pd.concat([truedf , fakedf], ignore_index=True)

In [12]:
newsdf = newsdf.sample(frac=1).reset_index(drop=True)

In [13]:
print(newsdf.head())

                                               title  \
0  Spain to suspend Catalonia's autonomy in respo...   
1  Thousands protest in Barcelona against Catalan...   
2   Fox News Yuks It Up, Taunts The Donald Over T...   
3  AMERICANS EXPECTED OBAMA TO CALL FOR CALM…Inst...   
4   National Republican Just Blamed Women For Ass...   

                                                text    subject  \
0  MADRID/BARCELONA (Reuters) - Spain s central g...  worldnews   
1  BARCELONA (Reuters) - Hundreds of thousands of...  worldnews   
2  Donald Trump has been gallivanting across Amer...       News   
3  Isn t it the job of the CURRENT president to c...  left-news   
4  Republican presidential candidate and current ...       News   

                date  label  
0  October 18, 2017       1  
1   October 8, 2017       1  
2   February 2, 2016      0  
3       Nov 14, 2016      0  
4     April 15, 2016      0  


In [14]:
print(newsdf['label'].value_counts())

label
0    23481
1    21417
Name: count, dtype: int64


In [15]:
import string

In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [17]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saxen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saxen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
def preprocessing(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)


newsdf['clean_text'] = newsdf['text'].apply(preprocessing)


print(newsdf[['text', 'clean_text']].head())
    

                                                text  \
0  MADRID/BARCELONA (Reuters) - Spain s central g...   
1  BARCELONA (Reuters) - Hundreds of thousands of...   
2  Donald Trump has been gallivanting across Amer...   
3  Isn t it the job of the CURRENT president to c...   
4  Republican presidential candidate and current ...   

                                          clean_text  
0  madridbarcelona reuters spain central governme...  
1  barcelona reuters hundreds thousands people to...  
2  donald trump gallivanting across america month...  
3  job current president call calm supporters rio...  
4  republican presidential candidate current ohio...  


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(newsdf['clean_text'])


y = newsdf['label']


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [23]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [25]:
X_train ,X_test, y_train,y_test = train_test_split(X,y , random_state= 42, test_size=.02)

In [27]:

model = LogisticRegression()
model.fit(X_train, y_train)

In [28]:
y_pred = model.predict(X_test)

In [29]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9866369710467706

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       480
           1       0.98      0.99      0.99       418

    accuracy                           0.99       898
   macro avg       0.99      0.99      0.99       898
weighted avg       0.99      0.99      0.99       898


Confusion Matrix:
 [[471   9]
 [  3 415]]


In [33]:
def predict_news(text):
    clean = preprocessing(text)
    vec = vectorizer.transform([clean])
    pred = model.predict(vec)
    return "FAKE" if pred[0] == 0 else "REAL"


In [35]:
sample_news = "Breaking: Government confirms the new vaccine rollout will start next week."
print("Prediction:", predict_news(sample_news))


Prediction: FAKE


RANDOMFOREST AND DECISION TREE ACCURACY


In [38]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [40]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
ypreds = model.predict(X_test)
acc = accuracy_score(y_test, ypreds)

In [41]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
yypreds = model.predict(X_test)
acc = accuracy_score(y_test, yypreds)

In [44]:
print("Decision Tree Accuracy:", acc)
print("Random Forest Accuracy:", acc)

Decision Tree Accuracy: 0.9988864142538976
Random Forest Accuracy: 0.9988864142538976


SVM MODEL PREDICTION

In [46]:
from sklearn.svm import LinearSVC

In [55]:
model = LinearSVC()
model.fit(X_train,y_train)
yyypred = model.predict(X_test)
acc = accuracy_score(y_test, yyypred)

In [57]:
print("svm Accuracy:", acc)

svm Accuracy: 0.9933184855233853


#NAIVE Bayes

In [59]:
from sklearn.naive_bayes import MultinomialNB

In [61]:
model = MultinomialNB()
model.fit(X_train,y_train)
yyyypred = model.predict(X_test)
acc = accuracy_score(y_test, yyyypred)

In [63]:
print("Naive Bayes Accuracy::", acc)

Naive Bayes Accuracy:: 0.933184855233853


| Model              | Accuracy (%)         |
|--------------------|----------------------|
| Logistic Regression| 0.9866369710467706   |
| Decision Tree      | 0.9988864142538976   |
| Random Forest      | 0.9988864142538976   |   
| SVM                | 0.9933184855233853   |
| Naive Bayes        | 0.933184855233853    |
