Fake News Detections

In [2]:
#Import necessary libraries

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


In [9]:
#Load the dataset

df = pd.read_csv('fake_news.csv')
print(df.head()['text'])
print(df.shape)


0    Donald Trump just couldn t wish all Americans ...
1    House Intelligence Committee Chairman Devin Nu...
2    On Friday, it was revealed that former Milwauk...
3    On Christmas day, Donald Trump announced that ...
4    Pope Francis used his annual Christmas Day mes...
Name: text, dtype: object
(44898, 6)


In [14]:
#Preprocessing the dataset

# Remove duplicates
df.drop_duplicates(inplace=True)

# Drop any rows with missing values
df.dropna(inplace=True)

# Remove any unnecessary columns
df = df[['text', 'isFake']]

# Clean the text
def clean_text(text):
    # Remove punctuations and special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    text = [word for word in text.split() if word not in stop_words]
    
    text = ' '.join(text)
    
    return text

df['text'] = df['text'].apply(clean_text)

#print(df['text'])


In [33]:
#Split the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['isFake'], test_size=0.2, random_state=42)

In [30]:
#Create a TF-IDF vectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

#print(tfidf_vectorizer.get_feature_names())
#print(tfidf_train)




In [31]:
#Train the model

# Create a PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter=50)

# Fit the model
pac.fit(tfidf_train, y_train)

# Predict on the test set and calculate accuracy
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print('Accuracy:', score)


Accuracy: 0.9924899650394924


In [35]:
#Evaluate the model

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print confusion matrix
print(cm)        #  [ [TP, FP],
                 #    [FN, TN]  ]


[[4198   23]
 [  35 3467]]
