# Fake News Detector

In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

## Importing and Analysing Dataset

In [48]:
# Read the true and fake article sets from their CSV files
df_true, df_fake = (pd.read_csv(csv_name) for csv_name in ("True.csv", "Fake.csv"))

In [49]:
# Column names of the frame loaded from True.csv
df_true.columns

Index(['title', 'text', 'subject', 'date'], dtype='object')

In [50]:
# Column names of the frame loaded from Fake.csv
df_fake.columns

Index(['title', 'text', 'subject', 'date'], dtype='object')

In [51]:
# Preview the first rows of the true-news frame
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [52]:
# Preview the first rows of the fake-news frame
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


## Steps for building a Fake News Detector

- Preprocessing: Clean and preprocess the text data to remove irrelevant information, such as HTML tags, punctuation, and stopwords. Then convert the text into a numerical representation that machine learning algorithms can process, using techniques such as TF-IDF (Term Frequency-Inverse Document Frequency) or word embeddings such as Word2Vec or GloVe.

- Feature Extraction: Extract relevant features from the preprocessed text data. These features can include word frequencies, n-grams, or other linguistic features that may help distinguish between real and fake news.

- Model Training: Choose a machine learning algorithm or model and train it on the labeled dataset. Split the dataset into training and testing sets to evaluate the model's performance.

- Model Evaluation: Evaluate the trained model using appropriate evaluation metrics like accuracy, precision, recall, and F1 score. Adjust the model parameters or try different algorithms if the results are not satisfactory.

## Preprocessing

In [53]:
# Add a "class" label column: 0 marks true articles, 1 marks fake articles
df_true['class'], df_fake['class'] = 0, 1

In [54]:
# (rows, columns) of the fake and true frames after adding the class column
df_fake.shape, df_true.shape

((23481, 5), (21417, 5))

In [55]:
# Hold out the last rows of each frame for manual testing.
# The held-out rows are taken by position (tail) and removed by their own index
# labels, so this cell does not depend on hard-coded row numbers that are only
# valid for one particular dataset size.
N_MANUAL_TEST_ROWS = 10


def _hold_out_tail(frame, n_rows):
    """Return (remaining rows, last n_rows rows) of frame without mutating it."""
    held_out = frame.tail(n_rows)
    return frame.drop(index=held_out.index), held_out


df_fake, df_fake_testing = _hold_out_tail(df_fake, N_MANUAL_TEST_ROWS)
df_true, df_true_testing = _hold_out_tail(df_true, N_MANUAL_TEST_ROWS)

In [56]:
# Save the held-out fake and true rows together for later manual checks
df_testing = pd.concat(objs=[df_fake_testing, df_true_testing], axis="index")
df_testing.to_csv("testing.csv")

In [57]:
# Stack the fake rows on top of the true rows into a single labelled frame
df_merge = pd.concat(objs=[df_fake, df_true], axis="index")
df_merge.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [58]:
# Remove the unnecessary columns, keeping only the text and its class label
df = df_merge.drop(columns=["title", "subject", "date"])

In [59]:
# (rows, columns) remaining after dropping title, subject and date
df.shape

(44878, 2)

In [60]:
# Count missing values in each column
df.isna().sum()

text     0
class    0
dtype: int64

In [61]:
# Shuffle the rows so fake and true articles are interleaved before splitting.
# A fixed random_state keeps the shuffle, and every result that depends on it,
# reproducible across kernel restarts.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
df.head()

Unnamed: 0,text,class
11033,Tim Poole is citizen journalist who s done som...,1
2928,Donald Trump literally just posted the ultimat...,1
1194,WASHINGTON (Reuters) - An Oct. 24 hearing in t...,0
12744,HANOI (Reuters) - Vietnamese police on Friday ...,0
10802,"RANCHO MIRAGE, Calif. (Reuters) - U.S. Preside...",0


In [62]:
# Renumber the rows from 0; the shuffled labels move into an "index" column
df = df.reset_index()

In [63]:
# Discard the old row labels stored in the "index" column
df = df.drop(columns=['index'])

In [64]:
# Show the shuffled, re-indexed frame
df

Unnamed: 0,text,class
0,Tim Poole is citizen journalist who s done som...,1
1,Donald Trump literally just posted the ultimat...,1
2,WASHINGTON (Reuters) - An Oct. 24 hearing in t...,0
3,HANOI (Reuters) - Vietnamese police on Friday ...,0
4,"RANCHO MIRAGE, Calif. (Reuters) - U.S. Preside...",0
...,...,...
44873,PLEASE NOTE THE OVERWHELMING INFORMATION REGAR...,1
44874,BERLIN (Reuters) - A mood of pre-election apat...,0
44875,Firebrand conservative Ann Coulter exposed Del...,1
44876,(This Oct. 18 story corrects age in paragraph...,0


### Processing the Text

In [65]:
def wordprocessing(text):
    """Normalize raw article text into lower-case, punctuation-free words.

    Steps, in order:
      1. lower-case the text;
      2. drop square-bracketed spans;
      3. drop URLs (http/https links and www. addresses);
      4. drop HTML-style tags;
      5. replace every remaining non-word character with a space;
      6. drop leftover punctuation (only the underscore survives step 5);
      7. drop any token that contains a digit.

    URLs and tags are removed BEFORE non-word characters are blanked out.
    Doing it the other way round destroys the "://", "." and "<>" characters
    those patterns rely on, so they would never match and fragments such as
    "https" or tag names would leak into the output.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Removed characters leave spaces behind, so runs of
        several spaces are expected.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so no separate newline step is needed.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [66]:
# Snapshot the frame before cleaning so the original text stays available
df_copy = df.copy()

In [67]:
# Replace each article body with its cleaned form
df["text"] = df["text"].map(wordprocessing)

In [68]:
# Inspect the cleaned text column
df['text']

0        tim poole is citizen journalist who s done som...
1        donald trump literally just posted the ultimat...
2        washington  reuters    an oct   hearing in the...
3        hanoi  reuters    vietnamese police on friday ...
4        rancho mirage  calif   reuters    u s  preside...
                               ...                        
44873    please note the overwhelming information regar...
44874    berlin  reuters    a mood of pre election apat...
44875    firebrand conservative ann coulter exposed del...
44876      this oct   story corrects age in paragraph  ...
44877     a bunch of thugs  a bunch of creeps  criminal...
Name: text, Length: 44878, dtype: object

In [69]:
# NOTE(review): df_r is not assigned in any cell visible above; on a fresh
# kernel this cell would presumably raise NameError — confirm where it is defined.
df_r

0        washington  reuters    president donald trump ...
1        washington  reuters    president barack obama ...
2        remember the obamaphone lady  federal regulato...
3        tune in to the alternate current radio network...
4        moscow  reuters    former militants from  band...
                               ...                        
44873    new delhi  reuters    indian prime minister na...
44874     century wire says after a long drawn out roun...
44875    only one journalist has made it his mission to...
44876     clinton s radicalized rhetoric has championed...
44877    huh  this could be one of the most damning ema...
Name: text, Length: 44878, dtype: object

In [70]:
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

# Download the necessary NLTK resources, one package at a time
for nltk_resource in ('omw-1.4', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)


# Built once, not per call: reloading the stopword list and constructing a new
# stemmer for every row is wasted work when the function is applied to a whole
# DataFrame column.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Lower-case, tokenize, remove English stopwords and stem ``text``.

    Parameters
    ----------
    text : str
        Text to preprocess.

    Returns
    -------
    str
        The stemmed tokens that are not English stopwords, joined by single
        spaces. Punctuation tokens produced by the tokenizer are kept.
    """
    tokens = word_tokenize(text.lower())
    stemmed = [_STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS]
    return ' '.join(stemmed)

# Example usage
text = "This is an example sentence for preprocessing."
preprocessed_text = preprocess_text(text)
print(preprocessed_text)


exampl sentenc preprocess .


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [71]:
# Preview df_copy, which still holds the uncleaned text
df_copy.head()

Unnamed: 0,text,class
0,Tim Poole is citizen journalist who s done som...,1
1,Donald Trump literally just posted the ultimat...,1
2,WASHINGTON (Reuters) - An Oct. 24 hearing in t...,0
3,HANOI (Reuters) - Vietnamese police on Friday ...,0
4,"RANCHO MIRAGE, Calif. (Reuters) - U.S. Preside...",0


In [72]:
# Tokenize and stem the uncleaned text kept in df_copy
df_copy['text2'] = df_copy['text'].apply(preprocess_text)

In [73]:
# Preview df_copy with the new text2 column
df_copy.head()

Unnamed: 0,text,class,text2
0,Tim Poole is citizen journalist who s done som...,1,tim pool citizen journalist done realli import...
1,Donald Trump literally just posted the ultimat...,1,"donald trump liter post ultim say , tweet got ..."
2,WASHINGTON (Reuters) - An Oct. 24 hearing in t...,0,washington ( reuter ) - oct. 24 hear u.s. cong...
3,HANOI (Reuters) - Vietnamese police on Friday ...,0,hanoi ( reuter ) - vietnames polic friday arre...
4,"RANCHO MIRAGE, Calif. (Reuters) - U.S. Preside...",0,"rancho mirag , calif. ( reuter ) - u.s. presid..."


In [74]:
# Tokenize and stem the cleaned text in df
df['text2'] = df['text'].apply(preprocess_text)

In [75]:
# Preview df with the cleaned text and its stemmed text2 column
df.head()

Unnamed: 0,text,class,text2
0,tim poole is citizen journalist who s done som...,1,tim pool citizen journalist done realli import...
1,donald trump literally just posted the ultimat...,1,donald trump liter post ultim say tweet got ri...
2,washington reuters an oct hearing in the...,0,washington reuter oct hear u congress puerto r...
3,hanoi reuters vietnamese police on friday ...,0,hanoi reuter vietnames polic friday arrest for...
4,rancho mirage calif reuters u s preside...,0,rancho mirag calif reuter u presid barack obam...


## Machine Learning