
#Fake News Classification
### Dataset : [Fake and Real news Dataset](https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset)

```
 Columns and  Description
 -------------------------
 Title : Title of the Article
 Text : Content of the Article
 Subject : Type of content ( here News )
 Date : Date of publishing of the Article ( Month Date,Year )
```



In [159]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
import string
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.linear_model import PassiveAggressiveClassifier


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [160]:
true_df=pd.read_csv('/content/True.csv')
flase_df=pd.read_csv('/content/Fake.csv')
true_df.head(2)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"


In [161]:
true_df.columns

Index(['title', 'text', 'subject', 'date'], dtype='object')

In [162]:
flase_df.columns

Index(['title', 'text', 'subject', 'date'], dtype='object')

In [163]:
true_df['label'] = 1
flase_df['label']= 0

In [164]:
combined_df=pd.concat([true_df,flase_df],ignore_index=True)
combined_df.head(2)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1


In [165]:
combined_df.tail(2)

Unnamed: 0,title,text,subject,date,label
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0
44897,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016",0


In [166]:
#shuffle the dataset
combined_df = combined_df.sample(frac=1).reset_index(drop=True)
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,Bangladesh seeks support to move fleeing Rohin...,DHAKA (Reuters) - Bangladesh is seeking intern...,worldnews,"September 11, 2017",1
1,Argentina set to pass pension reform despite m...,BUENOS AIRES (Reuters) - Argentina s Congress ...,worldnews,"December 18, 2017",1
2,"Clinton calls Trump a 'loose cannon,' risky ch...",WASHINGTON (Reuters) - U.S. Democratic preside...,politicsNews,"May 4, 2016",1
3,"Julian Assange – “Everything that he has said,...",21st Century Wire says Judging by Julian Assan...,US_News,"January 17, 2017",0
4,U.S. Republican Rand Paul suspends 2016 White ...,WASHINGTON (Reuters) - U.S. Republican Rand Pa...,politicsNews,"February 3, 2016",1


In [167]:
combined_df.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [168]:
combined_df.title[0]

'Bangladesh seeks support to move fleeing Rohingya to remote, flood-prone island'

In [169]:
combined_df.text[1]

'BUENOS AIRES (Reuters) - Argentina s Congress was on track to pass a pension reform measure on Monday, even as stone-throwing protesters rushed police outside the capitol building and the country s main union called a 24-hour general strike in opposition to the proposal. President Mauricio Macri, elected in 2015 with a mandate to lift the heavy-handed currency and trade controls favored by his  predecessors, says Argentina needs pension reform to cut its  deficit, attract investment and promote sustainable growth. Debate on the bill was suspended on Thursday due to violent demonstrations. Macri then promised to decree a bonus payment to the neediest retirees. But that did nothing to satisfy the opposition and union activists who marched on Congress again on Monday as lawmakers debated the proposal inside. Balaclava-wearing protesters used sling shots to fire rocks at police, who answered with water canon and tear gas, turning the vast lawn in front of the capitol into a battleground. 

In [170]:
import string

def replace_url(s):
    return re.sub(r'http\S+', ' URL ', s) #to remove links

def replace_mentions(s):
    return re.sub(r'@([A-Za-z0-9_]+)', ' MENTION ') # to remove mentions

def replace_nums(s):
    return re.sub(r'\d+', ' NUM ', s) #replace numbers with NUM

def remove_punct(s):
    return ''.join(x for x in s if x not in string.punctuation) # String without punctuations

def whitespace_regularization(s):
    # removng consequtive spaces in the string,
    return re.sub(r'\s+', ' ', s)

def clean_text(s):
    temp = replace_url(s.lower())
    temp = replace_nums(temp)
    temp = whitespace_regularization(temp)
    temp = remove_punct(temp)
    return temp
combined_df["cleaned_text"] = combined_df.text.apply(clean_text)


In [171]:
def clean_title(s):
  title=s.lower()
  title=replace_nums(title)
  title=remove_punct(title)
  return title
combined_df['cleaned_title']=combined_df.title.apply(clean_title)
combined_df['cleaned_title'][0]



'bangladesh seeks support to move fleeing rohingya to remote floodprone island'

In [172]:
combined_df['cleaned_text'][1]

'buenos aires reuters  argentina s congress was on track to pass a pension reform measure on monday even as stonethrowing protesters rushed police outside the capitol building and the country s main union called a NUM hour general strike in opposition to the proposal president mauricio macri elected in NUM with a mandate to lift the heavyhanded currency and trade controls favored by his predecessors says argentina needs pension reform to cut its deficit attract investment and promote sustainable growth debate on the bill was suspended on thursday due to violent demonstrations macri then promised to decree a bonus payment to the neediest retirees but that did nothing to satisfy the opposition and union activists who marched on congress again on monday as lawmakers debated the proposal inside balaclavawearing protesters used sling shots to fire rocks at police who answered with water canon and tear gas turning the vast lawn in front of the capitol into a battleground this bill will put m

In [173]:
combined_df['Article']= combined_df['cleaned_title']+combined_df['cleaned_text']
combined_df = combined_df.drop(columns=['title','text','cleaned_title','cleaned_text'],axis=1)
combined_df

Unnamed: 0,subject,date,label,Article
0,worldnews,"September 11, 2017",1,bangladesh seeks support to move fleeing rohin...
1,worldnews,"December 18, 2017",1,argentina set to pass pension reform despite m...
2,politicsNews,"May 4, 2016",1,clinton calls trump a loose cannon risky choic...
3,US_News,"January 17, 2017",0,julian assange – “everything that he has said ...
4,politicsNews,"February 3, 2016",1,us republican rand paul suspends NUM white h...
...,...,...,...,...
44893,News,"January 12, 2016",0,ellen degeneres tells hillary clinton to her ...
44894,worldnews,"December 21, 2017",1,british police say lifting road closures near ...
44895,politics,"Jan 25, 2017",0,fake news alert cnn finally releases actual pi...
44896,worldnews,"November 21, 2017",1,uk pm may says believes northern irish accord ...


## Performing Lemmatization on news contents and titles


 lemmatization generally produces root form of words.It considers the context and meaning of words. Lemmatization also takes into account the part of speech and tries to find the base form accordingly.



In [174]:
def lemmatizing(Article):
  lemmatized=re.sub('[^a-zA-Z]',' ',Article)
  lemmatized=lemmatized.lower()

    # Tokenize the lemmatized article
  tokens = nltk.word_tokenize(lemmatized)

  # Initialize the WordNetLemmatizer
  lemmatizer = WordNetLemmatizer()

  # Lemmatize the tokens
  lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

  # Join the lemmatized tokens back into a string
  lemmatized_article = ' '.join(lemmatized_tokens)

  return lemmatized_article

In [175]:
combined_df['Article']=combined_df['Article'].apply(lemmatizing)
combined_df

Unnamed: 0,subject,date,label,Article
0,worldnews,"September 11, 2017",1,bangladesh seek support to move fleeing rohing...
1,worldnews,"December 18, 2017",1,argentina set to pas pension reform despite me...
2,politicsNews,"May 4, 2016",1,clinton call trump a loose cannon risky choice...
3,US_News,"January 17, 2017",0,julian assange everything that he ha said he s...
4,politicsNews,"February 3, 2016",1,u republican rand paul suspends num white hous...
...,...,...,...,...
44893,News,"January 12, 2016",0,ellen degeneres tell hillary clinton to her fa...
44894,worldnews,"December 21, 2017",1,british police say lifting road closure near b...
44895,politics,"Jan 25, 2017",0,fake news alert cnn finally release actual pic...
44896,worldnews,"November 21, 2017",1,uk pm may say belief northern irish accord pos...


### **As we can notice words have been transformed to ther root words :<br>**


```
Before :turkey charges three bulgarians with migrant smu..
After :turkey charge three bulgarian with migrant smu...
```




In [176]:
X=combined_df['Article']
Y=combined_df['label']

In [177]:
print(X)

0        bangladesh seek support to move fleeing rohing...
1        argentina set to pas pension reform despite me...
2        clinton call trump a loose cannon risky choice...
3        julian assange everything that he ha said he s...
4        u republican rand paul suspends num white hous...
                               ...                        
44893    ellen degeneres tell hillary clinton to her fa...
44894    british police say lifting road closure near b...
44895    fake news alert cnn finally release actual pic...
44896    uk pm may say belief northern irish accord pos...
44897    afghanistan political turmoil deepens a region...
Name: Article, Length: 44898, dtype: object


In [178]:
print(Y)

0        1
1        1
2        1
3        0
4        1
        ..
44893    0
44894    1
44895    0
44896    1
44897    1
Name: label, Length: 44898, dtype: int64


In [179]:
vectorizer=TfidfVectorizer()
vectorizer.fit(X)
X=vectorizer.transform(X)

In [180]:
print(X)

  (0, 217529)	0.03794775469662587
  (0, 215592)	0.027344906939694404
  (0, 215363)	0.03906476092826124
  (0, 214454)	0.012182127913204426
  (0, 212940)	0.01737531796024328
  (0, 212894)	0.024263612374401815
  (0, 212885)	0.03689750622343907
  (0, 212029)	0.02642994094736015
  (0, 210638)	0.041785798942745846
  (0, 209230)	0.02621408703299258
  (0, 207803)	0.0723679787509896
  (0, 204216)	0.030145243403962536
  (0, 203994)	0.04111206407286128
  (0, 202891)	0.10422598463770301
  (0, 202660)	0.022778579090724515
  (0, 200529)	0.022056154032919866
  (0, 196711)	0.05705011775105988
  (0, 195498)	0.029976875431393445
  (0, 194675)	0.16367764480335104
  (0, 192920)	0.031633632931775804
  (0, 192589)	0.028356403779485808
  (0, 191941)	0.01935216392155152
  (0, 191719)	0.20845196927540602
  (0, 191271)	0.021966085235606076
  (0, 191100)	0.13050738632713696
  :	:
  (44897, 16782)	0.027030678747125718
  (44897, 15782)	0.06157918359469137
  (44897, 15164)	0.03008071092889189
  (44897, 13556)	0.034

In [181]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [182]:
print(Y_test.reset_index())

      index  label
0     41450      1
1     25207      1
2     11901      0
3     35537      0
4     32758      0
...     ...    ...
8975  18539      1
8976  44234      1
8977  33934      0
8978  11722      0
8979  10187      0

[8980 rows x 2 columns]


In [183]:
# Training the Model
model1=LogisticRegression()
model2= PassiveAggressiveClassifier(C = 0.5, random_state = 5)
model1.fit(X_train,Y_train)

In [184]:
model2.fit(X_train,Y_train)

In [185]:
# Accuracy on training data
training_pred=model1.predict(X_train)
print(accuracy_score(training_pred,Y_train))

0.9909516120051228


In [186]:
training_pred2=model2.predict(X_train)
print(accuracy_score(training_pred2,Y_train))

0.9999443176123393


In [187]:
# Acuuracy on Test Data :
testing_pred=model1.predict(X_test)
print(accuracy_score(testing_pred,Y_test))

0.9865256124721603


In [188]:
training_pred2=model2.predict(X_test)
print(accuracy_score(training_pred2,Y_test))

0.9944320712694877


# Making a Prediction System

In [189]:
X_new=X_test[0]

pred=model2.predict(X_new)
print(pred)

if(pred[0] == 0):
  print("The news is Real")
else:
  print("The news is fake")

[1]
The news is fake


In [191]:
X_new=X_test[1]

pred=model2.predict(X_new)
print(pred)

if(pred[0] == 0):
  print("The news is Real")
else:
  print("The news is fake")

[1]
The news is fake
