In [11]:
import numpy as p
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [15]:
## DATA PREPROCESSING

news_df = pd.read_csv(r'G:\ML projects\fake-news-detection\dataset\fake_news_dataset.csv')
news_df.shape

(20000, 7)

In [16]:
news_df.head(3)

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake


In [18]:
### COUNT THE NO.OF MISSING VALUES IN ALL DATASET

news_df.isnull().sum()

title          0
text           0
date           0
source      1000
author      1000
category       0
label          0
dtype: int64

In [20]:
news_df = news_df.fillna('')

In [21]:
news_df['content'] = news_df['author'] + '' + news_df['title']
print(news_df['content'])

0                      Paula GeorgeForeign Democrat final.
1           Joseph HillTo offer down resource great point.
2               Julia RobinsonHimself church myself carry.
3                 Mr. David Foster DDSYou unit its should.
4        Austin WalkerBillion believe employee summer how.
                               ...                        
19995                          Gary MilesHouse party born.
19996    Maria McbrideThough nation people maybe price ...
19997      Kristen FranklinYet exist with experience unit.
19998                   David WiseSchool wide itself item.
19999         James PetersonOffer chair cover senior born.
Name: content, Length: 20000, dtype: object


In [22]:
## Seperate labels and features

X = news_df.drop(columns='label' , axis=1)
Y = news_df['label']

In [27]:
print(X.columns)

Index(['title', 'text', 'date', 'source', 'author', 'category', 'content'], dtype='object')


In [28]:
print(X.content)

0                      Paula GeorgeForeign Democrat final.
1           Joseph HillTo offer down resource great point.
2               Julia RobinsonHimself church myself carry.
3                 Mr. David Foster DDSYou unit its should.
4        Austin WalkerBillion believe employee summer how.
                               ...                        
19995                          Gary MilesHouse party born.
19996    Maria McbrideThough nation people maybe price ...
19997      Kristen FranklinYet exist with experience unit.
19998                   David WiseSchool wide itself item.
19999         James PetersonOffer chair cover senior born.
Name: content, Length: 20000, dtype: object


In [29]:
print(Y)

0        real
1        fake
2        fake
3        fake
4        fake
         ... 
19995    fake
19996    real
19997    real
19998    fake
19999    fake
Name: label, Length: 20000, dtype: object


In [44]:
port_stem = PorterStemmer()

In [45]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]' , '', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ''.join(stemmed_content)
    return stemmed_content

In [47]:
news_df['content'] = news_df['content'].apply(stemming)
print(news_df['content'])

0                      paulageorgeforeigndemocratfin
1            josephhilltoofferdownresourcegreatpoint
2              juliarobinsonhimselfchurchmyselfcarri
3                   mrdavidfosterddsyouunititsshould
4        austinwalkerbillionbelieveemployeesummerhow
                            ...                     
19995                        garymileshousepartyborn
19996    mariamcbridethoughnationpeoplemaybepricebox
19997      kristenfranklinyetexistwithexperienceunit
19998                  davidwiseschoolwideitselfitem
19999         jamespetersonofferchaircoverseniorborn
Name: content, Length: 20000, dtype: object


In [53]:
## Let's use label encoder to convert string categorical data to numerical data

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
news_df['label'] = le.fit_transform(news_df['label'])
news_df.head(2)

Unnamed: 0,title,text,date,source,author,category,label,content
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,1,paulageorgeforeigndemocratfin
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,0,josephhilltoofferdownresourcegreatpoint


In [54]:
X = news_df['content'].values
Y = news_df['label'].values

In [55]:
print(X)

['paulageorgeforeigndemocratfin' 'josephhilltoofferdownresourcegreatpoint'
 'juliarobinsonhimselfchurchmyselfcarri' ...
 'kristenfranklinyetexistwithexperienceunit'
 'davidwiseschoolwideitselfitem' 'jamespetersonofferchaircoverseniorborn']


In [56]:
print(Y)

[1 0 0 ... 1 0 0]


In [57]:
Y.shape

(20000,)

In [58]:
## Converting text ata to numerical

vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 20000 stored elements and shape (20000, 20000)>
  Coords	Values
  (0, 15227)	1.0
  (1, 9652)	1.0
  (2, 9990)	1.0
  (3, 14250)	1.0
  (4, 1701)	1.0
  (5, 17552)	1.0
  (6, 539)	1.0
  (7, 18889)	1.0
  (8, 7055)	1.0
  (9, 6313)	1.0
  (10, 5572)	1.0
  (11, 955)	1.0
  (12, 5379)	1.0
  (13, 16540)	1.0
  (14, 6716)	1.0
  (15, 10222)	1.0
  (16, 7815)	1.0
  (17, 4698)	1.0
  (18, 1033)	1.0
  (19, 5416)	1.0
  (20, 7494)	1.0
  (21, 14783)	1.0
  (22, 9236)	1.0
  (23, 19000)	1.0
  (24, 9358)	1.0
  :	:
  (19975, 16152)	1.0
  (19976, 13824)	1.0
  (19977, 3251)	1.0
  (19978, 12438)	1.0
  (19979, 8158)	1.0
  (19980, 572)	1.0
  (19981, 12946)	1.0
  (19982, 15266)	1.0
  (19983, 12906)	1.0
  (19984, 4966)	1.0
  (19985, 4225)	1.0
  (19986, 11255)	1.0
  (19987, 14218)	1.0
  (19988, 19470)	1.0
  (19989, 7330)	1.0
  (19990, 7302)	1.0
  (19991, 14915)	1.0
  (19992, 19136)	1.0
  (19993, 349)	1.0
  (19994, 17081)	1.0
  (19995, 6632)	1.0
  (19996, 12414)	

In [59]:
## splitting the training and testing data 

X_train , X_test , Y_train , Y_test = train_test_split( X , Y , test_size=0.2 , stratify = Y , random_state=2)


In [60]:
X_train.shape

(16000, 20000)

In [61]:
Y_train.shape

(16000,)

In [62]:
## Training the model 

log_reg_model = LogisticRegression()
log_reg_model.fit(X_train , Y_train)

In [64]:
## Evaluation on training data

X_train_prediction = log_reg_model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction , Y_train)
print('Accuracy score of the training data :' , training_data_accuracy)

Accuracy score of the training data : 1.0


In [65]:
## Evaluation on test data

X_test_prediction = log_reg_model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction , Y_test)

In [None]:
print('')