In [None]:
import numpy as np
import pandas as pd
import re # for regular expression, when we have textual data, regular expresion comes handy, as we use a pattern when we want to look for some data in the text
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords #nltk is natural language tool kit, where using stopwords, we will get list of all those words which has less meaning
#"corpus" here refers to a large collection of textual data
from nltk.stem.porter import PorterStemmer #porterstemmer reduces words to its root word
from sklearn.feature_extraction.text import TfidfVectorizer #vectorizer converts textual to numerical data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Download the NLTK English stop-word corpus and load it into a list.
# Safe to re-run: NLTK reports when the package is already up to date.
import nltk
nltk.download('stopwords')
stopwordsList = stopwords.words('english')
# Show the size and a short preview instead of dumping the whole list into the output.
print(len(stopwordsList), stopwordsList[:10])

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the fake-news dataset from its CSV file and preview the first rows.
csv_path = '/content/fake_news_dataset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [None]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0,0
title,0
text,0
date,0
source,1000
author,1000
category,0
label,0


In [None]:
# `source` and `author` are the only columns with missing values (1000 each);
# news items often carry no source or author, so that is probably why they are absent.
# Replace every missing value in the frame with an empty string.
df = df.fillna(value='')

In [None]:
# Re-check the per-column missing-value counts after the fill.
df.isna().sum()

Unnamed: 0,0
title,0
text,0
date,0
source,0
author,0
category,0
label,0


In [None]:
# Show the frame's (rows, columns) dimensions.
df.shape

(20000, 7)

In [None]:
# Use the article body alone as the model input, stored in a new `content` column.
# An alternative that joins the metadata columns instead is kept below for reference:
# df['content'] = df['category'] +' ' + df['author'] +' '+ df['source'] +' '+ df['title']
df = df.assign(content=df['text'])
df.head()

Unnamed: 0,title,text,date,source,author,category,label,content
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real,more tax development both store agreement lawy...
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake,probably guess western behind likely next inve...
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake,them identify forward present success risk sev...
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake,phone which item yard Republican safe where po...
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake,wonder myself fact difficult course forget exa...


In [None]:
#now lets remove stopwords and perform stemming on top of content
# NOTE(review): no cell visible in this notebook applies stemming() to df['content'];
# the vectorizer further down is fed the raw text. Confirm whether that is intended.
port_stem = PorterStemmer()
# A frozenset gives constant-time membership checks; testing every token against
# the stop-word list would scan the whole list each time.
_stopword_set = frozenset(stopwordsList)
# Compiled once instead of being re-parsed on every call.
_non_letters = re.compile('[^a-zA-Z]')

def stemming(content):
  """Normalise a text string into space-separated stemmed tokens.

  Steps, in order:
    1. replace every character that is not an ASCII letter with a space,
    2. lower-case the text,
    3. split it on whitespace,
    4. drop the tokens found in the stop-word list,
    5. reduce each remaining token to its stem with the Porter stemmer.

  Args:
    content: the text to clean (a str).

  Returns:
    A single string with the stemmed tokens joined by one space; an empty
    string when no token survives.
  """
  words = _non_letters.sub(' ', content).lower().split()
  return ' '.join(port_stem.stem(word) for word in words if word not in _stopword_set)

In [None]:
#now lets create a method for the label, if real then 0, if fake then 1
def labelfunc(inp):
  """Encode a news label as an integer: 0 for real, 1 for anything else.

  Args:
    inp: the raw label. A string is compared case-insensitively and with
      surrounding whitespace ignored. A value that is already 0 or 1 is
      passed through, so encoding a column twice (for example when the
      cell is re-run) gives the same result instead of raising.

  Returns:
    0 when the label is 'real', otherwise 1.

  Raises:
    ValueError: if inp is neither a string nor an already-encoded 0/1.
  """
  if isinstance(inp, str):
    return 0 if inp.strip().lower() == 'real' else 1
  # Non-string input: accept only labels that were already encoded.
  if inp in (0, 1):
    return int(inp)
  raise ValueError(f"unrecognised label: {inp!r}")

# Replace the textual labels with their integer codes and preview the result.
df['label'] = df['label'].map(labelfunc)
df['label'].head()

Unnamed: 0,label
0,0
1,1
2,1
3,1
4,1


In [None]:
# Pull the model input (content text) and the target (encoded label) out of the frame.
x, y = df['content'].values, df['label'].values
print(x.shape, y.shape)

(20000,) (20000,)


In [None]:
# Convert each document into a TF-IDF weighted numerical vector.
# The matrix is built from df['content'] rather than from `x`, so re-running this
# cell after `x` has been rebound to the sparse matrix still works.
# NOTE(review): the vocabulary and IDF weights are learned from all rows, including
# the ones the later train_test_split holds out for testing; fit on the training
# split only if a leakage-free evaluation is needed.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df['content'].values)
x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4397197 stored elements and shape (20000, 969)>

In [None]:
# Print the TF-IDF matrix; the text form lists (row, column) coordinates with their stored weights.
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4397197 stored elements and shape (20000, 969)>
  Coords	Values
  (0, 2)	0.0603562207661069
  (0, 7)	0.05997771658524611
  (0, 8)	0.05976704775954439
  (0, 21)	0.06063179639427085
  (0, 26)	0.06020624531253104
  (0, 27)	0.0602490008918108
  (0, 31)	0.06004657985065417
  (0, 33)	0.06048011247571504
  (0, 45)	0.06043694788845418
  (0, 53)	0.060291831977671566
  (0, 60)	0.0603562207661069
  (0, 67)	0.06022761368049072
  (0, 79)	0.059584201796619676
  (0, 81)	0.06005188513254458
  (0, 86)	0.06058836124307728
  (0, 90)	0.12093862824932324
  (0, 92)	0.06038309996210357
  (0, 99)	0.06049631902230227
  (0, 100)	0.059851095826557274
  (0, 101)	0.060174228019346125
  (0, 108)	0.060078428963655714
  (0, 109)	0.06056125372257863
  (0, 115)	0.05963630298972909
  (0, 116)	0.06045852057928306
  (0, 132)	0.12149237031196183
  :	:
  (19999, 871)	0.061548800909775596
  (19999, 876)	0.06198700730854452
  (19999, 890)	0.061397233750172725
  (19

In [None]:
# Print the encoded label array (0 = real, 1 = fake).
print(y)

[0 1 1 ... 0 1 1]


In [None]:
# Split the vectorised data before fitting the classifier.
# 10% of the rows are held out for testing; stratify=y is passed so the split is
# stratified on the labels, and random_state is fixed so the split is repeatable.
splits = train_test_split(x, y, test_size=0.1, stratify=y, random_state=2)
x_train, x_test, y_train, y_test = splits
print(*(part.shape for part in splits))

(18000, 969) (2000, 969) (18000,) (2000,)


In [None]:
# Train a logistic-regression classifier with default settings on the training split.
model = LogisticRegression().fit(x_train, y_train)
model

In [None]:
# Accuracy on the data the model was trained on.
# accuracy_score expects (y_true, y_pred), so the ground truth goes first.
y_train_pred = model.predict(x_train)
print(accuracy_score(y_train, y_train_pred))

0.5989444444444444


In [None]:
# Accuracy on the held-out test split.
# accuracy_score expects (y_true, y_pred), so the ground truth goes first.
y_test_pred = model.predict(x_test)
print(accuracy_score(y_test, y_test_pred))

0.511


In [None]:
# Spot-check a single test row: print its true label, then display the model's prediction.
sample_idx = 0
print(y_test[sample_idx])
model.predict(x_test[sample_idx])

0


array([0])

In [None]:
# Spot-check a second test row: print its true label, then display the model's prediction.
sample_idx = 1
print(y_test[sample_idx])
model.predict(x_test[sample_idx])

0


array([1])

(20000, 8)