In [77]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [78]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads from.
# NOTE(review): this import sits outside the notebook's main import cell — consider moving it there.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [79]:
# Print the full English stop-word list provided by NLTK
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [80]:
# Load the dataset into a pandas DataFrame (decoded as Windows-1252 text, using the Python parser engine)
news_dataset = pd.read_csv('101.csv',engine='python',encoding='cp1252')

In [81]:
# (rows, columns) of the loaded dataset
news_dataset.shape

(30, 5)

In [82]:
# Preview the first five rows of the dataset
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,Rotary Club provides 28 different medical equi...,Keshav Adhikari,Dhading Hospital has received ventilators and ...,0
1,1,New Ambassador of India to Nepal arrives in Ka...,THT Online,"Naveen Srivastava, the new Ambassador of India...",0
2,2,"KMC unveils policies, programmes for new fiscal",HIMALAYAN NEWS SERVICE,Kathmandu Metropolitan City has unveiled the p...,0
3,3,Journalist Rabi Lamichhane announces new polit...,HIMALYAN NEWS SERVICE\n,Former TV anchor Rabi Lamichhane has announced...,0
4,4,NOC slashes fuel prices,Himalayan News Service\n,Offering some respite to the general public gr...,0


In [83]:
# Count the missing values in each column
news_dataset.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [84]:
# Replace any null values with empty strings so the string concatenation that follows cannot produce NaN
# (the null count shown earlier is zero for every column, so this is only a safeguard for this file)
news_dataset = news_dataset.fillna('')

In [85]:
# Build the text used for modelling: author name and news title joined by a single space
news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']

In [86]:
# Inspect the combined author + title text
news_dataset.content

0     Keshav Adhikari Rotary Club provides 28 differ...
1     THT Online New Ambassador of India to Nepal ar...
2     HIMALAYAN NEWS SERVICE KMC unveils policies, p...
3     HIMALYAN NEWS SERVICE\n Journalist Rabi Lamich...
4      Himalayan News Service\n NOC slashes fuel prices
5     Rastriya Samachar Samiti Karnali Chief Ministe...
6     Ashish Dhakal In Kathmandu, Lalitpur and Bhakt...
7     Sanjog Shiwakoti KP Sharma Oli wrongly refers ...
8     Rastriya Samachar Samiti\n Nepal struggles wit...
9     Injina Panthi Ram Kumari Jhakri’s claim that O...
10    South Asia Check Commentator Saurabh  claims a...
11    Deepak Adhikari Some Nepali social media users...
12    Himalayan News Service Corruption case against...
13    Rastriya Samachar Samiti First kidney transpla...
14    Binod Ghimire No legal clarity on how long the...
15    Onlinekhabar China praises Nepal’s decision to...
16    Deepak Adhikari Viral on Nepal social media, ‘...
17    Birat Anupam   Itahari designates indigeno

In [87]:
# Split the frame into the target labels and the remaining feature columns
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [88]:
# Inspect the feature frame and the label series
print(X)
print(Y)

    id                                              title  \
0    0  Rotary Club provides 28 different medical equi...   
1    1  New Ambassador of India to Nepal arrives in Ka...   
2    2    KMC unveils policies, programmes for new fiscal   
3    3  Journalist Rabi Lamichhane announces new polit...   
4    4                            NOC slashes fuel prices   
5    5  Karnali Chief Minister reaches Mugu to provide...   
6    6  In Kathmandu, Lalitpur and Bhaktapur, the pace...   
7    7  KP Sharma Oli wrongly refers to little people ...   
8    8  Nepal struggles with high adolescent fertility...   
9    9  Ram Kumari Jhakri’s claim that Oli didn’t brin...   
10  10  Commentator Saurabh  claims a 1958 postage sta...   
11  11  Some Nepali social media users shared 5G incre...   
12  12  Corruption case against three in Bara's Jitpur...   
13  13            First kidney transplantation in Pokhara   
14  14  No legal clarity on how long the House of Repr...   
15  15  China praises Ne

In [89]:
# Porter stemmer instance shared by the stemming() helper
port_stem = PorterStemmer()


In [90]:
def stemming(content, stemmer=None, stop_words=None):
  """Normalise a piece of text into a space-joined string of stemmed keywords.

  Every non-letter character is replaced by a space, the text is lower-cased
  and split on whitespace, stop words are discarded, and each remaining token
  is reduced to its stem.

  Parameters
  ----------
  content : str
      Raw text to normalise.
  stemmer : object, optional
      Any object exposing ``stem(word) -> str``. Defaults to the
      notebook-level ``port_stem`` instance.
  stop_words : iterable of str, optional
      Lower-case words to drop. Defaults to NLTK's English stop-word list.

  Returns
  -------
  str
      The surviving stems joined by single spaces ('' when nothing survives).
  """
  if stemmer is None:
    stemmer = port_stem
  # Read the stop-word list once per call and use a set for O(1) membership tests;
  # previously the whole list was re-read from the corpus for every single token.
  if stop_words is None:
    stop_words = stopwords.words('english')
  stop_words = frozenset(stop_words)
  # The range must be 'A-Z': 'A-z' also spans [ \ ] ^ _ and the backtick, which
  # let those punctuation characters survive inside tokens.
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  tokens = letters_only.lower().split()
  return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

In [91]:
# Clean and stem every combined text. This overwrites the 'content' column, so re-running the
# cell would stem already-stemmed text.
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [92]:
# Inspect the cleaned, stemmed text
news_dataset['content']

0     keshav adhikari rotari club provid differ medi...
1     tht onlin new ambassador india nepal arriv kat...
2     himalayan news servic kmc unveil polici progra...
3     himalyan news servic journalist rabi lamichhan...
4            himalayan news servic noc slash fuel price
5     rastriya samachar samiti karnali chief minist ...
6     ashish dhakal kathmandu lalitpur bhaktapur pac...
7     sanjog shiwakoti kp sharma oli wrongli refer l...
8     rastriya samachar samiti nepal struggl high ad...
9     injina panthi ram kumari jhakri claim oli brin...
10    south asia check comment saurabh claim postag ...
11    deepak adhikari nepali social media user share...
12    himalayan news servic corrupt case three bara ...
13    rastriya samachar samiti first kidney transpla...
14    binod ghimir legal clariti long hous repres fu...
15    onlinekhabar china prais nepal decis join us l...
16    deepak adhikari viral nepal social media home ...
17    birat anupam itahari design indigen tharu 

In [93]:
# Separate the cleaned text (features) from the labels (targets)
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [94]:
 # Inspect the feature texts and their labels
 print(X)
 print(Y)

['keshav adhikari rotari club provid differ medic equip dhade hospit'
 'tht onlin new ambassador india nepal arriv kathmandu saturday'
 'himalayan news servic kmc unveil polici programm new fiscal'
 'himalyan news servic journalist rabi lamichhan announc new polit parti'
 'himalayan news servic noc slash fuel price'
 'rastriya samachar samiti karnali chief minist reach mugu provid relief fire victim'
 'ashish dhakal kathmandu lalitpur bhaktapur pace authent post heritag restor differ'
 'sanjog shiwakoti kp sharma oli wrongli refer littl peopl africa lilliput'
 'rastriya samachar samiti nepal struggl high adolesc fertil rate'
 'injina panthi ram kumari jhakri claim oli bring covid vaccin'
 'south asia check comment saurabh claim postag stamp pointi map nepal'
 'deepak adhikari nepali social media user share g increas covid case'
 'himalayan news servic corrupt case three bara jitpursimara'
 'rastriya samachar samiti first kidney transplant pokhara'
 'binod ghimir legal clariti long hous

In [95]:
# Number of text samples
X.shape

(30,)

In [96]:
# Number of labels (should match the number of text samples)
Y.shape

(30,)

In [97]:
# Turn each cleaned text into a TF-IDF weighted feature vector.
# NOTE(review): the vocabulary and IDF weights are learned from every row, including rows that
# later end up in the test split — consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [98]:
# Print the TF-IDF matrix; each entry is shown as (row, column) followed by its weight
print(X)


  (0, 186)	0.3287324038512732
  (0, 166)	0.2931014977409513
  (0, 128)	0.3287324038512732
  (0, 105)	0.3287324038512732
  (0, 84)	0.3287324038512732
  (0, 55)	0.3287324038512732
  (0, 52)	0.2931014977409513
  (0, 49)	0.3287324038512732
  (0, 36)	0.3287324038512732
  (0, 2)	0.26782096753459433
  (1, 208)	0.3687609495720031
  (1, 191)	0.3687609495720031
  (1, 141)	0.26046299825901637
  (1, 135)	0.30043254983780665
  (1, 133)	0.2321041501036102
  (1, 103)	0.3287913979932128
  (1, 88)	0.3687609495720031
  (1, 10)	0.3687609495720031
  (1, 6)	0.3687609495720031
  (2, 212)	0.3694775867210055
  (2, 194)	0.26096917295215993
  (2, 164)	0.3694775867210055
  (2, 154)	0.3694775867210055
  (2, 136)	0.26096917295215993
  (2, 135)	0.301016399961398
  :	:
  (27, 161)	0.44415944657017076
  (27, 141)	0.3137184164782159
  (27, 129)	0.44415944657017076
  (27, 72)	0.44415944657017076
  (27, 14)	0.44415944657017076
  (28, 168)	0.3216823043466742
  (28, 163)	0.3216823043466742
  (28, 140)	0.2620771940324509
 

In [99]:
# Hold out half of the samples for testing, stratified on the labels, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    stratify=Y,
    random_state=2,
)

In [100]:
# Logistic-regression classifier created with its default settings
model = LogisticRegression()

In [101]:
# Train the classifier on the training split
model.fit(X_train,Y_train)

LogisticRegression()

In [102]:
# Accuracy score on the training data
X_train_prediction = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): the ground-truth labels go first
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [103]:
# Fraction of training samples classified correctly
training_data_accuracy

0.6666666666666666

In [104]:
# Accuracy score on the held-out test data
X_test_prediction = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): the ground-truth labels go first
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [105]:
# Fraction of test samples classified correctly
testing_data_accuracy

0.7333333333333333

In [111]:
# Classify the first held-out sample and report the verdict (label 0 is reported as real news)
X_new = X_test[0]

prediction = model.predict(X_new)
print(prediction)

print('The news is real' if prediction[0] == 0 else 'The news is Fake')

[0]
The news is real


In [107]:
# True label of the first test sample (Y_test[0] pairs with X_test[0])
print(Y_test[0])

0
