<a href="https://colab.research.google.com/github/Matis-Trump/Matis-Trump/blob/main/Fake_news_detection_with_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Dependencies

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Download the NLTK stopword corpus so that stopwords.words('english') can be used

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Show the full English stopword list; stemming() later drops these words from the text
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Preprocessing

In [4]:
# Load the WELFake CSV into a DataFrame.
# NOTE(review): this absolute path presumably relies on Google Drive being mounted at
# /content/drive; no mount cell is visible in this notebook — confirm before Run All.
news_df = pd.read_csv('/content/drive/MyDrive/MY ML PROJECTS/WELFake_Dataset.csv')

In [5]:
# Reduce the dataset to a 10% random sample (frac=0.1) with a fixed seed for
# reproducibility; the original dataset is too bulky for stemming.
# NOTE: this rebinds news_df, so re-running the cell samples 10% of the already
# reduced frame — reload the CSV first if the cell has to be run again.
news_df = news_df.sample(frac=0.1, random_state=42)

In [6]:
# Confirm the size of the sampled frame as (rows, columns)
news_df.shape


(7213, 4)

In [7]:
# Preview the first rows to check the title / text / label columns
news_df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
61370,61370,ARNOLD SCHWARZENEGGER Sends A Message To Liber...,,1
2189,2189,WOW! “We Mexicans Need To Kill Donald Trump Be...,And now a message of peace and unity from one ...,1
60609,60609,Jimmy Carter recovers from dehydration scare i...,"WINNIPEG, Manitoba (Reuters) - Former U.S. Pre...",0
51565,51565,2 Friars’ Mission: Reviving a Brooklyn Church ...,"The two Franciscan friars, complete with rob...",0
39431,39431,Boy With Autism Makes His First Friend Ever An...,Approximately 1 in 68 children has an autism s...,1


In [8]:
# Count missing values per column to decide how they need to be handled
news_df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,49
text,5
label,0


In [9]:
# Replace missing values with empty strings so that title and text can be
# concatenated as strings in the next step
news_df = news_df.fillna('')

In [10]:
# Merge the title into the article body so both contribute to the features.
# NOTE: this overwrites 'text' in place, so re-running the cell prepends the
# title a second time.
news_df['text'] = news_df['title']+ ' ' +news_df['text']

Stemming: reducing each word to its root form

In [11]:
# Shared Porter stemmer instance, used by stemming() for every token
port_stem = PorterStemmer()

In [12]:
def stemming(text):
  """Normalize raw text into a space-joined string of Porter-stemmed tokens.

  Every non-letter character is replaced by a space, the result is
  lower-cased and split on whitespace, English stopwords are dropped, and
  each remaining token is reduced to its stem with ``port_stem``.

  Args:
    text: The raw string to normalize.

  Returns:
    A single string of lower-case stemmed tokens separated by spaces.
  """
  # Evaluating stopwords.words('english') inside the comprehension would
  # rebuild the list for every token and then scan it linearly. Build it once,
  # cache it on the function object, and use a frozenset for O(1) lookups.
  stop_words = getattr(stemming, '_stop_words', None)
  if stop_words is None:
    stop_words = frozenset(stopwords.words('english'))
    stemming._stop_words = stop_words

  tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
  return ' '.join(port_stem.stem(word) for word in tokens if word not in stop_words)

In [13]:
# Run the stemming pipeline over every document, replacing the raw text in place
news_df['text'] = news_df['text'].map(stemming)

In [14]:
# Inspect the stemmed text column to verify the preprocessing result
print(news_df['text'])

61370    arnold schwarzenegg send messag liber whine tr...
2189     wow mexican need kill donald trump becom presi...
60609    jimmi carter recov dehydr scare canada winnipe...
51565    friar mission reviv brooklyn church religi dea...
39431    boy autism make first friend ever mom stop cri...
                               ...                        
46006    obama condemn trump say u bless muslim commun ...
33197    czech foreign minist lightli injur car accid m...
61712    hous intel slap subpoena mccain institut assoc...
17836    terror group plan violenc trump support shock ...
47596    media blm block street storm footbal field bla...
Name: text, Length: 7213, dtype: object


In [15]:
# Separate the model inputs (stemmed text) from the target labels
X, Y = news_df['text'], news_df['label']

Converting the text content to numerical features using a TF-IDF vectorizer

In [16]:
# Build the TF-IDF vocabulary from the stemmed text and encode every document
# as a sparse row of TF-IDF weights.
#
# The input is read from news_df['text'] (the same Series X was bound to)
# rather than from X itself: rebinding X from X made the cell fail on a second
# run, because X would then already be a sparse matrix instead of raw text.
# fit_transform learns the vocabulary and encodes the documents in one pass.
#
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so the IDF statistics include test documents (data leakage). Consider
# splitting first and fitting on the training text only — confirm intent.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(news_df['text'])

In [17]:
# Inspect the encoded features: a sparse matrix with one row per document
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1373892 stored elements and shape (7213, 49874)>
  Coords	Values
  (0, 2337)	0.5036391898635276
  (0, 24977)	0.2523179278259548
  (0, 27542)	0.25173251353186943
  (0, 38580)	0.5620243596050936
  (0, 39009)	0.2664051242934325
  (0, 44753)	0.13085944238096534
  (0, 46909)	0.18452569390879997
  (0, 48030)	0.42578982485500627
  (1, 1455)	0.05491731318923866
  (1, 1964)	0.1279073104444237
  (1, 2514)	0.12324269542430656
  (1, 3851)	0.20420095929831397
  (1, 4299)	0.08044621369176619
  (1, 5095)	0.1032140203795777
  (1, 5207)	0.17742066586511154
  (1, 5352)	0.11185982371007026
  (1, 6382)	0.04901388292825424
  (1, 9583)	0.20688329924316526
  (1, 11824)	0.2072969755242485
  (1, 13863)	0.06928734426521017
  (1, 14067)	0.10390312579572589
  (1, 17055)	0.10211611269354866
  (1, 17102)	0.09448069106515511
  (1, 20294)	0.09930160053682865
  (1, 20380)	0.08862516342190765
  :	:
  (7212, 44739)	0.05285062754845705
  (7212, 45093)	0.077541

Splitting the dataset into training and test data

In [18]:
# Hold out 25% of the rows for evaluation; stratifying on Y keeps the label
# proportions equal in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    stratify=Y,
    random_state=3,
)

Training the Model: Logistic Regression


In [19]:
# Create a logistic-regression classifier with scikit-learn's default settings
# and fit it on the training split only
model = LogisticRegression()
model.fit(X_train, Y_train)

Model Evaluation

In [20]:
# Accuracy score on the training dataset.
# Predict once and pass the arguments in accuracy_score's documented
# (y_true, y_pred) order; the score itself is unaffected by the order.
training_predictions = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, training_predictions)
print('Accuracy of training data:', training_data_accuracy)

Accuracy of training data: 0.9528563505268997


In [21]:
# Accuracy score on the held-out test dataset.
# Kept in its own variable, and labelled as test data in the printed message,
# so it cannot be mistaken for the training score.
test_predictions = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, test_predictions)
print('Accuracy of test data:', test_data_accuracy)

Accuracy of training data: 0.9041019955654102


Making a Predictive System

In [22]:
# Spot-check a single held-out document: predict its label and compare it with
# the true label stored at the same position in Y_test.
# The position is defined once so that the prediction and the ground-truth
# lookup always refer to the same row; it must be smaller than X_test.shape[0].
sample_index = 999

X_new = X_test[sample_index]

prediction = model.predict(X_new)

print(prediction)

# NOTE(review): this treats label 0 as false news and anything else as real.
# In the head() preview, the Reuters-sourced row carries label 0 and the
# sensational "WOW!" headline carries label 1 — verify the dataset's label
# convention before trusting these messages.
if prediction[0] == 0:
  print('The news is false')
else:
  print('The news is real')

print('Confirmation from the original dataset:','\n', Y_test.iloc[sample_index])

[0]
The news is false
Confirmation from the original dataset: 
 0
