In [6]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
import nltk
# Download the NLTK 'stopwords' corpus, which `stopwords.words('english')` reads in the following cells.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Print the English stop-word list from NLTK; the `stemming` function
# defined later in this notebook filters these words out of every article.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Preprocessing

In [28]:
# Load the training data (pandas is imported as `pd` in the notebook's import cell).
# The path is hard-coded to the Colab working directory.
try:
    dataset = pd.read_csv('/content/train.csv', engine='python')
except pd.errors.ParserError:
    # Fall back to skipping malformed rows. `error_bad_lines=False` was
    # deprecated in pandas 1.3 and removed in pandas 2.0 (passing it raises
    # TypeError there); `on_bad_lines='skip'` is the supported replacement.
    dataset = pd.read_csv('/content/train.csv', engine='python', on_bad_lines='skip')
    print("Warning: Skipped problematic rows due to parsing errors.")

# If encoding issues are suspected, try specifying the encoding:
# dataset = pd.read_csv('/content/train.csv', engine='python', encoding='latin-1') # or 'utf-8', 'iso-8859-1', etc.

# If the delimiter is not a comma, specify it using the 'sep' argument:
# dataset = pd.read_csv('/content/train.csv', engine='python', sep='\t') # for tab-separated values

# Inspect the data for inconsistencies:
print(dataset.head())  # View the first few rows to check for issues
print(dataset.tail())  # View the last few rows
print(dataset.isnull().sum())  # Check for missing values in each column

                                                text label
0  The court granted by a 5-4 vote a request made...  real
1  " Pennsylvania was a crucial swing state in th...  real
2  The company today is rolling out an update to ...  fake
3  When it comes to trade policy, Hillary Clinton...  real
4  S. stocks had their worst April start since 19...  real
                                                     text label
231998  Cipher Pharmaceuticals Inc * Cipher pharmaceut...  real
231999  Lyft and larger rival Uber face separate lawsu...  real
232000  Six-month Sterling Overnight Index Average (SO...  real
232001  Going on Museum Hack's Badass Bitches tour -- ...  fake
232002  "We will continue to fight on behalf of all wo...  real
text     0
label    0
dtype: int64


In [29]:
# Preview the first rows of the freshly loaded frame (labels are still the strings 'real'/'fake' here).
dataset.head()

Unnamed: 0,text,label
0,The court granted by a 5-4 vote a request made...,real
1,""" Pennsylvania was a crucial swing state in th...",real
2,The company today is rolling out an update to ...,fake
3,"When it comes to trade policy, Hillary Clinton...",real
4,S. stocks had their worst April start since 19...,real


In [30]:
# (rows, columns) of the loaded dataset.
dataset.shape

(232003, 2)

In [14]:
# Number of missing values in each column.
dataset.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [31]:
# Encode the string labels as integers: 'fake' -> 1, 'real' -> 0.
# The original `Series.replace({...})` call emitted a warning on this line
# (str -> int replacement relies on pandas' deprecated silent downcasting).
# Mapping each value explicitly produces the same column without relying on
# that behaviour. Values that are not in the mapping pass through unchanged,
# exactly as `replace` left them, so re-running this cell on already-encoded
# labels is still a no-op.
label_codes = {'fake': 1, 'real': 0}
dataset['label'] = dataset['label'].map(lambda value: label_codes.get(value, value))

  dataset['label'] = dataset['label'].replace({'fake':1,'real':0})


In [39]:
# Separate the dataset into the feature column (X) and the target column (Y).
# Both are pandas Series taken from the un-stemmed frame; X and Y are
# reassigned from the stemmed frame later in the notebook.
X = dataset['text']
Y = dataset['label']

In [40]:
# Display the label Series (0 = real, 1 = fake after the encoding cell).
Y

Unnamed: 0,label
0,0
1,0
2,1
3,0
4,0
...,...
231998,0
231999,0
232000,0
232001,1


In [37]:
# Display the raw text Series.
X

Unnamed: 0,text
0,The court granted by a 5-4 vote a request made...
1,""" Pennsylvania was a crucial swing state in th..."
2,The company today is rolling out an update to ...
3,"When it comes to trade policy, Hillary Clinton..."
4,S. stocks had their worst April start since 19...
...,...
231998,Cipher Pharmaceuticals Inc * Cipher pharmaceut...
231999,Lyft and larger rival Uber face separate lawsu...
232000,Six-month Sterling Overnight Index Average (SO...
232001,Going on Museum Hack's Badass Bitches tour -- ...


In [35]:
# Preview the frame again to confirm the labels are now encoded as 0/1.
dataset.head()

Unnamed: 0,text,label
0,The court granted by a 5-4 vote a request made...,0
1,""" Pennsylvania was a crucial swing state in th...",0
2,The company today is rolling out an update to ...,1
3,"When it comes to trade policy, Hillary Clinton...",0
4,S. stocks had their worst April start since 19...,0


Stemming:

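Stemming is the process of reducing a word to its root form (its stem) by stripping suffixes. The Porter stemmer used below is rule-based, so the stem is not always a dictionary word.

example: acting, acted, acts --> act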

In [41]:
# Single shared Porter stemmer instance, used by `stemming` below.
port_stem = PorterStemmer()

In [42]:
# Cache for the English stop words, filled on first use. The previous version
# called `stopwords.words('english')` inside the comprehension, i.e. once per
# token, and tested membership against a list (a linear scan each time).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stemming(content):
    """Normalise one document for vectorisation.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase the text and split it on whitespace,
      3. drop English stop words,
      4. reduce each remaining word to its Porter stem.

    Parameters
    ----------
    content : str
        Raw text of a single article.

    Returns
    -------
    str
        The surviving stems joined by single spaces (empty string if nothing
        survives).
    """
    stop_words = _english_stopwords()
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(port_stem.stem(word) for word in tokens if word not in stop_words)

In [43]:
# Run `stemming` on every article. This overwrites the raw text column in
# place, so the original text is no longer available in `dataset` afterwards.
dataset['text'] = dataset['text'].apply(stemming)

In [44]:
# Display the stemmed text column.
dataset['text']

Unnamed: 0,text
0,court grant vote request made state busi group...
1,pennsylvania crucial swing state elect trump r...
2,compani today roll updat android oper system i...
3,come trade polici hillari clinton take heat si...
4,stock worst april start sinc big pictur presid...
...,...
231998,cipher pharmaceut inc cipher pharmaceut announ...
231999,lyft larger rival uber face separ lawsuit brou...
232000,six month sterl overnight index averag sonia s...
232001,go museum hack badass bitch tour guerrilla gir...


In [45]:
# Pull the stemmed text (features) and the encoded labels (targets) out of
# the frame as plain NumPy arrays.
X, Y = dataset['text'].values, dataset['label'].values

In [46]:
# Display the array of stemmed documents.
X

array(['court grant vote request made state busi group block administr clean power plan requir state develop plan reduc greenhous ga emiss energi sector court rule prevent regul implement litig continu whether rule permiss feder law vote split along ideolog line',
       'pennsylvania crucial swing state elect trump republican democrat oppon hillari clinton trump end win state nearli million vote night shapiro democrat vote trump becom state attorney gener shapiro said polit consider play role decis help coordin',
       'compani today roll updat android oper system includ number new featur improv updat call android lollipop avail download googl play store android market come varieti improv includ new camera app let zoom camera option turn app use updat also add support appl new face id',
       ...,
       'six month sterl overnight index averag sonia swap rate reflect market expect interest rate hit highest sinc june day brexit vote result basi point recent warn market price potenti 

In [48]:
# Shape of the label array (one entry per document).
Y.shape

(232003,)

In [49]:
# Convert the stemmed documents into a TF-IDF feature matrix.
# `fit_transform` learns the vocabulary/IDF weights and transforms in one pass.
# NOTE(review): the vectorizer is fitted on every row, before the train/test
# split performed later in the notebook, so test documents contribute to the
# vocabulary and IDF weights. Consider fitting on the training rows only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [51]:
# Print the TF-IDF matrix; each output line is (row, column) followed by the weight.
print(X)

  (0, 1044)	0.12656659529118305
  (0, 2776)	0.16134747163443747
  (0, 9985)	0.16206232915710422
  (0, 12504)	0.11495967401022135
  (0, 16496)	0.18550774527332198
  (0, 17964)	0.11190239785241972
  (0, 18548)	0.24288785343527128
  (0, 21700)	0.12192672439010888
  (0, 25695)	0.1768088113328282
  (0, 25948)	0.13275385037337856
  (0, 28531)	0.1177380290627748
  (0, 31388)	0.14354340147999967
  (0, 34281)	0.17429276730308652
  (0, 34421)	0.20035242697401295
  (0, 34710)	0.10302891919018839
  (0, 39809)	0.23104586049810696
  (0, 40440)	0.17330238938411155
  (0, 48830)	0.1264633395607164
  (0, 50105)	0.139658680263127
  (0, 50310)	0.20660401581715662
  (0, 51863)	0.122672362683997
  (0, 66150)	0.2129728001586116
  (0, 67298)	0.21238078884427694
  (0, 68285)	0.12637278971944121
  (0, 68787)	0.1576815942874815
  :	:
  (232001, 86097)	0.1710651479099903
  (232001, 88178)	0.20701191235706318
  (232002, 826)	0.2561397591837281
  (232002, 894)	0.15247739030960866
  (232002, 2843)	0.0973226726037529

Splitting the dataset to training & test data



In [52]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the real/fake
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [53]:
# (training documents, vocabulary size) of the training matrix.
X_train.shape

(185602, 99319)

Training the Model: Logistic Regression

In [54]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [55]:
# Fit the classifier on the training split only.
model.fit(X_train, Y_train)

Evaluation

accuracy score

In [56]:
# accuracy score on training data
X_train_prediction = model.predict(X_train)
# accuracy_score's signature is (y_true, y_pred): true labels go first.
# Accuracy itself is symmetric, so the reported value is unchanged, but the
# correct order matters as soon as another metric is swapped in.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [58]:
# Report the fraction of training documents classified correctly.
print('Accuracy score of training data : ', training_data_accuracy)

Accuracy score of training data :  0.9208090430060021


In [59]:
# accuracy score on test data
X_test_prediction = model.predict(X_test)
# accuracy_score's signature is (y_true, y_pred): true labels go first.
# Accuracy itself is symmetric, so the reported value is unchanged, but the
# correct order matters as soon as another metric is swapped in.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [60]:
# Report the fraction of held-out test documents classified correctly.
print('Accuracy score of test data : ', test_data_accuracy)

Accuracy score of test data :  0.9118553479450874


Making a Predictive System

In [62]:
# Classify one held-out article: row 2 of the test feature matrix.
x_test = X_test[2]
prediction = model.predict(x_test)
print(prediction)

# Label encoding used in this notebook: 0 means real, anything else means fake.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

[1]
The news is Fake


In [66]:
# Ground-truth label for the same test row, for comparison with the prediction.
print('The news is Real' if Y_test[2] == 0 else 'The news is Fake')

The news is Fake
