In [5]:
# we are going to make fake news recognizer using logistic regression model
import numpy as np
import pandas as pd
import re 
import nltk
# removes stop words - words that have no meaning
from nltk.corpus import stopwords
# removes prefixes and suffixes and returns the root word
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
# run it if nltk does not work:

import ssl

ssl._create_default_https_context = ssl._create_unverified_context

In [7]:
data = pd.read_csv("news.csv")

In [8]:
data.shape

(20800, 5)

In [9]:
print(data.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [10]:
data

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maxkucher/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# list of stop words - words that have minimal semantic meaning

print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [13]:
# number of missing values ​​in each column
data.isnull().sum()



id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [14]:
# replace empty, zero values ​​with empty strings
data = data.fillna('')


In [15]:
# no more empty values
data.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [16]:
# we will use title and author instead of text to predict whether the text is true or not
# let's combine the title and author arrays into one (we'll also leave problems between them)

data["content"] = data["title"] + " " + data["author"]

In [17]:

# display the newly created column
print(data["content"])

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2        Why the Truth Might Get You Fired Consortiumne...
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
20795    Rapper T.I.: Trump a ’Poster Child For White S...
20796    N.F.L. Playoffs: Schedule, Matchups and Odds -...
20797    Macy’s Is Said to Receive Takeover Approach by...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799              What Keeps the F-35 Alive David Swanson
Name: content, Length: 20800, dtype: object


In [18]:
# separate the labels (what we will predict) and the rest of the data


# In the Pandas drop method, the axis parameter specifies which axis to perform the operation on.
# In this case, axis=1 means you are removing the column.
x = data.drop(columns="label", axis=1)
y = data["label"]


In [19]:
x

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,Rapper T.I.: Trump a ’Poster Child For White S...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,"N.F.L. Playoffs: Schedule, Matchups and Odds -..."
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,Macy’s Is Said to Receive Takeover Approach by...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal..."


In [20]:
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64

In [21]:
# now we will use Stemming - this is the process of processing text data
# for the purpose of highlighting the stem (or root) of a word.

stem = PorterStemmer()

In [22]:
# function for stemming
def stemming(content):
    # This line uses a regular expression (re) to replace all non-English characters with a space.
    # So it removes all non-letter characters.
    stemmed = re.sub('[^a-zA-z]', ' ', content) 
    # Converts all letters in the text to lowercase.
    stemmed = stemmed.lower()
    # Separates text into words and puts them in a list..
    stemmed = stemmed.split()
    # This list inclusion performs stemming for every word in the list, excluding stop words
    # (words that usually do not carry meaning and can be excluded).
    # The stemmer from the NLTK library is used.
    stemmed = [stem.stem(word) for word in stemmed if not word in stopwords.words("english")]
    # Collects stemmed words back into a string, separating them with spaces.
    stemmed = ' '.join(stemmed)
    return stemmed

In [23]:
# apply the stemming function to all objects inside the content array
data['content'] = data['content'].apply(stemming)

In [24]:
# output stemming results
print(data["content"])

0        hous dem aid even see comey letter jason chaff...
1        flynn hillari clinton big woman campu breitbar...
2                   truth might get fire consortiumnew com
3        civilian kill singl us airstrik identifi jessi...
4        iranian woman jail fiction unpublish stori wom...
                               ...                        
20795    rapper trump poster child white supremaci jero...
20796    n f l playoff schedul matchup odd new york tim...
20797    maci said receiv takeov approach hudson bay ne...
20798    nato russia hold parallel exercis balkan alex ...
20799                            keep f aliv david swanson
Name: content, Length: 20800, dtype: object


In [25]:
# We divide the data into features and labels. As signs we use the content array, which was previously processed
# stemming function
x = data["content"].values
y = data["label"].values

In [26]:
print(x)
print(y)
# explanation for array y: 1 - fake news, 0 - real

['hous dem aid even see comey letter jason chaffetz tweet darrel lucu'
 'flynn hillari clinton big woman campu breitbart daniel j flynn'
 'truth might get fire consortiumnew com' ...
 'maci said receiv takeov approach hudson bay new york time michael j de la merc rachel abram'
 'nato russia hold parallel exercis balkan alex ansari'
 'keep f aliv david swanson']
[1 0 1 ... 0 1 1]


In [27]:
y.shape

(20800,)

In [28]:
x.shape

(20800,)

In [29]:
# next we convert text data to numbers 
vect = TfidfVectorizer()
# this is a class from the scikit-learn library designed to transform a collection of text documents
# into a feature matrix based on the TF-IDF (Term Frequency-Inverse Document Frequency) method.
# TF-IDF is a statistical measure used to evaluate the importance of a word in the context of a collection of documents.

vect.fit(x)

x = vect.transform(x)

In [30]:
print(x)

  (0, 15697)	0.28485063562728646
  (0, 13480)	0.2565896679337957
  (0, 8915)	0.3635963806326075
  (0, 8636)	0.29212514087043684
  (0, 7698)	0.24785219520671603
  (0, 7012)	0.21874169089359144
  (0, 4979)	0.233316966909351
  (0, 3795)	0.2705332480845492
  (0, 3603)	0.3598939188262559
  (0, 2962)	0.2468450128533713
  (0, 2485)	0.3676519686797209
  (0, 268)	0.27010124977708766
  (1, 16812)	0.30071745655510157
  (1, 6823)	0.1904660198296849
  (1, 5510)	0.7143299355715573
  (1, 3571)	0.26373768806048464
  (1, 2816)	0.19094574062359204
  (1, 2224)	0.3827320386859759
  (1, 1895)	0.15521974226349364
  (1, 1498)	0.2939891562094648
  (2, 15622)	0.41544962664721613
  (2, 9625)	0.49351492943649944
  (2, 5975)	0.3474613386728292
  (2, 5396)	0.3866530551182615
  (2, 3106)	0.46097489583229645
  :	:
  (20797, 13128)	0.2482526352197606
  (20797, 12350)	0.27263457663336677
  (20797, 12144)	0.24778257724396507
  (20797, 10311)	0.08038079000566466
  (20797, 9593)	0.174553480255222
  (20797, 9523)	0.295420

In [31]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)


# stratify=y
# This option is used when partitioning data containing class labels (such as classification tasks).
# If stratify is set to y (where y is a vector of class labels), then the division will be done like this:
# so that the distribution of classes in the training and test sets is approximately the same as in the original data set.

# random_state=42:
# This parameter sets the seed for the random number generator.
# Setting random_state to a specific value, such as 42, ensures data splitting is reproducible.
# Each time the program is run with the same random_state value, the data will be split in the same way.

In [32]:
# we are going to use logistic regression methom in our model

model = LogisticRegression()

model.fit(x_train, y_train)

In [33]:
model.score(x_test, y_test)

0.9752403846153846

In [34]:
model.score(x_train, y_train)

0.9874399038461539

In [35]:
predicted_value = x_test[2]
predictions = model.predict(predicted_value)

In [36]:
# compare predictions and real results
print(f"Predicted value: {predictions}")
print(f"Actual value: {y_test[2]}")


Predicted value: [1]
Actual value: 1


In [37]:

# results output
actuals = y_test[2]
if actuals == 1 and predictions == 1:
    actuals == "Fake"
    predictions = "Fake"
    print(f"Actual value is {actuals}, predicted value is {predictions}")
elif actuals == 0 and predictions == 0:
    actuals == "Real"
    predictions = "Real"
    print(f"Actual value is {actuals}, predicted value is {predictions}")
# print(predictions)

Actual value is 1, predicted value is Fake
