# Fake news detekcija

### Importovanje

In [23]:
import pandas as pd
import random
import re
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras.layers import SpatialDropout1D, Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib as plt


nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Ucitavanje i spajanje csv fajlova sa dodatim FAKE atributom da se razlikuje izvor

In [2]:
true_df = pd.read_csv('../datasets/fake-news/True.csv')
fake_df = pd.read_csv('../datasets/fake-news/Fake.csv')

true_df = true_df.assign(fake = [False for _ in true_df.iterrows()])
fake_df = fake_df.assign(fake = [True for _ in fake_df.iterrows()])

df = pd.concat([true_df, fake_df])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   fake     44898 non-null  bool  
dtypes: bool(1), object(4)
memory usage: 1.8+ MB


### Provera koliko postoji null vrednosti <b>subject</b>
#### Bitno nam je da se broj mali, jer ce igrati ulogu u daljoj klasifikaciji!

In [3]:
print(df.isnull().sum())

title      0
text       0
subject    0
date       0
fake       0
dtype: int64


# Obrada teksta

### Koliko je prljav tekst?

In [4]:
def print_plot(index):
    example = df[df.index == index][['text', 'fake']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Fake:', example[1])

def print_cell():
    index = random.randint(0, df.shape[0])
    print_plot(index)

In [7]:
def safe_print_cell():
    try:
        print_cell()
    except IndexError:
        print_cell()

safe_print_cell()

Fake: False


#### Tekst je relativno prljav. Konkretno, sadrzi karaktere poput [], () i sl.
1. Konvertujemo tekstove u lower case
2. Ukljanjamo stop reci
3. Izbacujemo numericke oznake iz teksta

In [59]:
df = df.reset_index(drop=True)

def clean_text(text: str) -> str:
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    STOPWORDS = set(stopwords)
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text


#### Cistimo tekst svih clanaka

In [9]:
df['text'] = df['text'].apply(clean_text)
df['text'] = df['text'].str.replace('\d+', '')


  


#### Provera nove verzije teksta

In [10]:
safe_print_cell()

vp mike pence pappas bbq houston pictwittercom occwnodkx edith gonzalez edithge_ february   watch crowd towards end cheering president bush barbara bush hear loud cheer vp pence https wwwyoutubecom watchvkfybtggi
Fake: True


# LSTM

#### Inicijalna podesavanja parametara i tokenizacija tekstova clanaka

In [11]:
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 209040 unique tokens.


#### Bitno je da svi inputi budu istih dimenzija, pa dodajemo padding

In [12]:
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (44898, 250)


#### Konvertovanje bool oznaka u numericke

In [13]:
Y = pd.get_dummies(df['fake']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (44898, 2)


#### Train test

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(35918, 250) (35918, 2)
(8980, 250) (8980, 2)


#### Kreiranje modela

In [17]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


#### Evaluacija modela

In [18]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.067
  Accuracy: 0.982


#### Serializacija modela

In [22]:
model.save('../models/fake_news')

INFO:tensorflow:Assets written to: ../models/fake_news\assets


INFO:tensorflow:Assets written to: ../models/fake_news\assets


# Testiranje na novom skupu podataka

In [69]:
new_df = pd.read_csv('../datasets/fake-news/news_articles.csv')
new_df = new_df.assign(fake = [True for _ in new_df.iterrows()])
new_df = new_df[['text', 'fake']]
new_true = pd.read_csv('../datasets/fake-news/articles1.csv')
new_true = new_true.head(len(new_df.index))
new_true = new_true.assign(fake = [False for _ in new_df.iterrows()])
new_true = new_true[['content', 'fake']]
new_true.rename(columns = {'content':'text'}, inplace = True)
new_true.info()

new_df = pd.concat([new_df, new_true])
new_df = new_df.dropna()

new_df = new_df.reset_index(drop=True)

new_df['text'] = new_df['text'].apply(clean_text)
new_df['text'] = new_df['text'].str.replace('\d+', '')

tokenizer_new = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer_new.fit_on_texts(new_df['text'].values)
new_word_index = tokenizer_new.word_index
print('Found %s unique tokens.' % len(new_word_index))

X_new = tokenizer_new.texts_to_sequences(new_df['text'].values)
X_new = pad_sequences(X_new, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_new.shape)

Y_new = pd.get_dummies(new_df['fake']).values
print('Shape of label tensor:', Y_new.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2096 entries, 0 to 2095
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2096 non-null   object
 1   fake    2096 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 18.5+ KB




Found 78353 unique tokens.
Shape of data tensor: (4146, 250)
Shape of label tensor: (4146, 2)


In [30]:
def print_plot(index):
    example = new_df[new_df.index == index][['text', 'fake']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Fake:', example[1])

def print_cell():
    index = random.randint(0, new_df.shape[0])
    print_plot(index)

print_cell()

Schools All Over America Are Closing On Election Day Due To Fears Of Violence By Michael Snyder, on October 27th, 2016 
Will this be the most chaotic election day in modern American history? All across the nation, schools are being closed on election day due to safety fears. Traditionally, schools have been very popular as voting locations because they can accommodate a lot of people, they usually have lots of parking, and everyone in the community knows where they are and can usually get to them fairly easily. But now there is a big movement to remove voting from schools or to shut schools down on election day so that children are not present when voting takes place. According to Fox News , “voting has been removed or classes have been canceled on Election Day at schools in Illinois, Maine, Nebraska, New Hampshire, Ohio, Pennsylvania, Wisconsin and elsewhere.” Just a couple days ago , I shared with you a survey that found that 51 percent of all Americans are concerned about violence h

#### Evaluacija

In [70]:
accr_new = model.evaluate(X_new,Y_new)
print('Validation set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr_new[0],accr_new[1]))

Test set
  Loss: 2.080
  Accuracy: 0.592
