### Import Dependencies

In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import keras
from keras.models import Model
import keras.backend as K
from keras.models import load_model
from tensorflow.keras.layers import Dropout, Input,Dense
from tensorflow.keras import regularizers
from transformers import *
from transformers import BertTokenizer, TFBertForSequenceClassification, DistilBertConfig,DistilBertModel
import numpy as np

import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import tensorflow as tf

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/julius_riel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [88]:
DistiledBertTokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
DistiledBert = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


### Loading Data

In [89]:
real = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

#### Concatonation of data in real.csv and fake.csv, with addition of labels

In [90]:
fake["labels"] = 1
real["labels"] = 0

Dataset = [fake,real]
Dataset = pd.concat(Dataset)

#### Splitting of data into features (x) and labels (y)

In [91]:
x = Dataset.drop("labels", axis = 1)
y = Dataset.labels

#### Adding of title to the body of text

In [92]:
x["text"] = x["title"] + " " + x["text"]
x = x.drop("title", axis = 1)

#### Removal of HTML

In [93]:
def remove_tags(string):
    result = BeautifulSoup(string, "html.parser")#re.sub('<.*?>','',string)
    result = result.get_text()
    return result

#### Removal of special characters & punctuation

In [94]:
def remove_special_characters(string):
    result = re.sub('\[[^]]*\]', ' ', string)
    result = re.sub('[^a-zA-Z]',' ', string)
    return result

#### Tokenization of text

In [95]:
def tokenize(string,max_length):
    bert_inp=DistiledBertTokenizer.encode_plus(string,add_special_tokens = True,max_length =max_length,pad_to_max_length = True,return_attention_mask = True)
    input_ids = (bert_inp['input_ids'])
    attentionmasks = (bert_inp['attention_mask'])
    
    return input_ids, attentionmasks

#### Removal of Stopwords

In [96]:
def remove_stopwords(tokenized_text):
    stop_words = set(stopwords.words('english'))
    filtered = []
    for i in tokenized_text:
        if i not in stop_words:
            filtered.append(i)
    return filtered

#### Applying Data preperation proccesses

In [97]:
x["text"] = x["text"].apply(lambda x: "'" + x + "'")
x["text"] = x['text'].str.lower()
x['text'] = x['text'].apply(lambda f : remove_tags(f))
x['text'] = x['text'].apply(lambda f : remove_special_characters(f))
#x['text'] = x.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
#x['text'] = x['text'].apply(lambda f: remove_stopwords(f))
input_id = []
attention_mask = []
max_length = 512
for i in x['text']:
    inpt, attention = tokenize(i,max_length)
    input_id.append(inpt)
    attention_mask.append(attention)

input_id = np.asarray(input_id)
attention_mask = np.array(attention_mask)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


#### Splitting dataset into train and test

In [100]:
id_train,id_test,y_train,y_test,attentionmask_train,attentionmask_test=train_test_split(input_id,y,attention_mask,test_size=0.2)

### Modeling

#### Adding a simple Neural Network to use the outputted to classify

In [103]:
ind = Input(shape = (max_length,), dtype='int64')
attmasks = Input(shape = (max_length,), dtype='int64')
number_of_classes = 2

DistiledBERTLayer = DistiledBert(ind, attention_mask=attmasks)[0][:,0,:]
dense_layer = Dense(512,activation='relu',kernel_regularizer=regularizers.l2(0.01))(DistiledBERTLayer)
dropout_layer= Dropout(0.5)(dense_layer)
Output = Dense(number_of_classes, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout_layer)
model = tf.keras.Model(inputs=[ind,attmasks], outputs=Output)



#### Compiling model

In [105]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

#### Training

In [109]:
hist=model.fit([id_train,attentionmask_train],y_train,batch_size=32,epochs=1, validation_split = 0.1)



In [None]:
predictions = model.predict([id_test, attentionmask_test])