In [12]:
import os
import sys 

# Put the parent of the current working directory on the import path so that
# the pipeline module can be imported from this notebook.
parent_directory = os.path.dirname(os.path.abspath(os.getcwd()))
# Guarded so that re-running this cell does not append a duplicate entry.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pipeline import clean_text

In [14]:
# Read the title/text/label news dataset from the working directory and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer="fake_and_real_news_dataset.csv")
df.head(n=5)

Unnamed: 0,idd,title,text,label
0,Fq+C96tcx+,‘A target on Roe v. Wade ’: Oklahoma bill maki...,UPDATE: Gov. Fallin vetoed the bill on Friday....,REAL
1,bHUqK!pgmv,Study: women had to drive 4 times farther afte...,Ever since Texas laws closed about half of the...,REAL
2,4Y4Ubf%aTi,"Trump, Clinton clash in dueling DC speeches","Donald Trump and Hillary Clinton, now at the s...",REAL
3,_CoY89SJ@K,Grand jury in Texas indicts activists behind P...,A Houston grand jury investigating criminal al...,REAL
4,+rJHoRQVLe,"As Reproductive Rights Hang In The Balance, De...",WASHINGTON -- Forty-three years after the Supr...,REAL


In [15]:
# Remove every row that has a missing value in any column.
# This modifies `df` in place, so the frame shown by the loading cell is stale
# once this cell has run.
df.dropna(axis="index", inplace=True)
df.head(n=5)

Unnamed: 0,idd,title,text,label
0,Fq+C96tcx+,‘A target on Roe v. Wade ’: Oklahoma bill maki...,UPDATE: Gov. Fallin vetoed the bill on Friday....,REAL
1,bHUqK!pgmv,Study: women had to drive 4 times farther afte...,Ever since Texas laws closed about half of the...,REAL
2,4Y4Ubf%aTi,"Trump, Clinton clash in dueling DC speeches","Donald Trump and Hillary Clinton, now at the s...",REAL
3,_CoY89SJ@K,Grand jury in Texas indicts activists behind P...,A Houston grand jury investigating criminal al...,REAL
4,+rJHoRQVLe,"As Reproductive Rights Hang In The Balance, De...",WASHINGTON -- Forty-three years after the Supr...,REAL


In [16]:
# Model input: the title and the article text joined by a single "." with no
# surrounding whitespace.
x = df["title"].add(".").add(df["text"])

# Binary target: 1 where the label is exactly "REAL", 0 for any other value.
y = df["label"].apply(lambda label: int(label == "REAL"))

In [17]:
# Hold out 40% of the rows; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=42,
)

# NOTE: despite its name, Lstm_train is built from the *test* split
# (x_test after clean_text has been applied to every document).
Lstm_train = x_test.apply(clean_text)

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Number of whitespace-separated words in each raw document of x.
full_text_length = x.apply(lambda document: len(document.split()))

# Word count of the longest document.
# NOTE(review): not used in this cell -- padding below uses a fixed maxlen of
# 10000 rather than this value.
max_length = full_text_length.max()

# Tokenizer limited via num_words=10000; words outside that limit are mapped
# to the "<OOV>" token.
# NOTE(review): the tokenizer is fitted on the evaluation texts themselves.
# Presumably the saved models were trained with their own word index -- confirm
# that the integer ids produced here match the ones used at training time.
token = Tokenizer(oov_token="<OOV>", num_words=10000)
token.fit_on_texts(Lstm_train)
word_index = token.word_index

# Text processing: turn every document into a list of integer word ids.
training_seq = token.texts_to_sequences(Lstm_train)

# Padding: zeros are appended after the content ('post') so that every row has
# exactly 10000 entries.
train_padded = pad_sequences(training_seq, padding="post", maxlen=10000)

In [19]:
def addPath(folder_name, list):
    """Build full paths for files stored in a sibling folder of the working directory.

    Args:
        folder_name: Name of a folder located in the parent of the current
            working directory.
        list: Iterable of file names inside that folder. (The parameter name
            shadows the builtin ``list``; it is kept for caller compatibility.)

    Returns:
        A new list with one path per entry of ``list``, in the same order.
        The paths are only constructed, not checked for existence.
    """
    base_dir = os.path.join(
        os.path.dirname(os.path.abspath(os.getcwd())),
        folder_name,
    )
    resolved = []
    for file_name in list:
        resolved.append(os.path.join(base_dir, file_name))
    return resolved

In [25]:
import keras

def train_LSTM(x, y):
    """Evaluate the four saved LSTM models on (x, y) and print a score for each.

    Despite the name and the "Training for" message, no fitting happens here:
    each model is loaded from the "LSTM models" folder and only ``evaluate`` is
    called on it.

    Args:
        x: Model inputs passed straight to ``evaluate``.
        y: Targets passed straight to ``evaluate``.

    Prints, per model, the mean over five ``evaluate`` calls of the second
    entry of the returned results (presumably the accuracy metric -- this
    depends on how the saved model was compiled).
    """
    # Display label -> file name; insertion order drives the evaluation order.
    model_files = {
        "ww": "LSTM_ww_merge.keras",
        "wordNet": "LSTM_WordNet_merge.keras",
        "word2Vec": "LSTM_Word2Vec_merge.keras",
        "unbalanced": "LSTM_unbalnced_merge.keras",
    }

    model_paths = addPath("LSTM models", list(model_files.values()))

    # All four models are loaded up front, before any evaluation starts.
    loaded = [keras.saving.load_model(path) for path in model_paths]

    for variant, model in zip(model_files, loaded):
        print(f"Training for: {variant}")
        accuracy_sum = 0
        for _ in range(5):
            accuracy_sum += model.evaluate(x, y, verbose=0)[1]
        print(f"Results :{accuracy_sum /5}")
        print("-"*20)

In [26]:
def train_bert(x, y):
    """Evaluate the four saved BERT models on (x, y) and print a score for each.

    Despite the name and the "Training for" message, nothing is fitted: the
    models are loaded from the "BERT models" folder and only ``evaluate`` is
    called on them.

    Args:
        x: Model inputs passed straight to ``evaluate``.
        y: Targets passed straight to ``evaluate``.

    Prints, per model, the mean over five ``evaluate`` calls of the second
    entry of the returned results (presumably the accuracy metric -- this
    depends on how the saved model was compiled).
    """
    # (display label, file name) pairs, in evaluation order.
    variants = (
        ("ww", "bert_ww_merge.keras"),
        ("wordNet", "bert_WordNet_merge.keras"),
        ("word2Vec", "bert_Word2Vec_merge.keras"),
        ("unbalanced", "bert_unbalnced_merge.keras"),
    )
    paths = addPath("BERT models", [file_name for _, file_name in variants])

    # All four models are loaded up front, before any evaluation starts.
    berts = [keras.saving.load_model(path) for path in paths]

    for (label, _), model in zip(variants, berts):
        print(f"Training for: {label}")
        scores = [model.evaluate(x, y, verbose=0)[1] for _ in range(5)]
        print(f"Results :{sum(scores) /5}")
        print("-"*20)

In [27]:
# Score the saved LSTM models. train_padded is derived from x_test
# (cleaned, tokenised, padded), so its rows line up with y_test.
print("Testing LSTM")
train_LSTM(x=train_padded, y=y_test)

Testing LSTM
Training for: ww
Results :0.5402611494064331
--------------------
Training for: wordNet
Results :0.5331882238388062
--------------------
Training for: word2Vec
Results :0.5239390730857849
--------------------
Training for: unbalanced
Results :0.5315560102462769
--------------------


In [28]:
# Score the saved BERT models on the raw (not clean_text-processed) test documents.
print("Testing BERT")
train_bert(x=x_test, y=y_test)

Testing BERT
Training for: ww
Results :0.4630032777786255
--------------------
Training for: wordNet
Results :0.5152339339256287
--------------------
Training for: word2Vec
Results :0.44124048948287964
--------------------
Training for: unbalanced
Results :0.4080522358417511
--------------------


In [29]:
# Baseline: evaluate the preset BERT classifier without any fitting in this notebook
import keras_nlp

def bert_base(x, y):
    """Print the mean score of the "bert_tiny_en_uncased" preset classifier on (x, y).

    No fitting is done here: on each of five rounds a new two-class
    ``BertClassifier`` is created from the preset and ``evaluate`` is called on
    it. The second entry of each result (shown as sparse_categorical_accuracy
    in the evaluation log) is averaged and printed.

    Args:
        x: Inputs passed straight to ``evaluate``.
        y: Targets passed straight to ``evaluate``.
    """
    print("training Bert Base")
    scores = []
    for _ in range(5):
        # A new classifier is built on every round.
        # NOTE(review): presumably the classification head is freshly
        # initialised each time, which is why rounds can differ -- confirm.
        classifier = keras_nlp.models.BertClassifier.from_preset(
            "bert_tiny_en_uncased",
            num_classes=2,
        )
        scores.append(classifier.evaluate(x, y, verbose=0)[1])

    print(f"BERT Base: {sum(scores)/5}")

# Baseline score on the same raw test documents that train_bert was given.
bert_base(x=x_test, y=y_test)

training Bert Base
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 118ms/step - loss: 0.6791 - sparse_categorical_accuracy: 0.5191
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 117ms/step - loss: 0.6938 - sparse_categorical_accuracy: 0.5172
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 122ms/step - loss: 0.7008 - sparse_categorical_accuracy: 0.4866
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 118ms/step - loss: 0.6999 - sparse_categorical_accuracy: 0.5221
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 122ms/step - loss: 0.6958 - sparse_categorical_accuracy: 0.4720
BERT Base: 0.5006528675556183
