In [None]:
from tensorflow.keras.callbacks import TensorBoard, CSVLogger
from  tensorflow.keras.preprocessing.text import text_to_word_sequence
from  tensorflow.keras.preprocessing import sequence
from  tensorflow.keras.preprocessing.text import Tokenizer
from  tensorflow.keras.models import Sequential
from  tensorflow.keras.layers import Dense,Flatten,LSTM,Conv1D,GlobalMaxPool1D,Dropout,Bidirectional
from  tensorflow.keras.layers import Embedding
from  tensorflow.keras import optimizers
from  tensorflow.keras.layers import Input
from  tensorflow.keras.models import Model
from  tensorflow.keras.utils import plot_model
from  tensorflow.keras.models import load_model
from tensorflow.keras.layers import BatchNormalization
import pandas as pd

In [None]:
#####

# Read in dataset and adjust it for two class setting

#####
columns = ['ID','label','text','subjects','speaker','job','state','party','barely_true_count',
           'false_count','half_true_count','mostly_true_count','pants_on_fire_count','location']


def _read_liar_split(path):
    """Read one headerless LIAR TSV split, name its columns and drop the ID column."""
    frame = pd.read_csv(path, sep='\t', header=None)
    frame.columns = columns
    return frame.drop(labels=['ID'], axis=1)


liar_df_train = _read_liar_split('liar_dataset/train.tsv')
liar_df_val = _read_liar_split('liar_dataset/valid.tsv')
liar_df_test = _read_liar_split('liar_dataset/test.tsv')
liar_list = [liar_df_train, liar_df_val, liar_df_test]

# Original six-way label mapping, kept for reference:
#truth=  {"pants-fire" : 0, "false" : 1, "barely-true" : 2, "half-true" : 3, "mostly-true" : 4, "true" : 5}
# Two-class setting: the three "less true" labels map to 0, the three "more true" labels to 1.
truth = {
    "pants-fire": 0,
    "false": 0,
    "barely-true": 0,
    "half-true": 1,
    "mostly-true": 1,
    "true": 1,
}

# Add the numeric binary target to every split; an unknown label raises KeyError.
for split_frame in liar_list:
    split_frame['numer_truth'] = split_frame['label'].apply(lambda label: truth[label])

In [None]:
# Class balance of the binary target in the training split (before augmentation).
liar_df_train['numer_truth'].value_counts()

In [None]:
####

# Define functions for data augmentation

####

import names
import en_core_web_sm
import numpy as np


# Load the en_core_web_sm pipeline once; the augmentation functions below call
# `nlp(text)` and read `.ents` / `.pos_` from the result.
nlp = en_core_web_sm.load()
# Pool of 50 replacement names used by change_person(); built once so every call
# draws from the same pool.
# NOTE(review): no random seed is set in the visible cells, so this pool and the
# np.random.choice draws below presumably differ between runs — confirm whether
# reproducibility is required.
replacement_names = [names.get_full_name() for _ in range(50)]


def change_person(x):
    """Swap one PERSON entity in the statement for a name from `replacement_names`.

    Args:
        x: Raw statement text.

    Returns:
        The statement with every occurrence of one randomly chosen PERSON
        entity text replaced by a randomly chosen replacement name, or None
        when the pipeline finds no PERSON entity.
    """
    parsed = nlp(x)

    found_people = []
    for entity in parsed.ents:
        if entity.label_ == "PERSON":
            found_people.append(entity.text)

    if not found_people:
        return None

    # Draw order (entity first, replacement second) is kept so the global
    # NumPy random stream is consumed exactly as before.
    target = np.random.choice(found_people)
    substitute = np.random.choice(replacement_names)
    return parsed.text.replace(target, substitute)


def swap_adjectives(x):
    """Swap two randomly chosen adjectives in the statement.

    The text is rebuilt token by token, re-using each position's original
    trailing whitespace. Joining slices with " " (as done previously) produced
    a leading space when the first adjective was the first token, a double
    space when the two adjectives were adjacent, a trailing space when the
    second adjective was the last token, and detached punctuation from the
    preceding word.

    Args:
        x: Raw statement text.

    Returns:
        The statement with the two adjectives exchanged, or None when the
        statement contains fewer than two tokens tagged "ADJ".
    """
    doc = nlp(x)
    adjective_idxs = [i for i, token in enumerate(doc) if token.pos_ == "ADJ"]
    if len(adjective_idxs) < 2:
        return None

    # Two distinct positions; order is irrelevant for a swap.
    idx1, idx2 = np.random.choice(adjective_idxs, 2, replace=False)

    words = [token.text for token in doc]
    words[idx1], words[idx2] = words[idx2], words[idx1]

    # Whitespace stays with the position, not with the word, so the sentence
    # layout (including punctuation attachment) is unchanged.
    return "".join(word + token.whitespace_ for word, token in zip(words, doc))
    
import nltk
from nltk.corpus import wordnet as wn

# Fetch the WordNet corpus that get_synonym() queries through `wn.synsets`.
nltk.download("wordnet")

def replace_adjective_with_synonym(x):
    """Replace one randomly chosen adjective with its WordNet synonym.

    Args:
        x: Raw statement text.

    Returns:
        The rewritten statement, or None when the statement has no token
        tagged "ADJ" or `get_synonym` yields nothing for the chosen adjective.
    """
    parsed = nlp(x)

    # Positions of all adjective tokens.
    candidates = []
    for position, token in enumerate(parsed.doc):
        if token.pos_ == "ADJ":
            candidates.append(position)
    if not candidates:
        return None

    chosen = np.random.choice(candidates)
    substitute = get_synonym(parsed.doc[chosen].text, pos="a")
    if not substitute:
        return None
    return replace_token(parsed.doc, chosen, substitute)

        
##### Noun-synonym replacement was ultimately not used in this project; kept commented out for reference

#def replace_noun_with_synonym(x):
#    # Get indices of noun tokens in sentence.
#    x=nlp(x)
#    noun_idxs = [i for i, token in enumerate(x.doc) if token.pos_ == "NOUN"]
#    if noun_idxs:
#        # Pick random noun idx to replace.
#        idx = np.random.choice(noun_idxs)
#        synonym = get_synonym(x.doc[idx].text, pos="n")
#        # If there's a valid noun synonym, replace it. Otherwise, return None.
#        if synonym:
#            #x.text = replace_token(x.doc, idx, synonym)
#            return replace_token(x.doc, idx, synonym)

def replace_verb_with_synonym(x):
    """Replace one randomly chosen verb with its WordNet synonym.

    Args:
        x: Raw statement text.

    Returns:
        The rewritten statement, or None when the statement has no token
        tagged "VERB" or `get_synonym` yields nothing for the chosen verb.
    """
    parsed = nlp(x)

    # Positions of all verb tokens.
    verb_positions = [
        position
        for position, token in enumerate(parsed.doc)
        if token.pos_ == "VERB"
    ]
    if len(verb_positions) == 0:
        return None

    chosen = np.random.choice(verb_positions)
    substitute = get_synonym(parsed.doc[chosen].text, pos="v")
    return replace_token(parsed.doc, chosen, substitute) if substitute else None


def get_synonym(word, pos=None):
    """Get a synonym for `word` given its part-of-speech (`pos`).

    Only the first WordNet synset for the word is consulted. Its lemmas are
    scanned in order and the first one that differs from `word`
    (case-insensitively) is returned. Previously only the first lemma was
    checked, so a word that is itself the first lemma of its synset
    (e.g. "large" in [large, big]) produced no synonym even though one was
    available; results for all other words are unchanged.

    Args:
        word: Surface form to look up.
        pos: WordNet part-of-speech tag passed to `wn.synsets`
            (the callers in this notebook use "a" and "v"); None means any.

    Returns:
        The synonym with "_" replaced by a space, or None when WordNet has no
        synset for this word/pos or the first synset only contains the word
        itself.
    """
    synsets = wn.synsets(word, pos=pos)
    # No synsets (synonym sets) for this word and pos.
    if not synsets:
        return None

    lowered = word.lower()
    for lemma in synsets[0].lemmas():
        candidate = lemma.name()
        if candidate.lower() != lowered:  # Skip lemmas identical to the word.
            # Multi-word lemmas use '_' as a separator, e.g. reckon_with.
            return candidate.replace("_", " ")
    return None

def replace_token(spacy_doc, idx, replacement):
    """Replace the token at position `idx` with `replacement`.

    The text is rebuilt token by token so the original spacing is preserved:
    the replacement inherits the trailing whitespace of the token it replaces.
    Joining the surrounding slices with " " (as done previously) added a
    leading space for idx == 0, a trailing space for the last token, and a
    space before punctuation that directly followed the replaced token.

    Args:
        spacy_doc: Iterable of tokens exposing `text_with_ws` and `whitespace_`
            (a spaCy Doc in this notebook).
        idx: Position of the token to replace.
        replacement: Text to insert in place of that token.

    Returns:
        The rebuilt text as a string.
    """
    pieces = [
        replacement + token.whitespace_ if position == idx else token.text_with_ws
        for position, token in enumerate(spacy_doc)
    ]
    return "".join(pieces)

In [None]:
#### Apply data augmentation techniques to training set

# One new column per augmenter. Each augmenter returns the rewritten statement,
# or None when it does not apply to that statement. The order is kept as-is
# because the augmenters draw from the shared NumPy random stream.
augmentation_columns = [
    ('adjectives', swap_adjectives),
    ('change_person', change_person),
    ('replace_verb_with_synonym', replace_verb_with_synonym),
    ('replace_adjective_with_synonym', replace_adjective_with_synonym),
]
for column_name, augment in augmentation_columns:
    liar_df_train[column_name] = liar_df_train['text'].apply(augment)

In [None]:
#### Collect the augmented rows in temporary DataFrames

temp = liar_df_train.copy()


def _augmented_rows(frame, column):
    """Rows of `frame` where `column` is non-null, as a fresh ('text', 'numer_truth') frame."""
    selected = frame.loc[frame[column].notna(), [column, 'numer_truth']]
    return selected.reset_index(drop=True).rename(columns={column: 'text'})


# One frame per augmentation; the augmented statement becomes the 'text' column.
temp1 = _augmented_rows(temp, 'adjectives')
temp2 = _augmented_rows(temp, 'change_person')
temp3 = _augmented_rows(temp, 'replace_verb_with_synonym')
temp4 = _augmented_rows(temp, 'replace_adjective_with_synonym')

# Total number of augmented statements produced (displayed as the cell output).
sum(len(part) for part in (temp1, temp2, temp3, temp4))

In [None]:
#### Merge the original training set with the augmented rows from the temporary DataFrames

# Stack the four augmented frames, then put them in front of the original rows.
# NOTE(review): index labels are not reset, so the result has duplicate index
# values, and re-running this cell appends the augmented rows again because
# `liar_df_train` is overwritten — run it once per kernel session.
augmented_parts = [temp1, temp2, temp3, temp4]
temp = pd.concat(augmented_parts)

liar_df_train = pd.concat([temp, liar_df_train])

# Class balance after augmentation.
liar_df_train['numer_truth'].value_counts()

In [None]:
#### Tokenize training data and create dictionary

def load_statement_vocab_dict(train_data):
    """Fit a Keras Tokenizer on the statements and return its word index.

    Args:
        train_data: Mapping / DataFrame whose 'text' entry holds the statements.

    Returns:
        The tokenizer's `word_index` dictionary (word -> integer id). Its size
        is printed as a side effect.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(train_data['text'])
    word_to_index = fitted.word_index
    print(len(word_to_index))
    return word_to_index

# Build the word -> id vocabulary from the training frame's 'text' column;
# preprocess_statement() reads this module-level dict.
vocabulary_dict = load_statement_vocab_dict(liar_df_train)

In [None]:
#### Define function to remove stopwords and prepare text sequences

def preprocess_statement(statement):
    """Convert a statement into a list of vocabulary ids with English stopwords removed.

    The statement is tokenised first with `text_to_word_sequence` and the
    stopwords are dropped afterwards. Previously the filter ran on the raw
    space-split text, so capitalised stopwords ("The") and stopwords with
    attached punctuation ("it,") were never removed, and the stopword list was
    re-read for every single word.

    Args:
        statement: Raw statement text.

    Returns:
        List of integer ids from the module-level `vocabulary_dict`, in token
        order. Stopwords and tokens missing from the vocabulary are skipped,
        so the list may be empty.
    """
    # Local import kept from the original cell so the function stays self-contained.
    from nltk.corpus import stopwords

    # Build the lookup once per call; set membership is O(1) per token.
    stop_words = set(stopwords.words('english'))

    tokens = text_to_word_sequence(statement)
    return [
        vocabulary_dict[token]
        for token in tokens
        if token not in stop_words and token in vocabulary_dict
    ]

In [None]:
#### Apply preprocessing function to all statements

# Encode every split the same way: 'text' -> list of vocabulary ids in 'word_id'.
for split_frame in (liar_df_train, liar_df_val, liar_df_test):
    split_frame['word_id'] = split_frame['text'].apply(preprocess_statement)

In [None]:
#### Save resulting datasets in pickle files

import pickle

# Persist the three splits. Each file is opened in a `with` block so the handle
# is flushed and closed deterministically (the bare `open(...)` calls leaked it).
# File names are unchanged because other code presumably loads these exact paths.
for frame_to_save, pickle_path in (
    (liar_df_val, 'augmemted_val.pkl'),
    (liar_df_test, 'augmemted_test.pkl'),
    (liar_df_train, 'augmemted_train2.pkl'),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(frame_to_save, handle)