In [1]:

# Core
import re

#Data Manipulation 
import pandas as pd

# Preprocessing Tools
from sklearn.pipeline import Pipeline


# NLP Tools
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords



# Neural Networks
import tensorflow as tf
import keras



# Ignore any warnings
import warnings;
warnings.filterwarnings('ignore')

In [2]:
from transformers import BertTokenizer, TFBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer

In [3]:

import pandas as pd

col_names = [
        'id',               # Column 1: the ID of the statement ([ID].json).
        'label',            # Column 2: the label.
        'statement',        # Column 3: the statement.
        'subjects',         # Column 4: the subject(s).
        'speaker',          # Column 5: the speaker.
        'speaker_job_title', # Column 6: the speaker's job title.
        'state_info',       # Column 7: the state info.
        'party_affiliation', # Column 8: the party affiliation.
        
        'barely_true', # barely true counts.
        'false', # false counts.
        'half_true', # half true counts.
        'mostly_true', # mostly true counts.
        'pants_on_fire', # pants on fire counts.
        
        'context' # Column 14: the context (venue / location of the speech or statement).
    ]

def read_df(tsv_file: str) -> pd.DataFrame:
    df = pd.read_csv(tsv_file, delimiter='\t', dtype=object)
    # replaces all "null" or "NaN" values with an empty string
    df.fillna("", inplace=True)
    # labels the columns in the dataset using the data dictionary described in the README
    df.columns = col_names
    df = df.dropna(subset=["label", "statement"])
    df.drop(["id"], axis=1, inplace=True)
    return df

train_df = read_df('datasets/train.tsv')
test_df = read_df('datasets/test.tsv')
valid_df = read_df('datasets/valid.tsv')


In [4]:

def reset_index(df):
    df = df.reset_index(drop=True)
    return df

def drop_na(df):
    df = df.dropna(subset=['input'], axis=0)
    return df

def drop_duplicated(df):
    df = df.drop_duplicates()
    return df


def label(df):
    df['output'] = df['label'].map({'true': 1,
                                     'mostly-true': 1,
                                     'half-true': 1,
                                     'false': 0,
                                     'barely-true': 0,
                                     'pants-fire': 0}).astype(int)
    return df


def clean_text(text):
    if not isinstance(text, str):
        return ''
    processed_text = text.lower()
    
    processed_text=re.sub(re.compile('[/(){}\[\]\|@,;]'),' ',processed_text)
    processed_text=re.sub(re.compile('[^0-9a-z #+_]'),' ',processed_text)
    
     # Tokenization
    words = word_tokenize(processed_text)
    
    # Lemmatize and stem each word
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed_words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words]
    
    # Stop word removal
    stop_words = set(stopwords.words('english'))
    stop_words.update(['say', 'percent', 'state', 'year', 'said', 'people', 'one'])
    filtered_words = [word for word in stemmed_words if word.lower() not in stop_words]
    # Combine words back into a sentence
    processed_text = ' '.join(filtered_words)
    
    return processed_text

def clean_df(df):
    df['input'] = df['input'].apply(lambda x: clean_text(x))
    return df


In [5]:
cleaning_pipeline = Pipeline(steps=[
    ('drop_na', FunctionTransformer(drop_na)),
    ('drop_duplicated', FunctionTransformer(drop_duplicated)),
    ('label', FunctionTransformer(label)),
    ('clean', FunctionTransformer(clean_df)),
    ('reset_index', FunctionTransformer(reset_index))
])

In [6]:
x_features = ["statement", "subjects", "speaker", "context", "speaker_job_title", "state_info", 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire']
# x_features = ["statement"]
y_feature = ["output"]


train_df["input"] = train_df[x_features].apply(lambda row: ' '.join(map(str, row)), axis=1)
test_df["input"] = test_df[x_features].apply(lambda row: ' '.join(map(str, row)), axis=1)
valid_df["input"] = valid_df[x_features].apply(lambda row: ' '.join(map(str, row)), axis=1)

# Clean all datasets
train_df = cleaning_pipeline.fit_transform(train_df)
test_df = cleaning_pipeline.transform(test_df)
valid_df = cleaning_pipeline.transform(valid_df)


train_clean_df = train_df[["input", "output"]]
test_clean_df = test_df[["input", "output"]]
valid_clean_df = valid_df[["input", "output"]]


In [7]:
pd.set_option('display.max_colwidth', 500)
train_clean_df.head()

Unnamed: 0,input,output
0,declin coal start start natur ga took start begin presid georg w bush administr energi histori job accomplish scott surovel floor speech deleg virginia 0 0 1 1 0,1
1,hillari clinton agre john mccain vote give georg bush benefit doubt iran foreign polici barack obama denver presid illinoi 70 71 160 163 9,1
2,health care reform legisl like mandat free sex chang surgeri health care blog post news releas 7 19 3 5 44,0
3,econom turnaround start end term economi job charli crist interview cnn florida 15 9 20 19 2,1
4,chicago bear start quarterback last 10 total number tenur uw faculti fire dure last two decad educ robin vo onlin opinion piec wisconsin assembl speaker wisconsin 0 3 2 5 1,1


In [None]:
# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)


In [9]:
# Tokenize and encode the training and validation sets
train_encodings = tokenizer(train_clean_df["input"].tolist(), truncation=True, padding=True, return_tensors="tf")
val_encodings = tokenizer(valid_clean_df["input"].tolist(), truncation=True, padding=True, return_tensors="tf")


In [10]:
# Convert labels to TensorFlow tensors
train_labels = tf.convert_to_tensor(train_clean_df["output"])
val_labels = tf.convert_to_tensor(valid_clean_df["output"])

In [11]:
# Define the model
input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="attention_mask")


In [12]:
bert_output = model(input_ids, attention_mask=attention_mask)
output = tf.keras.layers.Dense(2, activation="softmax")(bert_output.logits)


In [13]:
tf_model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)


In [None]:
# Compile the model
tf_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.3), loss="sparse_categorical_crossentropy", metrics=["accuracy"])


In [None]:
# Train the model
history = tf_model.fit(
    {"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"]},
    train_labels,
    validation_data=({"input_ids": val_encodings["input_ids"], "attention_mask": val_encodings["attention_mask"]}, val_labels),
    epochs=3,
    batch_size=512,
)


In [None]:
# Evaluate the model on the validation set
val_predictions = tf_model.predict({"input_ids": val_encodings["input_ids"], "attention_mask": val_encodings["attention_mask"]})
val_predictions_labels = tf.argmax(val_predictions, axis=1).numpy()

val_accuracy = accuracy_score(val_labels, val_predictions_labels)
print("Validation Accuracy:", val_accuracy)