In [1]:
import pandas as pd
import numpy as np
import re

In [3]:
# NOTE(review): machine-specific absolute path; point this at your local copy of the CSV.
DATA_PATH = 'C:/Users/LShel/Downloads/archive (3)/Combined_News_DJIA.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Every column except the date and the target label is treated as a headline column.
cols = [name for name in df.columns if name not in ('Date', 'Label')]

In [5]:
def filter_strings(string):
    """Reduce a raw headline to single-space-separated alphanumeric words.

    Anything that is not a ``str`` yields ``''``. For strings the following
    substitutions run in order: periods are deleted, every other
    non-alphanumeric character becomes a space, a stand-alone ``b`` token is
    deleted (presumably a leftover ``b'...'`` bytes-literal prefix -- confirm
    against the data), and runs of whitespace collapse to one space. The
    result is stripped of leading/trailing whitespace.
    """
    if not isinstance(string, str):
        return ''

    # (pattern, replacement) pairs; order matters because each step feeds the next.
    substitutions = (
        (r"[.]", ''),
        (r"[^a-zA-Z0-9]", ' '),
        (r'\bb\b', ''),
        (r'\s+', ' '),
    )

    text = string
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [6]:
# One list per DataFrame row holding that row's cleaned headlines, in `cols` order.
sents = [
    [filter_strings(row[c]) for c in cols]
    for _, row in df.iterrows()
]

In [None]:
import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")

sents_filt = []

# For each row keep only the headlines in which spaCy tags at least one
# entity labelled ORG, GPE or NORP. Rows keep their position, so a row whose
# headlines are all rejected contributes an empty list.
for row_headlines in tqdm(sents):
    kept = []
    for headline in row_headlines:
        if not isinstance(headline, str):
            continue
        found_labels = {ent.label_ for ent in nlp(headline).ents}
        if found_labels & {'ORG', 'GPE', 'NORP'}:
            kept.append(headline)
    sents_filt.append(kept)

In [8]:
# Merge each row's kept headlines into one space-separated document
# (an empty row becomes '').
#
# The loop variable is deliberately NOT called `sents`: that name holds the
# full list of cleaned headlines built earlier in the notebook, and rebinding
# it here would leave it pointing at a single row, breaking any re-run of the
# cells that read it.
text_comb = [' '.join(row_sents) for row_sents in sents_filt]

In [9]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Whitespace-tokenise every document, drop tokens whose lower-cased form is an
# English stop word, and glue the survivors back together with single spaces.
text_filtered = [
    ' '.join(word for word in document.split() if word.lower() not in stop_words)
    for document in text_comb
]

In [10]:
# Target values, taken from the `Label` column in row order.
labels = df['Label'].values

In [11]:
#----------#

In [12]:
from transformers import BertTokenizer

In [13]:
# `do_lower_case` is the option name BertTokenizer documents; the previous
# spelling (`do_lower_cased`) is not a documented tokenizer option.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [14]:
# Token-id sequence (special tokens included) for every document, in order.
input_ids = [
    tokenizer.encode(document, add_special_tokens=True)
    for document in text_filtered
]

In [None]:
# Longest encoded sequence; useful for judging how much MAX_LEN will truncate.
print('Max Sentence length', max(len(ids) for ids in input_ids))

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
MAX_LEN = 256

print('padding/truncating all sentences to %d values' % MAX_LEN)
print('padding token:"{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Force every sequence to exactly MAX_LEN ids: longer ones are cut at the end,
# shorter ones are filled at the end with id 0.
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype='long',
    value=0,
    padding='post',
    truncating='post',
)

print('Done')

In [18]:
# Per-token mask with the same layout as `input_ids`: 1 where the id is
# greater than 0, otherwise 0.
attention_mask = [
    [int(token_id > 0) for token_id in row]
    for row in input_ids
]

In [19]:
#----------#

In [None]:
from sklearn.model_selection import train_test_split

# Split inputs, masks and labels together in ONE call per stage so the three
# arrays are guaranteed to stay row-aligned. Previously the masks were split
# in separate calls that only matched because the same random_state was
# repeated, and the validation-stage call passed `train_mask` twice.

# Stage 1: hold out 10% of all rows as the test set.
(train_input, test_input,
 train_mask, test_mask,
 train_labels, test_labels) = train_test_split(
    input_ids, attention_mask, labels, random_state=42, test_size=0.1)

# Stage 2: hold out 10% of the remaining rows as the validation set.
(train_input, validation_input,
 train_mask, validation_mask,
 train_labels, validation_labels) = train_test_split(
    train_input, train_mask, train_labels, random_state=42, test_size=0.1)

In [None]:
# Report the shape of every array in each split. Masks go through np.array
# before `.shape` is read, exactly as the inputs/labels expose it directly.
for heading, split_input, split_labels, split_mask in (
    ('---Train---', train_input, train_labels, train_mask),
    ('---Validation---', validation_input, validation_labels, validation_mask),
    ('---Test---', test_input, test_labels, test_mask),
):
    print(heading)
    print('input: ', split_input.shape)
    print('label: ', split_labels.shape)
    print('mask: ', np.array(split_mask).shape)

In [None]:
import torch

# Turn every split's inputs, labels and masks into torch tensors, rebinding
# the same names.
train_input, train_labels, train_mask = map(
    torch.tensor, (train_input, train_labels, train_mask))

validation_input, validation_labels, validation_mask = map(
    torch.tensor, (validation_input, validation_labels, validation_mask))

test_input, test_labels, test_mask = map(
    torch.tensor, (test_input, test_labels, test_mask))

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 8

# Training batches are drawn in random order.
train_data = TensorDataset(train_input, train_mask, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Evaluation does not benefit from shuffling: iterate validation and test data
# in a fixed order so repeated evaluations see identical batches and do not
# draw from the random number generator.
validation_data = TensorDataset(validation_input, validation_mask, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input, test_mask, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Pretrained BERT with a 2-class classification head; attentions and hidden
# states are not returned, and hidden dropout is set to 0.2.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
    hidden_dropout_prob=0.2,
)

In [None]:
params = list(model.named_parameters())

print("The BERT model has {:} different named parameters.".format(len(params)))

# Print name and shape for three slices of the parameter list:
# the first 5 entries, entries 5..20, and the last 4.
for heading, section in (
    ("==== Embedding Layer ====", params[0:5]),
    ("==== First Transformers ====", params[5:21]),
    ("==== Output Layer ====", params[-4:]),
):
    print(heading)
    for param_name, param_tensor in section:
        print("{:<60} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

In [None]:
# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated in favour of it. weight_decay is pinned to 0.0 because
# torch.optim.AdamW would otherwise default to 0.01, whereas the transformers
# class defaulted to 0.0.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 5

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps, with no warm-up phase.
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max column equals the label.

    `preds` is indexed along axis 1 to pick the highest-scoring column per
    row; `labels` is flattened before the element-wise comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (int or float). It is rounded to the
            nearest whole second before formatting.

    Returns:
        ``str(datetime.timedelta(seconds=...))`` for the rounded duration,
        e.g. ``'0:01:00'`` for 59.6 seconds.
    """
    # Imported here so the helper does not depend on a module-level
    # `datetime` import that the notebook only performs in a later cell.
    import datetime

    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
# Pick the compute device: CUDA when torch reports it as available, else CPU.
if not torch.cuda.is_available():
    print("No GPU available, using the CPU instead")
    device = torch.device("cpu")
else:
    device = torch.device('cuda')

    print('there are %d GPU(s) available.' % torch.cuda.device_count())

    print('we will use the GPU: ', torch.cuda.get_device_name(0))

In [None]:
import time
import random
import datetime

seed_val = 42

# Seed the Python, NumPy and PyTorch (CPU + all CUDA devices) generators.
# NOTE(review): the model was instantiated in an earlier cell, i.e. before
# these seeds are set -- presumably any random initialisation done there is
# not covered by this seed; confirm if exact reproducibility matters.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, in epoch order.
loss_values = []

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

model.to(device)

for epoch_i in range(0, epochs):

    # ===================================
    #              Training
    # ===================================

    print("======= Epoch {:} / {:} =======".format(epoch_i+1, epochs))
    print("Training...")

    t0 = time.time()

    # Sum of the per-batch losses for this epoch.
    total_loss = 0

    model.train()

    # For each batch of training data
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches (skipping the very first one)
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            print("Batch {:>5,} of {:>5,}.     Elapsed: {:}".format(step, len(train_dataloader), elapsed))

        # Each batch is (input ids, attention mask, labels), matching the
        # order the TensorDataset was built with.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device).long()

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Labels are passed in, so the first element of the output is the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs[0]

        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)

    loss_values.append(avg_train_loss)

    print("   Average training loss: {0:.2f}".format(avg_train_loss))
    print("   Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ===================================
    #             Validation
    # ===================================

    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Accuracy is accumulated as "number of correct predictions" and divided
    # by the number of examples, so a smaller final batch is not given the
    # same weight as a full batch (which a mean of per-batch accuracies does).
    correct_predictions, nb_eval_examples = 0.0, 0

    for batch in validation_dataloader:

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        # No labels are passed and gradients are disabled: forward pass only.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Without labels the first output element holds the logits.
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        examples_in_batch = len(label_ids)

        # flat_accuracy returns a fraction; scale it back to a count.
        correct_predictions += flat_accuracy(logits, label_ids) * examples_in_batch
        nb_eval_examples += examples_in_batch

    print("Accuracy: {0:.2f}".format(correct_predictions / nb_eval_examples))
    print("Validation took: {:}".format(format_time(time.time() - t0)))
print("Training complete!")

In [None]:
print("Predicting labels for {:,} test sentences".format(len(test_input)))

model.eval()

# Per-batch logits and per-batch label arrays, collected in loader order.
prediction, true_labels = [], []

# Inference only: gradient tracking is switched off for the whole pass.
with torch.no_grad():
    for test_batch in test_dataloader:
        batch_ids, batch_mask, batch_labels = (t.to(device) for t in test_batch)

        # No labels are supplied, so the first output element holds the logits.
        batch_outputs = model(batch_ids,
                              token_type_ids=None,
                              attention_mask=batch_mask)

        prediction.append(batch_outputs[0].detach().cpu().numpy())
        true_labels.append(batch_labels.to('cpu').numpy())

print(" DONE.")

In [None]:
from sklearn.metrics import matthews_corrcoef

# Stack the per-batch logits into one array and take the highest-scoring
# class per row.
all_logits = np.concatenate(prediction, axis=0)
flat_prediction = np.argmax(all_logits, axis=1).flatten()

# Flatten the per-batch label arrays into one list, in the same order.
flat_true_labels = [label for batch_labels in true_labels for label in batch_labels]

mcc = matthews_corrcoef(flat_true_labels, flat_prediction)

print("MCC: %.3f" %mcc)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test examples whose predicted class equals the true label.
acc = accuracy_score(y_true=flat_true_labels, y_pred=flat_prediction)

print("ACC: %.3f" %acc)

In [None]:
#----------#