In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 18.1MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 59.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 52.2MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import BertForSequenceClassification, BertConfig, AdamW, BertTokenizer

from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset
import random
import re

# GPU Setup

In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Stop right away unless the first GPU is reported under its expected name.
if device_name == '/device:GPU:0':
    print('Found GPU at : {}'.format(device_name))
else:
    raise SystemError('GPU device not found!')

Found GPU at : /device:GPU:0


In [None]:
import torch

# Pick the device that later cells move the model and every batch onto.
if torch.cuda.is_available():
    # A CUDA device is visible: run on the GPU.
    device = torch.device('cuda')

    print("There are %d GPU(s) available" % torch.cuda.device_count())
    print("We will use the GPU:", torch.cuda.get_device_name(0))
else:
    # No CUDA device: fall back to the CPU.
    print('No GPU available, using CPU instead')
    device = torch.device('cpu')

There are 1 GPU(s) avcailable
We will use the GPU: Tesla T4


# Load Reference Dataset

In [None]:
business_path = "/content/drive/MyDrive/CCC/TweetData/business.txt"
entertainment_path = "/content/drive/MyDrive/CCC/TweetData/entertainment.txt"
fashion_path = "/content/drive/MyDrive/CCC/TweetData/fashion.txt"
food_path = "/content/drive/MyDrive/CCC/TweetData/food.txt"
gaming_path = "/content/drive/MyDrive/CCC/TweetData/gaming.txt"
health_path = "/content/drive/MyDrive/CCC/TweetData/health.txt"
music_path = "/content/drive/MyDrive/CCC/TweetData/music.txt"
politics_path = "/content/drive/MyDrive/CCC/TweetData/politics.txt"
sports_path = "/content/drive/MyDrive/CCC/TweetData/sports.txt"
technology_path = "/content/drive/MyDrive/CCC/TweetData/technology.txt"

# How many technology tweets are kept, and the seed that makes that draw
# repeatable from one run to the next.
TECHNOLOGY_SAMPLE_SIZE = 32000
TECHNOLOGY_SAMPLE_SEED = 42


def _read_tweet_texts(path, delimiter='GMT'):
    """Return the tweet text found after the timestamp marker on each line.

    Every line of ``path`` is split on ``delimiter``. Lines without the marker
    are skipped; for the others the piece that follows the first marker (up to
    the next marker, if any) is kept unchanged.
    """
    texts = []
    # The context manager closes the file handle once reading is done.
    # UTF-8 is stated explicitly so decoding does not depend on the locale
    # (assumes the dumps are UTF-8 -- TODO confirm).
    with open(path, encoding='utf-8') as tweet_file:
        for line in tweet_file:
            parts = line.split(delimiter)
            if len(parts) > 1:
                texts.append(parts[1])
    return texts


business_data = _read_tweet_texts(business_path)
entertainment_data = _read_tweet_texts(entertainment_path)
fashion_data = _read_tweet_texts(fashion_path)
food_data = _read_tweet_texts(food_path)
gaming_data = _read_tweet_texts(gaming_path)
health_data = _read_tweet_texts(health_path)
music_data = _read_tweet_texts(music_path)
politics_data = _read_tweet_texts(politics_path)
sports_data = _read_tweet_texts(sports_path)
# The technology dump marks its timestamps with UTC instead of GMT.
technology_data_total = _read_tweet_texts(technology_path, delimiter='UTC')
# A private Random instance makes the subsample reproducible without touching
# the global random state.
technology_data = random.Random(TECHNOLOGY_SAMPLE_SEED).sample(technology_data_total, TECHNOLOGY_SAMPLE_SIZE)

In [None]:
def Get_data_dict(data_list, data_names):
    """Pair each dataset with its name and return them as a ``{name: data}`` dict.

    Pairing is positional and stops at the shorter of the two sequences; when a
    name is repeated, the dataset that appears last wins.
    """
    return {label: samples for samples, label in zip(data_list, data_names)}

# Category names, listed in the same order as the datasets they label.
data_names = [
    'business', 'entertainment', 'fashion', 'food', 'gaming',
    'health', 'music', 'politics', 'sports', 'technology',
]
data_list = [
    business_data, entertainment_data, fashion_data, food_data, gaming_data,
    health_data, music_data, politics_data, sports_data, technology_data,
]
training_data_dict = Get_data_dict(data_list, data_names)

In [None]:
# training_data_dict

In [None]:
# Report how many tweets were loaded per category. Each label names the data
# list that is actually being measured; labels are padded to a common width so
# the counts line up.
for category_name, category_tweets in training_data_dict.items():
    print(f'{category_name}_data length:'.ljust(29), len(category_tweets))


business_data length:         32811
entertainment_path length:    30002
fashion_path length:          30002
food_path length:             30017
gaming_path length:           30012
health_path length:           30014
music_path length:            30005
politics_path length:         30001
sports_path length:           30012
technology_path length:       32000


# Prep Data

In [None]:
def Clean_Tweets(data):
    """Normalise one raw tweet line into lower-case text for tokenisation.

    Steps, in order: lower-case the text; drop http/https links; drop literal
    backslash-n / backslash-t escape sequences; blank out punctuation and
    parentheses; drop ``@mentions``; drop ``<...>`` tags such as the leading
    ``<username>`` marker. Every removed span is replaced by a single space,
    so the result can contain runs of spaces.

    Args:
        data: Raw tweet text (str).

    Returns:
        The cleaned string.
    """
    text = data.lower()
    # Remove any http/https link, not only t.co short links. The match stops
    # at whitespace or an angle bracket so a link cannot swallow a <tag>.
    text = re.sub(r'https?:[^\s<>]+', ' ', text)
    # Escaped newlines/tabs have to go before the backslash itself is blanked
    # below; otherwise a stray "n"/"t" stays glued to the following word.
    text = re.sub(r'\\n|\\t', ' ', text)
    text = re.sub(r'[,\.!?\/\;\:\#\\\'\"\(\)]', ' ', text)
    text = re.sub(r'\@\w+', ' ', text)
    # Match one tag at a time: a greedy ".+" would delete everything between
    # the first "<" and the last ">" of the tweet, real text included.
    text = re.sub(r'<[^<>]+>', ' ', text)
    return text

def Get_Labeled_Data(data_dict):
    """Stack per-category tweet lists into one labelled DataFrame.

    Args:
        data_dict: Mapping of category name -> sequence of tweet texts.

    Returns:
        A DataFrame with a ``data`` column (the texts) and a ``label`` column
        (the owning category name), rows in mapping order and re-indexed from 0.
    """
    per_label_frames = [
        # assign() fills the columns in keyword order: texts first, label second.
        pd.DataFrame().assign(data=texts, label=label_name)
        for label_name, texts in data_dict.items()
    ]
    return pd.concat(per_label_frames, ignore_index=True)

In [None]:
# Build the labelled frame, then add a column holding the cleaned tweet text.
training_data_df = Get_Labeled_Data(training_data_dict)
training_data_df['cleaned_text'] = training_data_df['data'].apply(Clean_Tweets)

In [None]:
training_data_df

Unnamed: 0,data,label,cleaned_text
0,<melissanmorgan> I’m pretty sure my boss expl...,business,i’m pretty sure my boss explicitly waits un...
1,<Incog_Negro_> @Dnhleza Yes iyadzingeka coz a...,business,yes iyadzingeka coz at the end of the day...
2,<bmatic33> Pussy gets thrown at me more when ...,business,pussy gets thrown at me more when i m focus...
3,<xgretty> So I’m just over here minding my bu...,business,listening to binaural beats when suddenly i...
4,<jmints1775> @ColeHarrisCA Jobs start with bu...,business,jobs start with business and if education...
...,...,...,...
304871,<DataRobot> We’re empowering all kinds of org...,technology,we’re empowering all kinds of organizations...
304872,<DailyScene> Our digital future will be shape...,technology,our digital future will be shaped by increa...
304873,<LEDLightingSA> #Repost @LineaLightGroupRedef...,technology,repost spaces through light discover ho...
304874,<Fenergo> #ACAMSHollywood #Regulatory Roundta...,technology,acamshollywood regulatory roundtable - oc...


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the shuffled rows as the dev set; the fixed random_state
# keeps the split the same from run to run.
train_df, dev_df = train_test_split(
    training_data_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)


# Encoding Data

In [None]:
# Short code for every category name.
my_dict = dict(
    business='b',
    entertainment='e',
    fashion='fa',
    food='fo',
    gaming='g',
    health='h',
    music='m',
    politics='p',
    sports='s',
    technology='t',
)


def update_cat(x):
    """Return the short code for category name ``x`` (KeyError if unknown)."""
    return my_dict[x]

# Work on copies so the split frames stay untouched, then add the short
# category code as a LABEL column on each copy.
train_data_df = train_df.copy()
train_data_df['LABEL'] = train_data_df['label'].apply(update_cat)

dev_data_df = dev_df.copy()
dev_data_df['LABEL'] = dev_data_df['label'].apply(update_cat)

# Short code -> integer class id, filled lazily by encode_cat.
encode_dict = {}


def encode_cat(x):
    """Return the integer id for ``x``, assigning the next free id on first sight.

    Ids therefore follow the order in which new values are first passed in.
    """
    # len() is evaluated before the insert, so a new key gets the current size.
    return encode_dict.setdefault(x, len(encode_dict))

# Train rows are encoded first, so class ids follow first appearance in the
# training frame; the dev frame then reuses (or extends) the same mapping.
train_data_df['ENCODE_LABEL'] = train_data_df['LABEL'].apply(encode_cat)
dev_data_df['ENCODE_LABEL'] = dev_data_df['LABEL'].apply(encode_cat)

In [None]:
train_data_df

Unnamed: 0,data,label,cleaned_text,LABEL,ENCODE_LABEL
46787,<plexandiptv> movie - Greg Davies: You Magni...,entertainment,movie - greg davies you magnificent beast...,e,0
5055,<enzonetwork> 4. The incentive structure that...,business,4 the incentive structure that drives the ...,b,1
287454,<ClouDatAI> ImpressAnyoneCollaboration Techno...,technology,impressanyonecollaboration technology inter...,t,2
150341,<ralKads> @HLTVorg @FNATIC @TeamLiquid @mouse...,gaming,the last time fnatic pl...,g,3
147271,<ProdByIcy> I’m sorry guys but I have only wo...,gaming,i’m sorry guys but i have only worked on mu...,g,3
...,...,...,...,...,...
119879,<jodiemccutcheon> Me n Caitlin went our first...,food,a swear she’s no bumped bread up her jumper...,fo,9
259178,<debbybax21> #ClassOfMumAndDad....well done M...,sports,classofmumanddad well done mark i fee...,s,5
131932,<Stephanie64030> Help me win a gaming pc from...,gaming,help me win a gaming pc from xidax https ...,g,3
146867,<Red_Emzy> @JadeMarieGarcia SAME! im a huge f...,gaming,if you haven t seen the movie ready player ...,g,3


In [None]:
# Model inputs (cleaned text) and targets (integer class ids) for each split.
X_offensive_new_train_data_csv, y_offensive_new_train_data_csv = (
    train_data_df['cleaned_text'],
    train_data_df['ENCODE_LABEL'],
)
X_offensive_new_dev_data_csv, y_offensive_new_dev_data_csv = (
    dev_data_df['cleaned_text'],
    dev_data_df['ENCODE_LABEL'],
)

In [None]:
MAX_LEN = 128

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)


def _encode_texts(texts):
    """Tokenise ``texts`` into fixed-length PyTorch tensors.

    Special tokens are added, and every sequence is truncated or padded to
    ``MAX_LEN`` ids; an attention mask is returned alongside the ids.
    """
    return tokenizer.batch_encode_plus(
        # A plain list of str is what the tokenizer documents as batch input.
        list(texts),
        add_special_tokens=True,
        return_attention_mask=True,
        # padding='max_length' alone is the supported spelling; the deprecated
        # pad_to_max_length flag only duplicated it and is not passed.
        padding='max_length',
        max_length=MAX_LEN,
        return_tensors='pt',
        truncation=True
    )


print("Encoding Started...")

encoded_data_train = _encode_texts(X_offensive_new_train_data_csv.values)

print("Train data encoding done!")

encoded_data_val = _encode_texts(X_offensive_new_dev_data_csv.values)

print("Dev data encoding done!")

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_offensive_new_train_data_csv.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_offensive_new_dev_data_csv.values)


# Each dataset item is the triple (input_ids, attention_mask, label).
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…


Encoding Started...
Train data encoding done!
Dev data encoding done!


In [None]:
# Size the classification head from the label encoding built earlier instead
# of a literal, so the number of outputs cannot drift from the number of
# encoded classes.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(encode_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# Module.to() moves the parameters and returns the module itself; binding the
# result keeps the cell from echoing the full layer listing as its output.
model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# Data Loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 30

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(dataset_train)
validation_sampler = SequentialSampler(dataset_val)

dataloader_train = DataLoader(dataset_train, batch_size=batch_size, sampler=train_sampler)
dataloader_validation = DataLoader(dataset_val, batch_size=batch_size, sampler=validation_sampler)

# dataloader_test = DataLoader(dataset_test, 
#                                    sampler=SequentialSampler(dataset_test), 
#                                    batch_size=batch_size)



# Optimisation and Scheduling

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

epochs = 6

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per training batch, so its horizon is the
# number of batches per epoch times the number of epochs; no warm-up steps.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# Training Loop

In [None]:
# numpy is imported here because this cell uses it and the notebook otherwise
# first imports it in a later cell.
import numpy as np
from sklearn.metrics import f1_score


def f1_score_func(preds, labels):
    """Return ``(macro_f1, micro_f1)`` for a set of per-class scores.

    Args:
        preds: 2-D array-like of per-class scores, one row per sample; the
            predicted class is the arg-max of each row.
        labels: Array-like of true class ids; flattened before scoring.

    Returns:
        Tuple ``(macro F1, micro F1)`` as computed by scikit-learn's
        ``f1_score``.
    """
    preds_flat = np.argmax(preds, axis=1).flatten()
    # asarray lets plain lists through as well as NumPy arrays.
    labels_flat = np.asarray(labels).flatten()
    macro_f1 = f1_score(labels_flat, preds_flat, average='macro')
    micro_f1 = f1_score(labels_flat, preds_flat, average='micro')
    return macro_f1, micro_f1

In [None]:
import random

def evaluate(dataloader_val):
    """Run the global ``model`` over ``dataloader_val`` without gradient tracking.

    NOTE(review): an identical ``evaluate`` is defined again in the training
    cell further down; that later definition replaces this one once it runs.

    Args:
        dataloader_val: Iterable of batches whose first three tensors are
            input ids, attention mask and labels, in that order.

    Returns:
        ``(mean_loss, logits, labels)``: the sum of per-batch losses divided by
        ``len(dataloader_val)``, plus the logits and label ids of all batches
        concatenated along axis 0 as NumPy arrays.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        # Move every tensor to the target device and cast it to int64.
        tensors = tuple(t.to(device).long() for t in raw_batch)

        with torch.no_grad():
            outputs = model(input_ids=tensors[0],
                            attention_mask=tensors[1],
                            labels=tensors[2])

        # With labels supplied, index 0 is the loss and index 1 the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(tensors[2].cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)

    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals




In [None]:
import random
import numpy as np

seed_val = 20

# Seed every RNG the training loop can draw from: Python's random, NumPy, and
# PyTorch (the default generator plus all CUDA devices).
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

def evaluate(dataloader_val):
    """Score ``dataloader_val`` with the global ``model`` in eval mode.

    NOTE(review): an identical ``evaluate`` is also defined in an earlier
    cell; this definition is the one in effect for the training loop below.

    Args:
        dataloader_val: Iterable of batches whose first three tensors are
            input ids, attention mask and labels, in that order.

    Returns:
        A 3-tuple: the summed batch loss divided by ``len(dataloader_val)``,
        the logits of every batch stacked along axis 0, and the matching
        label ids stacked the same way (both NumPy arrays).
    """
    model.eval()

    total_loss = 0
    all_logits, all_labels = [], []

    for batch in dataloader_val:
        on_device = tuple(tensor.to(device).long() for tensor in batch)
        model_kwargs = {
            'input_ids': on_device[0],
            'attention_mask': on_device[1],
            'labels': on_device[2],
        }

        # No gradients are needed for evaluation.
        with torch.no_grad():
            result = model(**model_kwargs)

        batch_loss, batch_logits = result[0], result[1]
        total_loss += batch_loss.item()

        all_logits.append(batch_logits.detach().cpu().numpy())
        all_labels.append(model_kwargs['labels'].cpu().numpy())

    mean_loss = total_loss / len(dataloader_val)

    stacked_logits = np.concatenate(all_logits, axis=0)
    stacked_labels = np.concatenate(all_labels, axis=0)

    return mean_loss, stacked_logits, stacked_labels
    
# One pass per epoch: train on every batch, save a checkpoint, then report the
# training loss and the validation loss / F1 scores.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

    for batch in progress_bar:

        model.zero_grad()
        batch = tuple(b.to(device).long() for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        # .item() copies the value to the host, so read it once per step.
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as returned by the model. `batch` is the tuple
        # of three tensors, so len(batch) is 3 rather than the batch size and
        # must not be used to rescale the loss.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    torch.save(model.state_dict(), f'/content/drive/MyDrive/CCC/BERT_Classification_Model/BERT_classification_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Macro): {val_f1[0]}')
    tqdm.write(f'F1 Score (Micro): {val_f1[1]}')
    


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=8130.0, style=ProgressStyle(description_wid…


Epoch 1
Training loss: 0.32617608759412486
Validation loss: 0.2876039762131799
F1 Score (Macro): 0.9046914007958777
F1 Score (Micro): 0.9018302282865388


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=8130.0, style=ProgressStyle(description_wid…

# Evaluation

In [None]:
import torch.nn.functional as F
import pandas as pd 
from reliability_diagrams import *


def new_conficence_vals(old_confidences, Temp):
    """Turn rows of logits into temperature-scaled softmax probabilities.

    Args:
        old_confidences: Iterable of 1-D logit vectors, one per sample; each
            row must support division by ``Temp`` (e.g. a NumPy array).
        Temp: Temperature the logits are divided by before the softmax.

    Returns:
        A list with one 1-D tensor of class probabilities per input row.
    """
    # dim is passed explicitly: relying on softmax's implicit dimension is
    # deprecated and raises a warning on every call.
    return [F.softmax(torch.tensor(row / Temp), dim=-1) for row in old_confidences]



def Get_Predictions(model, data_loader):
    """Return ``(predictions, true_vals)`` for ``data_loader``, dropping the loss.

    NOTE(review): the ``model`` argument is not used here; ``evaluate`` is
    called with the loader only.
    """
    _loss, logits, labels = evaluate(data_loader)
    return logits, labels


def Get_Model_df(seed_val, predictions, temp):
    """Build a per-sample table of logits, labels and temperature-scaled confidence.

    Args:
        seed_val: Not used by this function.
        predictions: Pair whose index 0 holds the per-sample logit rows and
            index 1 the matching true labels.
        temp: Temperature passed on to ``new_conficence_vals``.

    Returns:
        DataFrame with columns ``prediction_logits``, ``true_label``,
        ``pred_label`` (position of the highest probability; the first one on
        ties) and ``confidence`` (that highest probability as a float).
    """
    logits, true_vals = predictions[0], predictions[1]

    # Materialise each probability vector once as a plain list.
    prob_rows = [list(probs) for probs in new_conficence_vals(logits, temp)]
    top_probs = [float(max(row)) for row in prob_rows]

    model_df = pd.DataFrame()
    model_df['prediction_logits'] = [list(row) for row in logits]
    model_df['true_label'] = [float(label) for label in true_vals]
    model_df['pred_label'] = [row.index(max(row)) for row in prob_rows]
    model_df['confidence'] = [float(top) for top in top_probs]

    return model_df


