In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 8.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 78.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 49.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.9 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacre

In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')

cur_path = "/content/drive/My Drive/CS541/MiniProject"
os.chdir(cur_path)
os.listdir(cur_path)

Mounted at /content/drive


['covid19-cable-broadcast-labeled.csv', 'MiniProject.ipynb']

In [3]:
import pandas as pd
import numpy as np
import torch
import transformers as ppb
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
warnings.filterwarnings('ignore')

# Load Data

In [4]:
data = pd.read_csv('covid19-cable-broadcast-labeled.csv')

In [5]:
data.head(5)

Unnamed: 0,network,program,date,speech_turn,paragraph_sequence,paragraph,category
0,abc,worldnewstonight,2020-04-01,1,1,Were following that worsening situation off th...,covid_direct
1,abc,worldnewstonight,2020-04-01,2,1,As the two Holland America cruise ships approa...,covid_indirect
2,abc,worldnewstonight,2020-04-01,3,1,There is no time. These people need to get off...,covid_indirect
3,abc,worldnewstonight,2020-04-01,4,1,The Zaandam left Buenos Aires nearly a month a...,covid_indirect
4,abc,worldnewstonight,2020-04-01,5,1,"Clearly, weregonnabe willing to accept any Flo...",covid_indirect


In [6]:
data.category.value_counts()

non_covid         28027
covid_indirect    14588
covid_direct       6814
Name: category, dtype: int64

In [7]:
data['class'] = data['category'].apply(lambda x: 0 if x == 'non_covid' else 1)

In [8]:
data['class'].value_counts()

0    28027
1    21402
Name: class, dtype: int64

In [9]:
df = data.sample(10000)

# DistilBert & Logistic Regression

In [11]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Tokenize
tokenized = data['paragraph'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

# Run the model on a smaller size of data
#tokenized = df['paragraph'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [13]:
# Truncation + Padding
truncated = []
trunc_len = 10
for i in tokenized.values:
  if len(i) >= trunc_len: 
    _i = i[:trunc_len]
    truncated.append(_i)
  else: 
    _i = i + [0]*(trunc_len - len(i))
    truncated.append(_i)

truncated = np.array(truncated)

In [14]:
# Masking
attention_mask = np.where(truncated != 0, 1, 0)
attention_mask.shape

(49429, 10)

In [15]:
input_ids = torch.tensor(truncated)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [16]:
features = last_hidden_states[0][:,0,:].numpy()

In [17]:
labels = data['class']

#labels = df['class']

In [18]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2)

In [19]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 10.526405263157894}
best scrores:  0.6907163285611053


In [20]:
lr_clf = LogisticRegression(C=5.3)
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=5.3)

In [21]:
lr_clf.score(test_features, test_labels)

0.692190977139389

In [22]:
test_pred = lr_clf.predict(test_features)

In [23]:
print(classification_report(test_labels, test_pred))

              precision    recall  f1-score   support

           0       0.71      0.77      0.74      5615
           1       0.66      0.59      0.62      4271

    accuracy                           0.69      9886
   macro avg       0.69      0.68      0.68      9886
weighted avg       0.69      0.69      0.69      9886



# Bert

In [95]:
from torch.utils.data import TensorDataset, DataLoader, random_split
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

In [96]:
device = torch.device('cuda')

In [97]:
text_values = data.paragraph.values
text_labels = data['class'].values

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def encode_fn(text_data):
    all_input_ids = []    
    for text in text_data:
        input_ids = tokenizer.encode(text, add_special_tokens = True, \
                                     truncation=True, max_length = 50, \
                                     pad_to_max_length = True, return_tensors = 'pt')
        all_input_ids.append(input_ids)    
    all_input_ids = torch.cat(all_input_ids, dim=0)
    return all_input_ids

all_input_ids = encode_fn(text_values)
text_labels = torch.tensor(text_labels)

In [98]:
batch_size = 64
# Split data into train and validation
dataset = TensorDataset(all_input_ids, text_labels)
train_size = int(0.80 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
test_size = int(0.5 * val_size)
val_dataset, test_dataset = random_split(val_dataset, [test_size, test_size])


In [99]:
# Create train and validation dataloaders
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
val_dataloader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False) 

In [100]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_attentions=False, output_hidden_states=False, return_dict=False)
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [101]:
# create optimizer and learning rate schedule
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [102]:
from sklearn.metrics import f1_score, accuracy_score

def flat_accuracy(preds, labels):
    
    """A function for calculating accuracy scores"""
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)

In [103]:
def train_model(model, train_dataloader, val_dataloader, test_dataloader, device, epochs):

    # Training start
    for epoch in range(epochs):
        model.train()
        total_loss, total_val_loss = 0, 0
        total_eval_accuracy = 0
        for step, batch in enumerate(train_dataloader):
            model.zero_grad()
            loss, logits = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device))
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step() 
            scheduler.step()
            
        model.eval()
        for i, batch in enumerate(val_dataloader):
            with torch.no_grad():
                loss, logits = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device))
                    
                total_val_loss += loss.item()
                
                logits = logits.detach().cpu().numpy()
                label_ids = batch[1].to('cpu').numpy()
                total_eval_accuracy += flat_accuracy(logits, label_ids)

        # Test start
        Y = []
        preds = []
        model.eval()
        for i, batch in enumerate(test_dataloader):
            with torch.no_grad():
                loss, logits = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device))
                Y.append(batch[1].numpy())
                preds.append(logits.cpu().numpy())
        
        Y = np.concatenate(Y, axis =0)
        preds = np.concatenate(preds, axis =0)    
        y_pred = [0 if x[0] > x[1] else 1 for x in preds]   
        
        avg_train_loss = total_loss / len(train_dataloader)
        avg_val_loss = total_val_loss / len(val_dataloader)
        avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
        
        print(f'Epoch: {epoch}')
        print(f'Train loss: {avg_train_loss}')
        print(f'Validation loss: {avg_val_loss}')
        print(f'Accuracy: {avg_val_accuracy:.2f}')
        print('\n')
    
    return Y, y_pred


In [104]:
y_true, y_pred = train_model(model, train_dataloader, val_dataloader, test_dataloader, device, 10)

Epoch: 0
Train loss: 0.3297460374684588
Validation loss: 0.26897799749022877
Accuracy: 0.89


Epoch: 1
Train loss: 0.21174002722098603
Validation loss: 0.26915953126855385
Accuracy: 0.90


Epoch: 2
Train loss: 0.1342229685312119
Validation loss: 0.3095118778829391
Accuracy: 0.89


Epoch: 3
Train loss: 0.0824686632548914
Validation loss: 0.36998981886949295
Accuracy: 0.90


Epoch: 4
Train loss: 0.05088709619703995
Validation loss: 0.46678840359434104
Accuracy: 0.89


Epoch: 5
Train loss: 0.035804424226181554
Validation loss: 0.5227654722447579
Accuracy: 0.89


Epoch: 6
Train loss: 0.02505413526168069
Validation loss: 0.5952827330583181
Accuracy: 0.89


Epoch: 7
Train loss: 0.019995815992198244
Validation loss: 0.6154261963107647
Accuracy: 0.89


Epoch: 8
Train loss: 0.014896829850516963
Validation loss: 0.6404303442209195
Accuracy: 0.89


Epoch: 9
Train loss: 0.012740606259606239
Validation loss: 0.6613637984563143
Accuracy: 0.89




In [105]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.93      0.91      2834
           1       0.90      0.86      0.88      2109

    accuracy                           0.90      4943
   macro avg       0.90      0.89      0.90      4943
weighted avg       0.90      0.90      0.90      4943

