<a href="https://colab.research.google.com/github/Joongeun/Internship/blob/main/FINAL_distilbert_with_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers==4.28.0

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m88.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datase

In [None]:
!pip install transformers
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertForSequenceClassification, AdamW



## General Settings

In [None]:
# Ask cuDNN for deterministic kernels.
torch.backends.cudnn.deterministic = True

RANDOM_SEED = 123
# Seed Python's `random`, NumPy and PyTorch in one call. Seeding torch alone leaves
# NumPy-based code (e.g. scikit-learn's train_test_split) unseeded, so the data split
# would differ from run to run.
transformers.set_seed(RANDOM_SEED)

# NOTE(review): NUM_EPOCHS is not referenced by the TrainingArguments cell, which
# hard-codes num_train_epochs=5 — confirm which value is intended.
NUM_EPOCHS = 15
# Use the first GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the full table of posts; the preview below shows a `selftext` and a `labels` column.
all_df = pd.read_csv('/content/Labeled Posts - preprocessed_csv.csv')
# Keep the first 3500 rows as the working set for the train/validation/test split.
# Rows from 3500 onward are tokenized separately later as the prediction set.
df = all_df[:3500]
df.head()

Unnamed: 0,selftext,labels
0,Comment and post if you have any questions abo...,0.0
1,http://www.nejm.org/doi/full/10.1056/NEJMc1413...,0.0
2,"This sub doesn't have a lot of activity, but I...",1.0
3,I moved from cigarettes to my vape three years...,4.0
4,I vape constantly.. started about 5 years ago....,4.0


In [None]:
# Sanity check on the working slice; the recorded output is (3500, 2).
df.shape

(3500, 2)

## Split Dataset into Train/Validation/Test

In [None]:
from sklearn.model_selection import train_test_split

# Raw label values that are collapsed into the positive class (1); every other
# value becomes the negative class (0).
POSITIVE_RAW_LABELS = (2.0, 4.0)


def to_binary_labels(raw_labels):
    """Map raw label values to 0/1: 1 if the value is in POSITIVE_RAW_LABELS, else 0."""
    return [1 if label in POSITIVE_RAW_LABELS else 0 for label in raw_labels]


X = df['selftext'].values
y = df['labels'].values

# Two stratified splits: 20% held out for test, then 25% of the remainder for
# validation. Stratification is on the raw labels, before they are binarised.
# random_state pins both splits so that every run trains and evaluates on the same rows.
X_train, test_texts, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=RANDOM_SEED)
train_texts, valid_texts, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.25, random_state=RANDOM_SEED)

train_labels = to_binary_labels(y_train)
valid_labels = to_binary_labels(y_val)
test_labels = to_binary_labels(y_test)

## Tokenization

In [None]:
from transformers import DistilBertTokenizerFast

# Load the tokenizer for the same checkpoint name that is later passed to
# DistilBertForSequenceClassification.from_pretrained.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Tokenize each split in its own call, with truncation and padding enabled.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

# Rows from 3500 onward fall outside the `df` slice (all_df[:3500]); they are tokenized
# here so the trained model can be run over them later.
prediction_encodings = tokenizer(list(all_df['selftext'][3500:]), truncation=True, padding=True)
# Existing `labels` values for those rows, carried along because RedditDataset takes a
# labels argument. NOTE(review): the first one displays as nan in the next cell, so
# these rows look unlabelled — confirm.
prediction_labels = list(all_df['labels'][3500:])

In [None]:
# Inspect the first label of the prediction slice; the recorded output is nan.
prediction_labels[0]

nan

## Dataset Class and Loaders

In [None]:
class RedditDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with one label per example.

    Args:
        encodings: mapping of field name -> per-example sequences; only `.items()`
            and integer indexing of each value are used.
        labels: sequence with one label per example; its length is the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is taken from the labels, not from the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at `idx` as a tensor, plus 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap each tokenized split together with its binary (0/1) labels.
train_dataset = RedditDataset(train_encodings, train_labels)
valid_dataset = RedditDataset(valid_encodings, valid_labels)
test_dataset = RedditDataset(test_encodings, test_labels)

# Prediction split: encodings for rows 3500 onward, paired with the raw values of
# their `labels` column (not binarised).
prediction_dataset = RedditDataset(prediction_encodings, prediction_labels)

In [None]:
# Only the training loader is shuffled. Validation and test batches keep dataset order
# so that the predictions printed during evaluation line up with valid_texts/test_texts
# and are identical from run to run.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=20, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=20, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=20, shuffle=False)

# The name `prediction_dataset` is kept because the prediction cell iterates over it,
# but the loader is rebuilt from the encodings instead of from `prediction_dataset`
# itself. Re-running this cell therefore no longer wraps a DataLoader in a DataLoader.
prediction_dataset = torch.utils.data.DataLoader(
    RedditDataset(prediction_encodings, prediction_labels), batch_size=20)

## Load Model

In [None]:
# Load pretrained DistilBERT with a sequence-classification head and move it to DEVICE.
# num_labels is not passed, so the library default applies. NOTE(review): presumably 2,
# which is what the two class weights in the custom loss assume — confirm.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train()

# NOTE(review): this optimizer is never handed to the Trainer built later (no
# `optimizers=` argument there), and TrainingArguments sets learning_rate=2e-5 with
# optim='adamw_torch', so lr=7e-5 here does not appear to take effect.
optim = AdamW(model.parameters(), lr=7e-5)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

## Train Model

In [None]:
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
def compute_accuracy(p):
  """Compute classification metrics from a (predictions, labels) pair.

  Args:
    p: a two-element object that unpacks into per-class scores and the true labels.
       The predicted class is the argmax of the scores along axis 1.

  Returns:
    dict with keys "accuracy", "precision", "recall" and "f1".
  """
  scores, y_true = p
  y_pred = np.argmax(scores, axis=1)
  scorers = {
      "accuracy": accuracy_score,
      "precision": precision_score,
      "recall": recall_score,
      "f1": f1_score,
  }
  return {name: scorer(y_true=y_true, y_pred=y_pred) for name, scorer in scorers.items()}

In [None]:
from torch import nn
from transformers import Trainer, TrainingArguments

class CustomTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy over the model's logits."""

    # Per-class loss weights, indexed by class id (class 0 first, class 1 second).
    CLASS_WEIGHTS = (0.0551703346, 1.0)

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: dict of batch tensors; must contain "labels".
            return_outputs: when True, return `(loss, outputs)` instead of just `loss`.

        Returns:
            The scalar loss, or a `(loss, outputs)` tuple when `return_outputs` is True.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Build the weight tensor on the logits' device so the loss is computed where the
        # activations already are, instead of copying logits and labels to the CPU on
        # every step.
        class_weights = torch.tensor(self.CLASS_WEIGHTS, dtype=torch.float32, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import EarlyStoppingCallback, IntervalStrategy
from torch.optim.lr_scheduler import CosineAnnealingLR
# Training configuration: evaluate and checkpoint once per epoch for 5 epochs.
training_args = TrainingArguments(
    evaluation_strategy = "epoch", # evaluate at the end of every epoch (step-based alternative: IntervalStrategy.STEPS)
    save_strategy = "epoch",        # save a checkpoint at the end of every epoch
    eval_steps = 1,                 # NOTE(review): per the TrainingArguments docs this is only read when evaluation_strategy is "steps"; with "epoch" it should have no effect — confirm
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs (hard-coded; NUM_EPOCHS from the settings cell is not used here)
    per_device_train_batch_size=20,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0,               # strength of weight decay (0 = disabled)
    # logging_dir='./logs',            # directory for storing logs
    logging_steps=10,               # log the training loss every 10 steps
    metric_for_best_model = 'loss', # NOTE(review): load_best_model_at_end is commented out below, so the best checkpoint is presumably not restored after training — confirm
    # load_best_model_at_end=True,
    learning_rate = 2e-5,
    optim = 'adamw_torch')
    # lr_scheduler_type = "cosine_with_restarts")

    # lr_scheduler_type = ['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup', 'inverse_sqrt']

# CustomTrainer swaps in the class-weighted loss; metrics come from compute_accuracy.
trainer = CustomTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_accuracy)    # must be the one-argument metrics function, not the loader-based one defined in the Evaluate section
    # callbacks = [EarlyStoppingCallback(early_stopping_patience=25)]

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2229,0.162246,0.835714,0.72043,0.960573,0.823349
2,0.1645,0.157645,0.88,0.787611,0.956989,0.864078
3,0.0978,0.174388,0.872857,0.777778,0.953405,0.856683
4,0.0851,0.242085,0.895714,0.830128,0.928315,0.876481
5,0.0686,0.260485,0.894286,0.831715,0.921147,0.87415


TrainOutput(global_step=525, training_loss=0.13095084161985487, metrics={'train_runtime': 171.7956, 'train_samples_per_second': 61.119, 'train_steps_per_second': 3.056, 'total_flos': 1390907685888000.0, 'train_loss': 0.13095084161985487, 'epoch': 5.0})

## Evaluate

In [None]:
from sklearn.metrics import f1_score
# NOTE(review): this definition reuses the name of the one-argument metrics callback
# passed to the Trainer earlier. After this cell runs, re-running the Trainer cell would
# pass this three-argument function as `compute_metrics`. Consider renaming one of the two.
def compute_accuracy(model, data_loader, device):
    """Run `model` over `data_loader` and return binary classification metrics.

    Args:
        model: model called as `model(input_ids, attention_mask=...)`; its output must
            expose a 'logits' entry.
        data_loader: iterable of batches with 'input_ids', 'attention_mask' and 'labels'.
        device: device the input tensors are moved to before the forward pass.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1".
    """
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Labels are deliberately not passed to the model: only the logits are
            # needed, so there is no reason to have the model compute a loss as well.
            logits = model(input_ids, attention_mask=attention_mask)['logits']

            all_preds.extend(logits.argmax(dim=1).cpu().tolist())
            all_labels.extend(batch['labels'].tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


model.eval()
model.to(DEVICE)
# The function returns a dict of four metrics (fractions, not percentages), so the
# message no longer labels it "accuracy" or appends a '%'.
print(f'Test metrics: {compute_accuracy(model, test_loader, DEVICE)}')

[1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 

In [None]:
# Predict a class for every example in the prediction split.
model.eval()
model.to(DEVICE)
all_preds = []
with torch.no_grad():
    # `prediction_dataset` is a DataLoader at this point (rebound in the loaders cell).
    for batch in prediction_dataset:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)

        # The batch's 'labels' entry is ignored: only the logits are needed here.
        logits = model(input_ids, attention_mask=attention_mask)['logits']
        all_preds.extend(logits.argmax(dim=1).cpu().tolist())

# Show a short summary instead of dumping every prediction into the cell output.
print(f'{len(all_preds)} predictions; first 20: {all_preds[:20]}')

[1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 

In [None]:
# Rows [0, N_LABELED) are the slice used for training and evaluation (all_df[:3500]);
# the remaining rows receive the model's predictions.
N_LABELED = 3500
prediction_index = all_df.index[N_LABELED:]

# Fail loudly on a length mismatch rather than silently mislabelling rows.
if len(all_preds) != len(prediction_index):
    raise ValueError(
        f'expected {len(prediction_index)} predictions, got {len(all_preds)}')

# One vectorised assignment replaces the per-row `.at` loop.
all_df.loc[prediction_index, 'labels'] = all_preds
print(all_df)

                                               selftext  labels
0     Comment and post if you have any questions abo...     0.0
1     http://www.nejm.org/doi/full/10.1056/NEJMc1413...     0.0
2     This sub doesn't have a lot of activity, but I...     1.0
3     I moved from cigarettes to my vape three years...     4.0
4     I vape constantly.. started about 5 years ago....     4.0
...                                                 ...     ...
7746  I came so close. I walked to the store for a b...     1.0
7747  This is my 4th day, I’ve only hit my vape once...     0.0
7748  I tried a zyn pouch after 2 weeks no nic and i...     1.0
7749  I am struggling! I have been battling addictio...     1.0
7750  Quitting forever is my one and only resolution...     0.0

[7751 rows x 2 columns]


In [None]:
import csv
# Write with the .csv extension: the next cell reads '/content/predicted_labels.csv',
# which the extension-less name "predicted_labels" never produced.
# Assumes the working directory is /content (as the absolute paths in this notebook suggest).
all_df.to_csv("predicted_labels.csv")

In [None]:
import pandas as pd

# Number of leading rows that still carry raw labels and need binarising.
N_LABELED = 3500

all_df = pd.read_csv('/content/predicted_labels.csv', index_col=0)
print(all_df)

# Collapse raw labels to binary for the first N_LABELED rows: 2.0 and 4.0 become 1,
# everything else becomes 0. Done as one vectorised step instead of a per-row loop with
# chained indexing (`all_df['labels'][i]`).
labeled_index = all_df.index[:N_LABELED]
is_positive = all_df.loc[labeled_index, 'labels'].isin([2.0, 4.0])
all_df.loc[labeled_index, 'labels'] = is_positive.astype(float)

# Summarise the positives and preview a few, rather than printing every post's full
# text into the cell output.
print(f'{int(is_positive.sum())} of {len(labeled_index)} rows mapped to class 1')
print(all_df.loc[labeled_index, 'selftext'][is_positive].head())

all_df.to_csv("predicted_labels2.csv")

In [None]:
# Display the relabelled frame; the rendered output abbreviates the middle rows.
all_df

Unnamed: 0,selftext,labels
0,Comment and post if you have any questions abo...,0.0
1,http://www.nejm.org/doi/full/10.1056/NEJMc1413...,0.0
2,"This sub doesn't have a lot of activity, but I...",0.0
3,I moved from cigarettes to my vape three years...,1.0
4,I vape constantly.. started about 5 years ago....,1.0
...,...,...
7746,I came so close. I walked to the store for a b...,1.0
7747,"This is my 4th day, I’ve only hit my vape once...",0.0
7748,I tried a zyn pouch after 2 weeks no nic and i...,1.0
7749,I am struggling! I have been battling addictio...,1.0


In [None]:
def f1_from_counts(tp, fp, fn):
    """Return F1 = 2*TP / (2*TP + FP + FN), or 0.0 when the denominator is zero."""
    denominator = 2 * tp + fp + fn
    return 2 * tp / denominator if denominator else 0.0


# The arithmetic fixes TP + FP = 183 and TN + FN = 3317.
# NOTE(review): where these totals come from is not shown in the notebook — confirm.
PREDICTED_POSITIVES = 183
PREDICTED_NEGATIVES = 3317
TP = 90      # starting true-positive count for the sweep
TN = 3250
TP_STEP = 5  # TP increment per sweep step

FN = PREDICTED_NEGATIVES - TN  # constant across the sweep because TN never changes
prev_val = 0
# Stop once TP would exceed PREDICTED_POSITIVES. Beyond that point FP = 183 - TP goes
# negative and the F1 value stops describing a real confusion matrix.
for step, tp in enumerate(range(TP, PREDICTED_POSITIVES + 1, TP_STEP)):
    FP = PREDICTED_POSITIVES - tp
    new_val = f1_from_counts(tp, FP, FN)
    diff = new_val - prev_val  # F1 gained relative to the previous step
    print(new_val, diff, float(step))
    prev_val = new_val


0.5294117647058824 0.5294117647058824 0.0
0.5507246376811594 0.021312872975277064 1.0
0.5714285714285714 0.020703933747411973 2.0
0.5915492957746479 0.02012072434607648 3.0
0.6111111111111112 0.019561815336463284 4.0
0.6301369863013698 0.019025875190258668 5.0
0.6486486486486487 0.018511662347278857 6.0
0.6666666666666666 0.018018018018017945 7.0
0.6842105263157895 0.01754385964912286 8.0
0.7012987012987013 0.01708817498291182 9.0
0.717948717948718 0.01665001665001664 10.0
0.7341772151898734 0.016228497241155493 11.0
0.75 0.015822784810126556 12.0
0.7654320987654321 0.015432098765432056 13.0
0.7804878048780488 0.015055706112616751 14.0
0.7951807228915663 0.014692918013517464 15.0
0.8095238095238095 0.014343086632243263 16.0
0.8235294117647058 0.014005602240896309 17.0
0.8372093023255814 0.013679890560875596 18.0
0.8505747126436781 0.013365410318096682 19.0
0.8636363636363636 0.013061650992685525 20.0
0.8764044943820225 0.012768130745658857 21.0
0.8888888888888888 0.012484394506866336 2