In [1]:
# This program is based on the project and its tutorial.
# https://huggingface.co/
# Dataset is from 
# https://www.kaggle.com/kritanjalijain/amazon-reviews

In [2]:
!pip install transformers
!pip install datasets
!python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
!python -c "from datasets import load_dataset; print(load_dataset('squad', split='train')[0])"

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 49.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 58.2 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled P

In [3]:
from google.colab import drive

drive.mount('/content/drive')
path = 'drive/My Drive/'

Mounted at /content/drive


In [4]:
!apt-get update
!pip3 install tqdm
import tqdm
print(tqdm.__version__)

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Waiting for headers] [Co0% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Get:4 https://developer.download.nvidia.com/comp

In [5]:
# read dataset
import pandas as pd

df_train = pd.read_csv('drive/My Drive/amazon_review_polarity_csv/train.csv',names=('labels','key','text'))
df_train = df_train.dropna()

df_test = pd.read_csv('drive/My Drive/amazon_review_polarity_csv/test.csv',names=('labels','key','text'))
df_test = df_test.dropna()

In [6]:
df_train[0:4]

Unnamed: 0,labels,key,text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...


In [7]:
# Colab(not pro)'s RAM is not enough. So we fine-tune the model with 0.1% data 
pos = int(len(df_train)*0.001)
df_train = df_train[0:pos]
#df_test = df_test[0:pos]

In [8]:
train_labels = df_train['labels'].values.tolist()
train_key = df_train['key'].values.tolist()

for i in range(len(train_labels)):
  if train_labels[i] == 2:
    train_labels[i] = 1
  else:
    train_labels[i] = 0


test_labels = df_test['labels'].values.tolist()
test_key = df_test['key'].values.tolist()

for i in range(len(test_labels)):
  if test_labels[i] == 2:
    test_labels[i] = 1
  else:
    test_labels[i] = 0

In [9]:
from sklearn.model_selection import train_test_split
train_key, eval_key, train_labels, eval_labels = train_test_split(train_key, train_labels, test_size=.2)

In [10]:
# preprocessing words->token

from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_key, truncation=True, padding=True)
eval_encodings = tokenizer(eval_key, truncation=True, padding=True)
test_encodings = tokenizer(test_key, truncation=True, padding=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

In [11]:
# Build dataset same as content in toturial in huggingface

import torch

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MyDataset(train_encodings, train_labels)
eval_dataset = MyDataset(eval_encodings, eval_labels)
test_dataset = MyDataset(test_encodings, test_labels)

In [12]:
# Train the model with training data(split train and validation)

import numpy as np
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    recall = recall_score(labels,predictions)
    f1 = f1_score(labels,predictions)
    precision = precision_score(labels,predictions)
    accuracy = accuracy_score(labels,predictions)
    result = {
        'accuracy':accuracy,
        'f1':f1,
        'precision':precision,
        'recall':recall
    }
    return result

training_args = TrainingArguments(
    output_dir='drive/My Drive/amazon_review_polarity_csv',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=100,
    weight_decay=0.01,
    evaluation_strategy='steps',
    save_strategy='epoch',
    seed=0,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.9448,0.876056,0.8125,0.791345,0.882759,0.717087
1000,0.8962,0.9157,0.7875,0.787204,0.781768,0.792717
1500,0.8905,0.843846,0.806944,0.820181,0.762019,0.887955
2000,0.8262,0.776234,0.826389,0.829001,0.81016,0.848739
2500,0.9356,0.89439,0.826389,0.829001,0.81016,0.848739
3000,0.7876,0.907731,0.830556,0.833787,0.811671,0.857143
3500,0.6006,0.823276,0.829167,0.816692,0.872611,0.767507
4000,0.5173,1.088186,0.8125,0.801762,0.842593,0.764706
4500,0.445,1.129028,0.827778,0.818713,0.856269,0.784314
5000,0.46,1.182313,0.806944,0.783826,0.881119,0.705882


***** Running Evaluation *****
  Num examples = 720
  Batch size = 1
***** Running Evaluation *****
  Num examples = 720
  Batch size = 1
***** Running Evaluation *****
  Num examples = 720
  Batch size = 1
***** Running Evaluation *****
  Num examples = 720
  Batch size = 1
***** Running Evaluation *****
  Num examples = 720
  Batch size = 1
Saving model checkpoint to drive/My Drive/amazon_review_polarity_csv/checkpoint-2879
Configuration saved in drive/My Drive/amazon_review_polarity_csv/checkpoint-2879/config.json
Model weights saved in drive/My Drive/amazon_review_polarity_csv/checkpoint-2879/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 720
  Batch size = 1
***** Running Evaluation *****
  Num examples = 720
  Batch size = 1
***** Running Evaluation *****
  Num examples = 720
  Batch size = 1
***** Running Evaluation *****
  Num examples = 720
  Batch size = 1
***** Running Evaluation *****
  Num examples = 720
  Batch size = 1
***** Running Evaluation *****
  

TrainOutput(global_step=8637, training_loss=0.5624700504085236, metrics={'train_runtime': 448.3172, 'train_samples_per_second': 19.265, 'train_steps_per_second': 19.265, 'total_flos': 248041840553316.0, 'train_loss': 0.5624700504085236, 'epoch': 3.0})

In [13]:
path = 'drive/My Drive/amazon_review_polarity_csv/checkpoint-8637'

model = DistilBertForSequenceClassification.from_pretrained(path,num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer.evaluate()


loading configuration file drive/My Drive/amazon_review_polarity_csv/checkpoint-8637/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "vocab_size": 30522
}

loading weights file drive/My Drive/amazon_review_polarity_csv/checkpoint-8637/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initia

{'eval_accuracy': 0.8130753268831721,
 'eval_f1': 0.809731270358306,
 'eval_loss': 1.17488431930542,
 'eval_precision': 0.8244896690039023,
 'eval_recall': 0.7954919323789856,
 'eval_runtime': 2373.3957,
 'eval_samples_per_second': 168.531,
 'eval_steps_per_second': 168.531}