# Demonstration - Get Data - Train Model - IMDB

In [1]:
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!pip install git+https://github.com/Julien2048/NLP_Project_OOD.git
!git clone https://github.com/Julien2048/NLP_Project_OOD
!pip install -r NLP_Project_OOD/requirements.txt

Installing collected packages: tokenizers, xxhash, dill, responses, multiprocess, huggingface-hub, transformers, datasets
Successfully installed datasets-2.10.1 dill-0.3.6 huggingface-hub-0.13.0 multiprocess-0.70.14 responses-0.18.0 tokenizers-0.13.2 transformers-4.26.1 xxhash-3.2.0


In [4]:
%cd gdrive/MyDrive/NLP_Project_OOD/distilbert
!rm /usr/local/lib/python3.9/dist-packages/transformers/models/distilbert/modeling_distilbert.py
!cp modeling_distilbert.py /usr/local/lib/python3.9/dist-packages/transformers/models/distilbert/

!rm /usr/local/lib/python3.9/dist-packages/transformers/models/distilbert/__init__.py
!cp __init__.py /usr/local/lib/python3.9/dist-packages/transformers/models/distilbert/
%cd ..

%cd transformers
!rm /usr/local/lib/python3.9/dist-packages/transformers/__init__.py
!cp __init__.py /usr/local/lib/python3.9/dist-packages/transformers/
%cd ..
%cd ..
%cd ..
%cd ..

/content/gdrive/MyDrive/NLP_Project_OOD/distilbert
/content/gdrive/MyDrive/NLP_Project_OOD
/content/gdrive/MyDrive/NLP_Project_OOD/transformers
/content/gdrive/MyDrive/NLP_Project_OOD
/content/gdrive/MyDrive
/content/gdrive
/content


In [5]:
!mkdir ~/.kaggle/ 
!cp kaggle.json ~/.kaggle/ # The kaggle.json is a file download from kaggle with your API codes (each has one to login)
!chmod 600 ~/.kaggle/kaggle.json 
!kaggle datasets download -d nltkdata/movie-review
!unzip /content/movie-review.zip -d /content/

In [6]:
gdown.download("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", output="aclImdb_v1.tar.gz", quiet=False)
!tar xzf /content/aclImdb_v1.tar.gz

In [5]:
import torch

In [6]:
from nlp_project.data_loader import IMDBDataset, MovieReviewDataset, MNLIDataset, SST2Dataset, RTEDataset, CreateDataset
from nlp_project.OOD_detector import Mahalanobis
from nlp_project.metrics import ResultsOOD
from nlp_project.BERT_adaptation import TokenizeData, DistilBertClassifier

## Load Datasets

### Load IMDB Dataset

In [None]:
get_imdb = IMDBDataset()
imdb_train_texts, imdb_test_texts, imdb_train_labels, imdb_test_labels = get_imdb.get_dataset()
get_imdb.save_labels()
get_imdb.save_texts()

### Load Movie Review Dataset

In [None]:
get_movie = MovieReviewDataset()
movie_review_train_texts, movie_review_test_texts, movie_review_train_labels, movie_review_test_labels = get_movie.get_dataset()
get_movie.save_labels()
get_movie.save_texts()

### Load MNLI Dataset

In [None]:
mnli = MNLIDataset()
mnli_test_texts = mnli.get_dataset()
mnli.save_texts()

### Load SST2 Datasets

In [None]:
get_sst2 = SST2Dataset()
sst2_train_texts, sst2_test_texts, sst2_train_labels, sst2_test_labels = get_sst2.get_dataset()
get_sst2.save_labels()
get_sst2.save_texts()

### Load RTE Dataset

In [None]:
get_rte = RTEDataset()
rte_test_texts = get_rte.get_dataset()
get_rte.save_texts()

## Prelogits extraction

In [8]:
# Check if a GPU is available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### Tokenize data

In [9]:
NB_TEXTS_TRAIN = 1600
NB_TEXTS_TEST = 400

#### Create tokens

In [None]:
tokenizer = TokenizeData()
imdb_input_ids_train, imdb_attention_masks_train = tokenizer(imdb_train_texts, "imdb", "train", 10000)
tokenizer.save_tokens()

imdb_input_ids_test, imdb_attention_masks_test = tokenizer(imdb_test_texts, "imdb", "test", NB_TEXTS_TEST)
tokenizer.save_tokens()

movie_review_input_ids_test, movie_review_attention_masks_test = tokenizer(movie_review_test_texts, "movie_review", "test", NB_TEXTS_TEST)
tokenizer.save_tokens()

mnli_input_ids_test, mnli_attention_masks_test = tokenizer(mnli_test_texts, "mnli", "test", NB_TEXTS_TEST)
tokenizer.save_tokens()

sst2_input_ids_test, sst2_attention_masks_test = tokenizer(sst2_test_texts, "sst2", "test", NB_TEXTS_TEST)
tokenizer.save_tokens()

rte_input_ids_test, rte_attention_masks_test = tokenizer(rte_test_texts, "rte", "test", NB_TEXTS_TEST)
tokenizer.save_tokens()

In [11]:
print("Shape IMDB Train Dataset: ", imdb_input_ids_train.shape)
print("Shape IMDB Test Dataset: ", imdb_input_ids_test.shape)
print("Shape Movie Review Test Dataset: ", movie_review_input_ids_test.shape)
print("Shape MNLI Test Dataset: ", movie_review_input_ids_test.shape)
print("Shape SST2 Test Dataset: ", sst2_input_ids_test.shape)
print("Shape RTE Test Dataset: ", rte_input_ids_test.shape)

Shape IMDB Train Dataset:  (10000, 512)
Shape IMDB Test Dataset:  (400, 512)


### Train the model

In [None]:
# Create the train, validation and test sets
imdb_encodings_train = {'input_ids':imdb_input_ids_train,
                      'attention_mask':imdb_attention_masks_train}
imdb_encodings_test = {'input_ids':imdb_input_ids_test,
                      'attention_mask':imdb_attention_masks_test}

imdb_train_dataset = CreateDataset(imdb_encodings_train, imdb_train_labels[:10000])
imdb_test_dataset = CreateDataset(imdb_encodings_test, imdb_test_labels[:NB_TEXTS_TEST])

In [None]:
# CONSTANTS
BATCH_SIZE = 32
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
LEARNING_RATE = 1e-5
NUM_EPOCHS = 3
LOG_STEPS = 100

#### Train classifier

In [None]:
model_prelogits = DistilBertClassifier(device=device, prelogits=True, num_epochs=4)
model_prelogits.train_model(imdb_train_dataset, imdb_test_dataset)
model_prelogits.save_pretrained_model("distilbert_prelogits")

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassificationPreLogits: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassificationPreLogits from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassificationPreLogits from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassificationPreLogits were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.wei

Step,Training Loss
100,0.6367
200,0.2952
300,0.2689
400,0.2197
500,0.1942
600,0.1943
700,0.1634


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
100,0.6367
200,0.2952
300,0.2689
400,0.2197
500,0.1942
600,0.1943
700,0.1634
800,0.1435
900,0.1497
1000,0.1306


Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in model_trained/distilbert/config.json
Model weights saved in model_trained/distilbert/pytorch_model.bin


In [None]:
model_hidden_state = DistilBertClassifier(device=device, prelogits=False, num_epochs=4)
model_hidden_state.train_model(imdb_train_dataset, imdb_test_dataset)
model_hidden_state.save_pretrained_model("distilbert_hidden_state")

### Get Logits and Prelogits 

In [None]:
imdb_prelogits_train, imdb_logits_train = model_prelogits.get_prelogit_logit(imdb_input_ids_train.to(device), imdb_attention_masks_train.to(device), "imdb", "train", save=True)
imdb_prelogits_test, imdb_logits_test = model_prelogits.get_prelogit_logit(imdb_input_ids_test.to(device), imdb_attention_masks_test.to(device), "imdb", "test", save=True)

movie_review_prelogits_test, movie_review_logits_test = model_prelogits.get_prelogit_logit(movie_review_input_ids_test.to(device), movie_review_attention_masks_test.to(device), "movie_review", "test", save=True)
mnli_prelogits_test, mnli_logits_test = model_prelogits.get_prelogit_logit(mnli_input_ids_test.to(device), mnli_attention_masks_test.to(device), "mnli", "test", save=True)
sst2_prelogits_test, sst2_logits_test = model_prelogits.get_prelogit_logit(sst2_input_ids_test.to(device), sst2_attention_masks_test.to(device), "sst2", "test", save=True)
rte_prelogits_test, rte_logits_test = model_prelogits.get_prelogit_logit(rte_input_ids_test.to(device), rte_attention_masks_test.to(device), "rte", "test", save=True)

### Get Hidden States

In [None]:
imdb_hidden_state_train = model_hidden_state.get_hidden_layer(imdb_input_ids_train.to(device), imdb_attention_masks_train.to(device), "imdb", "train", save=True, size_array=50)
imdb_hidden_state_test = model_hidden_state.get_hidden_layer(imdb_input_ids_test.to(device), imdb_attention_masks_test.to(device), "imdb", "test", save=True, size_array=50)

movie_review_hidden_state_test = model_hidden_state.get_hidden_layer(movie_review_input_ids_test.to(device), movie_review_attention_masks_test.to(device), "movie_review", "test", save=True, size_array=50)
mnli_hidden_state_test = model_hidden_state.get_hidden_layer(mnli_input_ids_test.to(device), mnli_attention_masks_test.to(device), "mnli", "test", save=True, size_array=50)
sst2_hidden_state_test = model_hidden_state.get_hidden_layer(sst2_input_ids_test.to(device), sst2_attention_masks_test.to(device), "sst2", "test", save=True, size_array=50)
rte_hidden_state_test = model_hidden_state.get_hidden_layer(rte_input_ids_test.to(device), rte_attention_masks_test.to(device), "rte", "test", save=True, size_array=50)