# Demonstration - Get Data - Train Model - 20Newsgroup

In [None]:
# Mount Google Drive at /content/gdrive (Colab only). A later cell copies
# patched `transformers` files out of gdrive/MyDrive/NLP_Project_OOD.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!pip install git+https://github.com/Julien2048/NLP_Project_OOD.git
!git clone https://github.com/Julien2048/NLP_Project_OOD
!pip install -r NLP_Project_OOD/requirements.txt

In [None]:
%cd gdrive/MyDrive/NLP_Project_OOD/distilbert
!rm /usr/local/lib/python3.8/dist-packages/transformers/models/distilbert/modeling_distilbert.py
!cp modeling_distilbert.py /usr/local/lib/python3.8/dist-packages/transformers/models/distilbert/

!rm /usr/local/lib/python3.8/dist-packages/transformers/models/distilbert/__init__.py
!cp __init__.py /usr/local/lib/python3.8/dist-packages/transformers/models/distilbert/
%cd ..

%cd transformers
!rm /usr/local/lib/python3.8/dist-packages/transformers/__init__.py
!cp __init__.py /usr/local/lib/python3.8/dist-packages/transformers/
%cd ..
%cd ..
%cd ..
%cd ..

In [3]:
import torch
from nlp_project.data_loader import NewsgroupDataset, CreateDataset
from nlp_project.OOD_detector import Mahalanobis
from nlp_project.metrics import ResultsOOD
from nlp_project.BERT_adaptation import TokenizeData, DistilBertClassifier

## Load 20 Newsgroups Datasets

### Download 20 Newsgroups Data

In [4]:
# Build the dataset helper and unpack the six values returned by get_dataset():
# the texts of the in-distribution train / in-distribution test /
# out-of-distribution test splits, followed by the labels in the same order.
get_newsgroup = NewsgroupDataset()
(
    newsgroup_in_train_texts,
    newsgroup_in_test_texts,
    newsgroup_out_test_texts,
    newsgroup_in_train_labels,
    newsgroup_in_test_labels,
    newsgroup_out_test_labels,
) = get_newsgroup.get_dataset()

# Presumably persists the labels and texts for reuse — storage location is
# decided inside NewsgroupDataset (not visible here).
get_newsgroup.save_labels()
get_newsgroup.save_texts()

In [None]:
# Report the number of labels in each split.
_split_labels = (
    ("In Train", newsgroup_in_train_labels),
    ("In Test", newsgroup_in_test_labels),
    ("Out Test", newsgroup_out_test_labels),
)
for _split_name, _labels in _split_labels:
    print(f"Len Newsgroup {_split_name} Dataset: ", len(_labels))

## Prelogits extraction

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Tokenize data

In [7]:
# Sample counts handed to the tokenizer as its fourth argument.
# NB_TEXTS_TRAIN is None — presumably "use every training text"; confirm in TokenizeData.
NB_TEXTS_TRAIN = None
# Also used to truncate the in-distribution test labels when the test dataset is built.
NB_TEXTS_TEST = 1600

#### Create tokens

In [None]:
# Tokenize each split with the same TokenizeData instance. Every call returns
# (input_ids, attention_masks) and is immediately followed by save_tokens();
# keep each call/save pair together — save_tokens() takes no arguments, so it
# presumably stores the result of the most recent call (confirm in TokenizeData).
tokenizer = TokenizeData()
# In-distribution training split.
newsgroup_in_input_ids_train, newsgroup_in_attention_masks_train = tokenizer(newsgroup_in_train_texts, "newsgroup_in", "train", NB_TEXTS_TRAIN)
tokenizer.save_tokens()

# In-distribution test split.
newsgroup_in_input_ids_test, newsgroup_in_attention_masks_test = tokenizer(newsgroup_in_test_texts, "newsgroup_in", "test", NB_TEXTS_TEST)
tokenizer.save_tokens()

# Out-of-distribution test split.
# NOTE(review): the mask variable carries a stray "_review_" in its name, unlike
# the other splits; later cells reference it under this exact name.
newsgroup_out_input_ids_test, newsgroup_out_review_attention_masks_test = tokenizer(newsgroup_out_test_texts, "newsgroup_out", "test", NB_TEXTS_TEST)
tokenizer.save_tokens()

### Train the model

In [9]:
# Create the train and test datasets for the in-distribution classifier.
# Each encoding dict pairs the token ids with their attention masks.
newsgroup_in_encodings_train = {'input_ids':newsgroup_in_input_ids_train,
                      'attention_mask':newsgroup_in_attention_masks_train}
newsgroup_in_encodings_test = {'input_ids':newsgroup_in_input_ids_test,
                      'attention_mask':newsgroup_in_attention_masks_test}

# Truncate the labels with the same limits that were given to the tokenizer so
# encodings and labels stay aligned. Slicing with None keeps every label, so the
# current setting (NB_TEXTS_TRAIN = None) behaves exactly as before.
# Assumes the tokenizer keeps the first NB_TEXTS_* texts — TODO confirm in TokenizeData.
newsgroup_in_train_dataset = CreateDataset(newsgroup_in_encodings_train, newsgroup_in_train_labels[:NB_TEXTS_TRAIN])
newsgroup_in_test_dataset = CreateDataset(newsgroup_in_encodings_test, newsgroup_in_test_labels[:NB_TEXTS_TEST])

In [10]:
# CONSTANTS
# NOTE(review): none of these names is referenced by the cells visible in this
# notebook; the classifiers below are built with a literal num_epochs=4, which
# disagrees with NUM_EPOCHS. Confirm whether they are meant to be passed to
# DistilBertClassifier or can be removed.
BATCH_SIZE = 32
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
LEARNING_RATE = 1e-5
NUM_EPOCHS = 3
LOG_STEPS = 100

#### Train Classifier

In [11]:
# Train the classifier that is later used to extract prelogits and logits.
# NOTE(review): num_epochs=4 is a literal and differs from NUM_EPOCHS (3) above;
# num_labels=15 is presumably the number of in-distribution classes — confirm.
model_prelogits = DistilBertClassifier(device=device, training=True, num_epochs=4, num_labels=15)
# The in-distribution test set is passed as the second dataset (presumably for evaluation).
model_prelogits.train_model(newsgroup_in_train_dataset, newsgroup_in_test_dataset)
# NOTE(review): the model is trained on 20 Newsgroups but saved as "distilbert_reuters";
# the name is kept because other code may load the model under it.
model_prelogits.save_pretrained_model("distilbert_reuters")

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassificationPreLogits: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassificationPreLogits from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassificationPreLogits from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassificationPreLogits were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 

Step,Training Loss
100,2.591
200,1.8177
300,1.2449
400,0.9155
500,0.7633
600,0.6609
700,0.5711
800,0.51
900,0.4929
1000,0.4511


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in model_trained/distilbert_reuters/config.json
Model weights saved in model_trained/distilbert_reuters/pytorch_model.bin


In [None]:
# Train a second classifier (prelogits=False) that is later used to extract hidden states.
# NOTE(review): unlike model_prelogits, this call passes neither training=True nor
# num_labels=15 and relies on DistilBertClassifier's defaults — confirm they match.
model_hidden_state = DistilBertClassifier(device=device, prelogits=False, num_epochs=4)
model_hidden_state.train_model(newsgroup_in_train_dataset, newsgroup_in_test_dataset)
model_hidden_state.save_pretrained_model("distilbert_hidden_state")

### Get Logits and Prelogits 

In [34]:
# Run the trained prelogits model over every split. Each call returns a
# (prelogits, logits) pair; save=True is passed together with the dataset name
# and split name.

# In-distribution, train split.
newsgroup_in_prelogits_train, newsgroup_in_logits_train = model_prelogits.get_prelogit_logit(
    newsgroup_in_input_ids_train.to(device),
    newsgroup_in_attention_masks_train.to(device),
    "newsgroup_in",
    "train",
    save=True,
)

# In-distribution, test split.
newsgroup_in_prelogits_test, newsgroup_in_logits_test = model_prelogits.get_prelogit_logit(
    newsgroup_in_input_ids_test.to(device),
    newsgroup_in_attention_masks_test.to(device),
    "newsgroup_in",
    "test",
    save=True,
)

# Out-of-distribution, test split.
newsgroup_out_prelogits_test, newsgroup_out_logits_test = model_prelogits.get_prelogit_logit(
    newsgroup_out_input_ids_test.to(device),
    newsgroup_out_review_attention_masks_test.to(device),
    "newsgroup_out",
    "test",
    save=True,
)

### Get Hidden Layer

In [None]:
# Run the hidden-state model over every split. The same keyword arguments
# (save=True, size_array=50) are used for all three calls.

# In-distribution, train split.
newsgroup_in_hidden_state_train = model_hidden_state.get_hidden_layer(
    newsgroup_in_input_ids_train.to(device),
    newsgroup_in_attention_masks_train.to(device),
    "newsgroup_in",
    "train",
    save=True,
    size_array=50,
)

# In-distribution, test split.
newsgroup_in_hidden_state_test = model_hidden_state.get_hidden_layer(
    newsgroup_in_input_ids_test.to(device),
    newsgroup_in_attention_masks_test.to(device),
    "newsgroup_in",
    "test",
    save=True,
    size_array=50,
)

# Out-of-distribution, test split.
newsgroup_out_hidden_state_test = model_hidden_state.get_hidden_layer(
    newsgroup_out_input_ids_test.to(device),
    newsgroup_out_review_attention_masks_test.to(device),
    "newsgroup_out",
    "test",
    save=True,
    size_array=50,
)