In [1]:
import os
import sys
import os.path as op
from torch.nn import functional as F
import numpy as np
import random

from nltk import word_tokenize
from torch.nn.utils.rnn import pad_sequence

In [2]:
# For those on google colab: you can download the files directly with this:
import gdown
gdown.download("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", output="aclImdb_v1.tar.gz", quiet=False)
!tar xzf /content/aclImdb_v1.tar.gz

Downloading...
From: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
To: /content/aclImdb_v1.tar.gz
100%|██████████| 84.1M/84.1M [00:09<00:00, 8.80MB/s]


In [2]:
from glob import glob


def load_reviews(split, sentiment, root=op.join('.', 'aclImdb')):
    """Read every review stored under ``<root>/<split>/<sentiment>/*.txt``.

    Args:
        split: Sub-folder of the dataset, ``'train'`` or ``'test'``.
        sentiment: Sub-folder of the split, ``'neg'`` or ``'pos'``.
        root: Directory the archive was extracted into.

    Returns:
        A ``(filenames, texts)`` pair: the sorted file paths and, in the same
        order, the full content of each file as a string. Both lists are empty
        when the folder does not exist.
    """
    filenames = sorted(glob(op.join(root, split, sentiment, '*.txt')))
    texts = []
    for filename in filenames:
        # `with` closes each handle right away instead of leaving thousands of
        # open files for the garbage collector to clean up.
        with open(filename, encoding="utf8") as handle:
            texts.append(handle.read())
    return filenames, texts


# Each file holds one review; negative and positive reviews live in separate folders.
train_filenames_neg, train_texts_neg = load_reviews('train', 'neg')
train_filenames_pos, train_texts_pos = load_reviews('train', 'pos')
test_filenames_neg, test_texts_neg = load_reviews('test', 'neg')
test_filenames_pos, test_texts_pos = load_reviews('test', 'pos')

# Negative reviews come first, positive reviews second.
train_texts = train_texts_neg + train_texts_pos
test_texts = test_texts_neg + test_texts_pos

# Labels follow the same ordering: start from all ones (positive) and zero out
# the leading block that corresponds to the negative reviews.
train_labels = np.ones(len(train_texts), dtype=int)
train_labels[:len(train_texts_neg)] = 0

test_labels = np.ones(len(test_texts), dtype=int)
test_labels[:len(test_texts_neg)] = 0

In [3]:
#from sklearn.model_selection import train_test_split

In [4]:
#train_texts_splt, val_texts, train_labels_splt, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [5]:
#!pip install transformers

In [6]:
import transformers
import keras
from transformers import BertTokenizer, BertModel
#from transformers import DistilBertTokenizerFast
import torch

In [7]:
from transformers import BertTokenizer

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Number of reviews that get encoded (kept small so feature extraction stays cheap).
N_TRAIN_SAMPLES = 200
N_TEST_SAMPLES = 100
# Every review is padded/truncated to this many tokens; 512 is the size of
# BERT's position-embedding table, i.e. the longest sequence it accepts.
MAX_SEQ_LEN = 512


def encode_texts(texts):
    """Tokenize a list of strings into fixed-length PyTorch tensors.

    Args:
        texts: List of raw review strings.

    Returns:
        The tokenizer output holding ``input_ids``, ``attention_mask`` and
        ``token_type_ids``, each with one row of ``MAX_SEQ_LEN`` entries per text.
    """
    return tokenizer(
        texts,
        add_special_tokens=True,
        max_length=MAX_SEQ_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )


# NOTE(review): `train_texts` / `test_texts` list all negative reviews before the
# positive ones, so these leading slices contain negative reviews only. Draw a
# shuffled or balanced subset before fitting a classifier on the features; the
# selection is left unchanged here so rows keep matching `train_labels[:N]`.
encoded_train = encode_texts(train_texts[:N_TRAIN_SAMPLES])
encoded_test = encode_texts(test_texts[:N_TEST_SAMPLES])

# Move the three model inputs of each split to the compute device.
input_train = encoded_train['input_ids'].to(device)
attention_masks_train = encoded_train['attention_mask'].to(device)
token_type_ids_train = encoded_train['token_type_ids'].to(device)

input_test = encoded_test['input_ids'].to(device)
attention_masks_test = encoded_test['attention_mask'].to(device)
token_type_ids_test = encoded_test['token_type_ids'].to(device)


In [9]:
from transformers import BertModel
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Rows sent through BERT per forward pass; lower it if the device runs out of memory.
BERT_BATCH_SIZE = 16


def run_bert(input_ids, attention_mask, token_type_ids, batch_size=BERT_BATCH_SIZE):
    """Run the frozen BERT encoder over the inputs in mini-batches.

    Feeding every row in a single forward pass makes peak memory grow with the
    number of samples; slicing keeps it bounded by ``batch_size``.

    Args:
        input_ids: Token ids, one row per text.
        attention_mask: Padding mask with the same shape as ``input_ids``.
        token_type_ids: Segment ids with the same shape as ``input_ids``.
        batch_size: Number of rows per forward pass.

    Returns:
        A tuple ``(outputs[0], outputs[1])`` of the per-batch model outputs
        concatenated along the first axis, so index access behaves like it does
        on a single-pass model output.
    """
    first_chunks, second_chunks = [], []
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        for start in range(0, input_ids.shape[0], batch_size):
            rows = slice(start, start + batch_size)
            # Keyword arguments make the call independent of the parameter order.
            batch_outputs = model(
                input_ids=input_ids[rows],
                attention_mask=attention_mask[rows],
                token_type_ids=token_type_ids[rows],
            )
            first_chunks.append(batch_outputs[0])
            second_chunks.append(batch_outputs[1])
    return torch.cat(first_chunks), torch.cat(second_chunks)


# Get the prelogits (second model output, one vector per review) of both splits.
outputs_train = run_bert(input_train, attention_masks_train, token_type_ids_train)
prelogits_train = outputs_train[1].cpu().numpy()

outputs_test = run_bert(input_test, attention_masks_test, token_type_ids_test)
prelogits_test = outputs_test[1].cpu().numpy()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Sanity check: expect one feature vector per encoded training review.
prelogits_train.shape

(200, 768)

In [12]:
# Sanity check: expect one feature vector per encoded test review.
prelogits_test.shape

(100, 768)

In [27]:
# Display the module tree of the loaded BertModel.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [30]:
from transformers import AutoTokenizer, BertForSequenceClassification

In [38]:
# Load a BERT checkpoint that already carries a sequence-classification head
# (the checkpoint name indicates a Yelp-polarity fine-tune) and configure it to
# also return hidden states.
# NOTE(review): unlike `model`, this one is not moved to `device` here.
model_classif = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity", output_hidden_states=True)

In [33]:
# Display the module tree of the classification model.
model_classif

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:


# Fine-tune the sequence-classification model on the IMDb training reviews.
from transformers import Trainer, TrainingArguments


class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per access.

    Tokenizing lazily avoids materialising padded tensors for the whole corpus.

    Args:
        texts: List of raw review strings.
        labels: Integer class label of each review, aligned with ``texts``.
        tokenizer: Tokenizer used to encode the reviews.
        max_length: Token length every review is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Drop the leading batch axis of size 1 so the collator can stack samples.
        item = {key: value.squeeze(0) for key, value in encoded.items()}
        # The model computes its loss from the `labels` entry.
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item


# Hold out 20% of the training reviews for validation. The texts are ordered
# negative-then-positive, so shuffle (with a fixed seed) before splitting.
VAL_FRACTION = 0.2
SPLIT_SEED = 0
shuffled_indices = np.random.default_rng(SPLIT_SEED).permutation(len(train_texts))
n_val = int(len(shuffled_indices) * VAL_FRACTION)
val_indices, train_indices = shuffled_indices[:n_val], shuffled_indices[n_val:]

train_dataset = ReviewDataset(
    [train_texts[i] for i in train_indices], train_labels[train_indices], tokenizer)
val_dataset = ReviewDataset(
    [train_texts[i] for i in val_indices], train_labels[val_indices], tokenizer)

# Define the training hyperparameters
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=100)

# Train `model_classif`: the bare `BertModel` stored in `model` has no
# classification head and returns no loss, so `Trainer` cannot optimise it.
# NOTE(review): assumes the checkpoint's two classes follow the same
# 0 = negative / 1 = positive convention as `train_labels` - TODO confirm.
trainer = Trainer(
    model=model_classif,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset)

trainer.train()