# Imports

In [1]:
import pandas as pd

In [2]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import Dataset

In [3]:
from sklearn.model_selection import train_test_split

In [50]:
import torch
# Show the installed PyTorch version so the run environment is recorded with the notebook.
print(torch.__version__)

1.12.1


In [48]:
# Display the CUDA version string reported by this PyTorch build.
# NOTE(review): presumably None on a CPU-only build (no output was captured here) — confirm.
torch.version.cuda

In [45]:
# Pick the compute device once: use the GPU when PyTorch can see one, else the CPU.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")
print(
    "CUDA is available. Training on GPU."
    if cuda_ready
    else "CUDA is not available. Training on CPU."
)


CUDA is not available. Training on CPU.


In [4]:
# Load the movie-review frame from a Feather file; the path is relative to the
# notebook's working directory.
data = pd.read_feather("../data/movie_reviews_4k.feather")

In [5]:
# Sanity check: (rows, columns) of the loaded frame.
data.shape

(4000, 2)

In [6]:
# Preview the loaded frame (columns `text` and `label`).
data

Unnamed: 0,text,label
0,I wanted to vote zero or lower. I loved the co...,0
1,"Karen(Bobbie Phillips)mentions, after one of h...",0
2,This review applies for the cut of the film th...,0
3,"The best film on the battle of San Antonio, Te...",1
4,"In theory, 'Director's Commentary' should have...",0
...,...,...
3995,Excellent show. Instead of watching the same o...,1
3996,"It's hard to believe an ""action"" packed Jet Li...",0
3997,Me and my girlfriend went to see this movie as...,0
3998,This movie is my all time favorite!!! You real...,1


## Define the dataset wrapper and load the tokenizer

In [31]:
class SentimentDataset(Dataset):
    """Map-style dataset that pairs pre-tokenized encodings with labels.

    Args:
        encodings: Mapping from feature name to an indexable collection holding
            one entry per example.
        labels: Indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, as given by the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one tensor per encoded feature, followed by the example's
        label under the ``'labels'`` key.
        """
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [11]:
# Load the pretrained fast tokenizer that matches the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

## Train test split

In [38]:
# Pull the review strings and their labels out of the frame as plain Python lists.
texts, labels = data['text'].tolist(), data['label'].tolist()

# Tokenize every review in one call; sequences are truncated to at most 128 tokens
# and padded to a common length.
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

# Gather the encoded features and labels column-wise so that rows can be split
# together further down.
encoded_columns = {name: encodings[name] for name in ('input_ids', 'attention_mask')}
encoded_columns['labels'] = labels
df_encodings = pd.DataFrame(encoded_columns)

In [54]:
# Row cap for quick smoke-test runs; set to None to split the full dataset.
N_SAMPLES = 100
# Fixed seed so the train/validation split is reproducible across runs.
SPLIT_SEED = 42

split_source = df_encodings if N_SAMPLES is None else df_encodings.head(N_SAMPLES)
# Hold out 20% of the selected rows for validation.
train_df, val_df = train_test_split(split_source, test_size=0.2, random_state=SPLIT_SEED)

In [55]:
def _to_sentiment_dataset(frame):
    """Wrap the encoded columns of ``frame`` in a ``SentimentDataset``."""
    features = {
        column: frame[column].tolist()
        for column in ('input_ids', 'attention_mask')
    }
    return SentimentDataset(features, frame['labels'].tolist())


# Both splits are converted the same way.
train_dataset = _to_sentiment_dataset(train_df)
val_dataset = _to_sentiment_dataset(val_df)

## Load Pretrained model

In [56]:
# Load DistilBERT with a sequence-classification head from the base checkpoint.
# NOTE(review): `num_labels` is left at the library default — presumably 2, which
# would match the binary labels in `data`; confirm.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

loading configuration file config.json from cache at C:\Users\Jan Majewski/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at C:\Users\Jan Majewski/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing Dis

## Set up training hyperparameters

In [59]:
# Hyperparameters for fine-tuning.
training_args = TrainingArguments(
    output_dir='./results',           # where checkpoints and outputs are written
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Warm-up is given as a fraction of the run rather than a fixed 50 steps.
    # The logged run has only 5 optimizer steps in total, so a 50-step warm-up
    # never completed and the learning rate never reached its target value.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    # NOTE(review): with a 5-step run this interval is never reached, so no
    # intermediate training loss is logged — lower it for smoke tests if needed.
    logging_steps=10,
)

# The Trainer ties together the model, the hyperparameters and both data splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## Train and evaluate

In [None]:
# Fine-tune the model on `train_dataset` with the settings in `training_args`.
trainer.train()

***** Running training *****
  Num examples = 80
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5
  Number of trainable parameters = 66955010


In [None]:
# Evaluate on `val_dataset`.
# NOTE(review): no `compute_metrics` function is passed to the Trainer, so this
# presumably reports only the evaluation loss and timing, not accuracy — confirm.
trainer.evaluate()

In [26]:
# Inspect one training example to check the tensors produced by SentimentDataset.
print(train_dataset[0])

{'input_ids': tensor([  101,  1045,  2359,  2000,  3789,  5717,  2030,  2896,  1012,  1045,
         3866,  1996,  8570,  1012,  2009,  2003,  1996,  5409,  3185,  2412,
         2081,  1998,  1005, 16655,  4859, 23086,  1005,  2003,  1996,  3819,
         2773,  2005,  2009,  1010,  4983,  2045,  2003,  2242,  4788,  2008,
        20996, 18150,  2196,  2245,  1997,  1012,  1045,  2572,  2036,  2012,
         1037,  3279,  2000,  2228,  1997,  2505,  4997,  2438,  2000, 14125,
         6235,  8945,  7256,  1012,  1996,  2190,  2008,  2071,  2022,  2056,
         1997,  2014,  2003,  1010,  2016,  1005,  1055,  8335,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

## Make classifications

In [43]:
# Two hand-written example reviews used to try the classifier: the first is
# negative, the second positive. The misspellings are part of the literal input.
pred_sentences  =[ "I absolutely hate this movie, total dissaster", 
                  "Most beaytiful movie ever, I watched it 10 times, very good"]

In [None]:
# Placeholder input: replace the string with the text to classify.
new_texts = ["Your new text for sentiment analysis here."]

# Tokenize into PyTorch tensors with the same 128-token limit used for training.
encoded_input = tokenizer(
    new_texts,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [None]:
# Inference only: switch off dropout so that predictions are deterministic.
model.eval()

# `inputs` was never defined; the tokenized batch lives in `encoded_input`.
# Move it to the model's device so this also works after GPU training.
inputs = {name: tensor.to(model.device) for name, tensor in encoded_input.items()}

with torch.no_grad():  # no gradients are needed for prediction
    outputs = model(**inputs)
    # Turn the raw logits into per-class probabilities along the class axis.
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

In [89]:
def classify_sentences(model, sentences):
    """Classify each sentence as 'Negative' or 'Positive' and print the result.

    Uses the notebook-level ``tokenizer`` and PyTorch, to match the PyTorch model
    loaded in this notebook (TensorFlow is never imported here).

    Args:
        model: Sequence-classification model. It is called with the tokenized
            batch as keyword arguments and must return an object exposing
            ``.logits``; it must also provide ``.eval()`` and ``.device``.
        sentences: Sequence of strings to classify.

    Returns:
        Tuple ``(sentiment_proba, sentiments)``. ``sentiment_proba`` is a tensor
        of per-class probabilities with one row per sentence, and ``sentiments``
        is the list of predicted label names in the same order.
    """
    sentiment_labels = ['Negative', 'Positive']

    # Nothing to classify: return empty results instead of calling the tokenizer.
    if not sentences:
        return torch.empty((0, len(sentiment_labels))), []

    # Tokenize the sentences that were passed in (not a notebook global).
    sentences_tokenized = tokenizer(
        list(sentences),
        max_length=128,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    # Keep the inputs on the same device as the model.
    batch = {name: tensor.to(model.device) for name, tensor in sentences_tokenized.items()}

    model.eval()  # disable dropout for inference
    with torch.no_grad():
        model_outputs = model(**batch)

    sentiment_proba = torch.nn.functional.softmax(model_outputs.logits, dim=-1)
    sentiment_class = sentiment_proba.argmax(dim=-1).tolist()
    sentiments = [sentiment_labels[i] for i in sentiment_class]

    # Print the predicted label for each sentence; indexing the label list by
    # sentence position would be wrong and fails for more than two sentences.
    for sentence, sentiment in zip(sentences, sentiments):
        print(f"{sentence} : \033[1m {sentiment}\033[0m")

    return sentiment_proba, sentiments

In [76]:
# Index of the highest-probability class for each row of `sentiment_proba`.
# NOTE(review): `tf` is not imported anywhere in the visible notebook, and
# `sentiment_proba` is only assigned by the classify_sentences call in a later
# cell, so this cell looks like it relies on stale kernel state — confirm or remove.
tf.argmax(sentiment_proba, axis=1)

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([0, 1], dtype=int64)>

In [97]:
# Classify the example sentences; keeps both the probabilities and the label names.
# NOTE(review): `sentimetns` looks like a misspelling of `sentiments` — rename it
# together with any later uses.
sentiment_proba , sentimetns = classify_sentences(model, pred_sentences)

I absolutely hate this movie, total dissaster : [1m Negative[0m
Most beaytiful movie ever, I watched it 10 times, very good : [1m Positive[0m
