In [None]:
# Install an "ignore" filter for every warning category for the rest of the
# kernel session (presumably to keep cell outputs readable). Note that this also
# hides deprecation notices emitted by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

In [None]:
pip install pandas numpy seaborn matplotlib scikit-learn torch transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          Trainer,
                          TrainingArguments)
from datasets import Dataset, DatasetDict
from evaluate import load
from transformers import pipeline

In [None]:
# Read the review CSV from the Kaggle input directory, then preview five random rows.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/imbd-reviws-preproced/imdb_reviews_preproced.csv"
)
df.sample(n=5)

Unnamed: 0,review,sentiment
46794,ok let get clear really sci fi reason love sta...,1
2844,never seen show much story mystery suspense ha...,1
3661,okay know like movie pat morita loveable inter...,1
22346,hard film rate truly deserves 3 perhaps even t...,0
13521,since cartoon made old days felix talks using ...,1


In [None]:
# Extract the text column and the target column as plain Python lists.
reviews, labels = (df[column].tolist() for column in ('review', 'sentiment'))

In [None]:
# Hold out 20% of the reviews for validation; the fixed random_state makes the
# split repeatable across runs.
train_reviews, val_reviews, train_labels, val_labels = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the tokenizer that ships with the "distilroberta-base" checkpoint
# (downloads the config/vocab/merges files on first use, as the progress bars show).
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Function for tokenizing the reviews

def tokenize_function(texts):
    """Encode ``texts`` with the notebook-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``, so the encoded sequences come back at one fixed
    length (shorter inputs padded, longer inputs cut off).

    Args:
        texts: The text(s) to encode; passed straight through to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for ``texts``.
    """
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(texts, **encoding_options)

In [None]:
# Encode the training split first, then the validation split, with the same settings.
train_encodings, val_encodings = (
    tokenize_function(split_texts) for split_texts in (train_reviews, val_reviews)
)

In [None]:
# Wrap each split's token ids, attention masks and labels in a Hugging Face
# Dataset, then collect both splits in a DatasetDict.

train_dataset = Dataset.from_dict(
    {
        **{column: train_encodings[column] for column in ('input_ids', 'attention_mask')},
        'labels': train_labels,
    }
)

val_dataset = Dataset.from_dict(
    {
        **{column: val_encodings[column] for column in ('input_ids', 'attention_mask')},
        'labels': val_labels,
    }
)

dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset})

Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)

In [None]:
# Load distilroberta-base with a sequence-classification head for two labels.
# The classifier weights are newly initialized (see the warning printed below),
# so the model has to be fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=2)


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the evaluation metric

def compute_metrics(pred):
    """Compute accuracy for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the arg-max
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        A dict with the single key ``"accuracy"``: the fraction of predicted
        classes that equal their reference label.
    """
    predicted_classes = pred.predictions.argmax(-1)
    is_correct = predicted_classes == pred.label_ids
    return {"accuracy": is_correct.mean()}

In [None]:
# Fine-tuning configuration handed to the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log the training loss every 10 steps
    # NOTE(review): newer transformers releases name this argument `eval_strategy`;
    # confirm against the installed version before renaming.
    evaluation_strategy='epoch',     # evaluate at the end of each epoch
    # Disable external experiment trackers. With the default setting the Trainer
    # starts wandb, which blocks on an interactive API-key prompt and prevents an
    # unattended "Restart & Run All".
    report_to='none',
)

In [None]:
# Build the Trainer from the model, the training arguments, the train/validation
# splits and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()
# training_args sets logging_steps=10, so the training loss is logged many times within each epoch


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112763933336585, max=1.0…

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2344,0.275377,0.8969
2,0.1617,0.217356,0.92


TrainOutput(global_step=2500, training_loss=0.25021154356002806, metrics={'train_runtime': 2310.1197, 'train_samples_per_second': 34.63, 'train_steps_per_second': 1.082, 'total_flos': 1.059739189248e+16, 'train_loss': 0.25021154356002806, 'epoch': 2.0})

In [None]:
# Run a final evaluation pass with the trainer and keep the returned metrics
# (the Trainer was given the validation split as its eval_dataset).
rsultat = trainer.evaluate()


In [None]:
# Show the evaluation metrics dictionary (loss, accuracy, runtime and throughput figures).
print(rsultat)

{'eval_loss': 0.21735620498657227, 'eval_accuracy': 0.92, 'eval_runtime': 86.279, 'eval_samples_per_second': 115.903, 'eval_steps_per_second': 3.628, 'epoch': 2.0}
