<a href="https://colab.research.google.com/github/HoseinNekouei/fine_tuning_with_custom_dataset/blob/main/fine_tuning_imdb_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Download and Extract the Dataset**

In [None]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

# **Load the dataset and split it into training and validation sets**

In [None]:
from posixpath import join
from pathlib import Path

def read_imdb_split(split_dir):
  """Read one split of the extracted IMDB dataset into parallel lists.

  Args:
    split_dir: Path (``str`` or ``Path``) of a directory that contains a
      ``neg`` and a ``pos`` sub-directory holding one review per file.

  Returns:
    A ``(texts, labels)`` tuple. ``texts`` is the list of review strings and
    ``labels`` the matching integer classes: 0 for ``neg``, 1 for ``pos``.
    All ``neg`` reviews come first, each group ordered by file name.
  """
  split_dir = Path(split_dir)
  texts = []
  labels = []

  for label, label_dir in enumerate(['neg', 'pos']):
    # Sort so the result does not depend on the file system's listing order.
    for text_file in sorted((split_dir / label_dir).iterdir()):
      # Skip stray sub-directories (e.g. .ipynb_checkpoints) instead of
      # crashing in read_text().
      if not text_file.is_file():
        continue
      # Decode explicitly as UTF-8 rather than with the platform's default
      # encoding, which differs between operating systems.
      texts.append(text_file.read_text(encoding='utf-8'))
      labels.append(label)

  return texts, labels

# The download cell extracts the archive into the current working directory,
# so resolve the split relative to it instead of hard-coding Colab's /content.
train_texts, train_labels = read_imdb_split('aclImdb/train')
# test_texts, test_labels = read_imdb_split('aclImdb/test')

# One-line summary of how many reviews and labels were loaded.
shape = ','.join([
    f'train text length: {len(train_texts)}',
    f'train labels length: {len(train_labels)}',
    # f'test text length: {len(test_texts)}',
    # f'test labels length: {len(test_labels)}'
])

print(shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training reviews for validation. `stratify` keeps the
# neg/pos ratio the same in both parts, and the fixed `random_state` makes the
# split (and every metric computed from it) reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=42,
)

# One-line summary of the resulting partition sizes.
length = ','.join([
    f'X_train length: {len(X_train)}',
    f'X_val length: {len(X_val)}',
    f'y_train length: {len(y_train)}',
    f'y_val length: {len(y_val)}'
])

print(length)

# **Load Tokenizer**

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the DistilBERT SST-2 checkpoint and show
# it as the cell output.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert/distilbert-base-uncased-finetuned-sst-2-english'
)
tokenizer

In [None]:
# Both partitions are tokenized with identical settings: pad to a common
# length and cut every review off at 256 tokens.
encode_options = dict(padding=True, truncation=True, max_length=256)

train_encodings = tokenizer(X_train, **encode_options)
val_encodings = tokenizer(X_val, **encode_options)
# test_encodings = tokenizer(test_texts, **encode_options)

# **Make Dataset and DataLoader**

In [None]:
import torch

In [None]:
class IMDBDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer encodings with integer labels.

  Args:
    encodings: Mapping from field name to a per-sample indexable sequence
      (anything exposing ``.items()``).
    labels: Sequence of labels, one per sample.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    # The number of samples is defined by the number of labels.
    return len(self.labels)

  def __getitem__(self, idx):
    # Convert every encoded field of sample `idx` to a tensor, then attach
    # the label under the 'labels' key.
    sample = {}
    for field, column in self.encodings.items():
      sample[field] = torch.tensor(column[idx])
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample


# Wrap the tokenized training and validation partitions as datasets.
train_set = IMDBDataset(encodings=train_encodings, labels=y_train)
val_set = IMDBDataset(encodings=val_encodings, labels=y_val)

In [None]:
from torch.utils.data import DataLoader
# Batches of 128 samples; only the training data is reshuffled every epoch.
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=128, shuffle=False)

In [None]:
from transformers import  AutoModelForSequenceClassification
from torch.optim import AdamW, SGD, Adam

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Same checkpoint name the tokenizer was loaded from.
checkpoint = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

# Load the sequence classifier, move it to the chosen device and put it in
# training mode.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
model.train()

# Optimizer in use: Adam over all model parameters.
# Commented-out alternatives kept for reference:
#   optim = AdamW(model.parameters(), lr=5e-5)
#   optim = SGD(model.parameters(), lr=0.001, momentum=0.9)
optim = Adam(params=model.parameters(), lr=5e-5)

In [None]:
# Smoke test: run one training batch through the model and compare the
# arg-max predictions with the true labels.
batch = next(iter(train_loader))
input_ids, attention_mask, labels = (
    batch[field].to(device)
    for field in ('input_ids', 'attention_mask', 'labels')
)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
print(outputs.logits)
y_pred = outputs.logits.argmax(dim=1)

print('y_pred', y_pred)
print('labels', labels)

# Element-wise hit/miss mask and the number of hits in this batch.
acc = y_pred == labels
result = acc.sum().item()
print(acc)
print(result)



In [None]:
# NOTE(review): scratch cell. It only evaluates a Path for '.' (the current
# directory); nothing in the visible notebook uses the value afterwards.
Path('.')

In [None]:
import math
# Fine-tune for a fixed number of epochs. After every epoch the model is
# evaluated on the validation set, and the weights are written to disk whenever
# the validation loss improves on the best value seen so far.
epochs = 3
best_loss = float('inf')

for epoch in range(epochs):
  train_loss, val_loss = 0, 0
  train_acc, val_acc = 0, 0

  # ---- training pass ----
  # The validation pass below switches to eval mode, so training mode
  # (dropout active) has to be re-enabled at the start of every epoch.
  model.train()

  for index, batch in enumerate(train_loader):

    # Using GPU
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Clear gradients first so nothing left over from an earlier cell or
    # iteration leaks into this update.
    optim.zero_grad()

    # model
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    y_pred = torch.argmax(outputs.logits, dim=1)

    # loss (returned by the model because `labels` was passed)
    loss = outputs.loss

    # gradient + update
    loss.backward()
    optim.step()

    # Weight the batch loss by the batch size so that dividing by the dataset
    # size below gives a per-sample average even with a smaller last batch.
    train_loss += loss.item() * labels.size(0)
    train_acc += torch.sum(y_pred == labels).item()

    if (index + 1) % 10 == 0:
      # len(train_loader) is the real batch count, including a partial last
      # batch, so the progress denominator is never one too small.
      print(f' batch: [{index + 1}/{len(train_loader)}]')

  train_loss /= len(train_set)
  train_acc /= len(train_set)

  # ---- validation pass ----
  # Eval mode disables dropout so the validation metrics are deterministic;
  # no_grad avoids building the autograd graph.
  model.eval()
  with torch.no_grad():
    for batch in val_loader:

      # GPU
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      # model
      outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
      y_pred = torch.argmax(outputs.logits, dim=1)

      # loss
      loss = outputs.loss

      val_loss += loss.item() * labels.size(0)
      val_acc += torch.sum(y_pred == labels).item()

  val_loss /= len(val_set)
  val_acc /= len(val_set)

  # One consistently separated summary line per epoch.
  training_result = ', '.join([
    f'Epoch:[{epoch + 1}]',
    f'train_Loss:{train_loss:.3f}',
    f'train_acc:{train_acc:.3f}',
    f'val_Loss:{val_loss:.3f}',
    f'val_acc:{val_acc:.3f}',
  ])

  print(training_result)

  # Checkpoint on improvement. The model object has no `save` method, so the
  # weights are stored with torch.save; reload them with
  # model.load_state_dict(torch.load('imdb_model.pt')).
  if val_loss < best_loss:
    best_loss = val_loss
    torch.save(model.state_dict(), 'imdb_model.pt')
    print('model saved!')


In [None]:
# Put the model into evaluation mode before using it for inference.
model.eval()

# Example inference on already-prepared input tensors (kept commented out):
# with torch.no_grad():
#     outputs = model(input_ids, attention_mask=attention_mask)
#     predictions = torch.argmax(outputs.logits, dim=-1)