<a href="https://colab.research.google.com/github/MuhammadIrzam447/MultiModel/blob/master/Train_34.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fine-tune a pretrained Vision Transformer (ViT) for multi-label movie-genre classification.

In [None]:
!pip install transformers evaluate datasets

In [None]:
import requests
import torch
from PIL import Image
from transformers import *
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Checkpoint shared by the image processor and the classifier.
model_name = "google/vit-base-patch16-224"
# Preprocessor paired with the checkpoint; `transform` later uses it to turn images into tensors.
image_processor = ViTImageProcessor.from_pretrained(model_name)
# NOTE(review): `model` is re-created from the same checkpoint (with the genre label set) in the
# "Training the Model" section before it is used, so this instance looks redundant — confirm.
model = ViTForImageClassification.from_pretrained(model_name).to(device)

# Loading a Custom Dataset

Training Dataset

In [None]:
import os

image_folder_add = "/content/Dataset(s)/mmimdb_new/train"
labels_file = "/content/Dataset(s)/mmimdb_new/train_label.txt"

# Parallel lists: image_file_paths[i] is the image whose genres are genre_labels[i].
image_file_paths = []
genre_labels = []

# Each non-empty line is expected to look like "<image filename> | <genre>, <genre>, ...".
with open(labels_file, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            continue  # tolerate blank / trailing empty lines
        parts = line.split('|')
        if len(parts) < 2:
            # Fail with the offending location instead of a bare IndexError.
            raise ValueError(
                f"{labels_file}:{line_number}: expected '<filename> | <labels>', got {line!r}"
            )
        filename = parts[0].strip()
        # Split on commas and strip each label so inconsistent spacing cannot create bogus genres.
        labels = [label.strip() for label in parts[1].split(',') if label.strip()]
        image_path = os.path.join(image_folder_add, filename)
        image_file_paths.append(image_path)
        genre_labels.append(labels)

In [None]:
from collections import defaultdict

# Tally how many training samples carry each genre.
label_counts = defaultdict(int)
for label in (genre for sample_genres in genre_labels for genre in sample_genres):
    label_counts[label] += 1

# (label, count) pairs; the sorted copy lists the most frequent genre first.
label_count_list = list(label_counts.items())
sorted_label_count_list = sorted(label_count_list, key=lambda pair: pair[1], reverse=True)

for label, count in sorted_label_count_list:
    print(f"{label}: {count}")

print("Total Labels: ", len(label_count_list))

Drama: 16848
Comedy: 10216
Romance: 6452
Thriller: 6226
Crime: 4586
Action: 4310
Adventure: 3222
Horror: 3206
Documentary: 2468
Mystery: 2462
Sci-Fi: 2424
Fantasy: 2324
Family: 1956
War: 1612
Biography: 1576
History: 1360
Music: 1268
Animation: 1172
Musical: 1006
Western: 846
Sport: 758
Short: 562
Film-Noir: 404
News: 78
Talk-Show: 4
Reality-TV: 2
Total Labels:  26


In [None]:
# Keep only genres that occur at least `min_label_count` times in the training labels.
# The alphabetical order fixes each genre's position in the multi-hot target vectors.
min_label_count = 400
valid_labels = sorted(
    genre for genre, occurrences in label_counts.items() if occurrences >= min_label_count
)

In [None]:
valid_labels, len(valid_labels)

(['Action',
  'Adventure',
  'Animation',
  'Biography',
  'Comedy',
  'Crime',
  'Documentary',
  'Drama',
  'Family',
  'Fantasy',
  'Film-Noir',
  'History',
  'Horror',
  'Music',
  'Musical',
  'Mystery',
  'Romance',
  'Sci-Fi',
  'Short',
  'Sport',
  'Thriller',
  'War',
  'Western'],
 23)

In [None]:
# Encode every training sample's genres as a multi-hot float vector aligned with `valid_labels`
# (1.0 where the genre applies, 0.0 elsewhere). Genres that are not in `valid_labels` are dropped.
multi_hot_labels = []

for labels in genre_labels:
    present = set(labels)  # constant-time membership tests
    # Use floats for both branches so every vector has one homogeneous dtype.
    multi_hot = [1.0 if label in present else 0.0 for label in valid_labels]
    multi_hot_labels.append(multi_hot)

In [None]:
from datasets import Dataset

# Column-oriented training table: "image" holds file paths (not decoded images),
# "label" holds the multi-hot genre vectors.
train_data = dict(image=image_file_paths, label=multi_hot_labels)
ds_train = Dataset.from_dict(train_data)

In [None]:
ds_train

Dataset({
    features: ['image', 'label'],
    num_rows: 31104
})

Validation Dataset (built from the test split)

In [None]:
import os

image_folder_add = "/content/Dataset(s)/mmimdb_new/test"
labels_file = "/content/Dataset(s)/mmimdb_new/test_label.txt"

# Parallel lists: test_image_file_paths[i] is the image whose genres are test_genre_labels[i].
test_image_file_paths = []
test_genre_labels = []

# Files whose names end with one of these suffixes are left out of the evaluation set.
excluded_suffixes = ("_1.png", "_2.png")

# Each non-empty line is expected to look like "<image filename> | <genre>, <genre>, ...".
with open(labels_file, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            continue  # tolerate blank / trailing empty lines
        parts = line.split('|')
        if len(parts) < 2:
            # Fail with the offending location instead of a bare IndexError.
            raise ValueError(
                f"{labels_file}:{line_number}: expected '<filename> | <labels>', got {line!r}"
            )
        filename = parts[0].strip()
        # Split on commas and strip each label so inconsistent spacing cannot create bogus genres.
        labels = [label.strip() for label in parts[1].split(',') if label.strip()]

        if not filename.endswith(excluded_suffixes):
            image_path = os.path.join(image_folder_add, filename)
            test_image_file_paths.append(image_path)
            test_genre_labels.append(labels)

In [None]:
len(test_image_file_paths)

15598

In [None]:
test_image_file_paths[0:10]

In [None]:
# Encode every evaluation sample's genres as a multi-hot float vector aligned with `valid_labels`
# (1.0 where the genre applies, 0.0 elsewhere). Genres that are not in `valid_labels` are dropped.
test_multi_hot_labels = []

for labels in test_genre_labels:
    present = set(labels)  # constant-time membership tests
    # Use floats for both branches so every vector has one homogeneous dtype.
    multi_hot = [1.0 if label in present else 0.0 for label in valid_labels]
    test_multi_hot_labels.append(multi_hot)

In [None]:
# Column-oriented evaluation table: "image" holds file paths, "label" the multi-hot genre vectors.
val_data = dict(image=test_image_file_paths, label=test_multi_hot_labels)
ds_val = Dataset.from_dict(val_data)

In [None]:
ds_val

Dataset({
    features: ['image', 'label'],
    num_rows: 15598
})

# Just Image Training

In [None]:
# Suffix selection: "_4.png" keeps the plain ("just image") files;
# switch the suffix to "_3.png" to keep the encoded images instead.

import os
def filter_funtion(example):
    """Return True when the example's image file name ends with "_4.png".

    Args:
        example: a dataset row whose "image" entry is either a file path
            (str / os.PathLike, which is what the datasets built in this
            notebook store) or an image object exposing a `.filename` attribute.

    Returns:
        bool: True if the row should be kept by `Dataset.filter`.
    """
    img = example["image"]
    # The "image" column holds path strings, which have no `.filename`;
    # only fall back to the attribute for image objects.
    path = img if isinstance(img, (str, os.PathLike)) else img.filename
    filename = os.path.basename(path)

    return filename.endswith("_4.png")

In [None]:
ds_train = ds_train.filter(filter_funtion)

In [None]:
ds_val = ds_val.filter(filter_funtion)

# Exploring the Data

In [None]:
# labels = ds_train.features["label"]
# labels
labels = valid_labels
labels

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Thriller',
 'War',
 'Western']

In [None]:
# labels.int2str(ds_train[532]["label"])

# Preprocessing the Data

In [None]:
import PIL.Image as pil

def transform(examples):
    """Load a batch of images from disk and preprocess them for the model.

    Args:
        examples: batch dict with "image" (a list of image file paths) and
            "label" (a list of multi-hot label vectors).

    Returns:
        The output of `image_processor` for the batch, with the label vectors
        added under the "labels" key.
    """
    images = []
    for path in examples["image"]:
        # The context manager closes the file handle as soon as the pixels are read;
        # convert() returns a separate, loaded image that stays valid afterwards.
        with pil.open(path) as img:
            images.append(img.convert("RGB"))

    inputs = image_processor(images, return_tensors="pt")
    inputs["labels"] = examples["label"]
    return inputs

In [None]:
# with_transform() registers `transform` to run when examples are accessed, so images are
# loaded and preprocessed on the fly during training/evaluation rather than up front.
train_dataset = ds_train.with_transform(transform)
val_dataset = ds_val.with_transform(transform)

In [None]:
for item in train_dataset:
  print(item["pixel_values"].shape)
  print(item["labels"])
  break

torch.Size([3, 224, 224])
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]


In [None]:
# # extract the labels for our dataset
# labels = ds_train.features["label"].names
# labels

In [None]:
import torch

def collate_fn(batch):
    """Stack individual examples into one batch for the model.

    Args:
        batch: list of dicts, each with "pixel_values" (an image tensor) and
            "labels" (a multi-hot label vector).

    Returns:
        dict with "pixel_values" stacked along a new leading batch dimension and
        "labels" as a float tensor with one row per example.
    """
    return {
        "pixel_values": torch.stack([example["pixel_values"] for example in batch]),
        # Multi-label (BCE-with-logits) training needs floating-point targets, so fix the
        # dtype explicitly instead of relying on inference from the Python values.
        "labels": torch.tensor([example["labels"] for example in batch], dtype=torch.float),
    }

In [None]:
train_dataset

Dataset({
    features: ['image', 'label'],
    num_rows: 31104
})

In [None]:
val_dataset

Dataset({
    features: ['image', 'label'],
    num_rows: 15598
})

# Defining the Metrics

In [None]:
# from evaluate import load
# import numpy as np
# from sklearn.metrics import roc_auc_score

# # load the accuracy and f1 metrics from the evaluate module
# accuracy = load("accuracy")
# f1 = load("f1")

# def compute_metrics(eval_pred):
#   # compute the accuracy and f1 scores & return them
#   accuracy_score = accuracy.compute(predictions=np.argmax(eval_pred.predictions, axis=1), references=eval_pred.label_ids)
#   f1_score = f1.compute(predictions=np.argmax(eval_pred.predictions, axis=1), references=eval_pred.label_ids, average="macro")

#   # auroc_score = roc_auc_score(eval_pred.label_ids, np.argmax(eval_pred.predictions, axis=1))
#   # print(f"AUROC Score: {auroc_score:.4f}")

#   return {**accuracy_score, **f1_score}

In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred, threshold=0.5):
    """Compute multi-label classification metrics from raw logits.

    Args:
        eval_pred: object exposing `.predictions` (logits, one column per label)
            and `.label_ids` (multi-hot targets of the same shape).
        threshold: sigmoid probability above which a label counts as predicted.

    Returns:
        dict with:
            "accuracy": exact-match (subset) accuracy — a sample only counts as
                correct when every one of its labels is predicted correctly, so
                this value is much stricter than per-label accuracy.
            "f1": macro-averaged F1 over the labels.
    """
    logits = torch.as_tensor(eval_pred.predictions)
    # Targets arrive as floats (0.0 / 1.0); cast so targets and predictions are both
    # integer indicator matrices.
    label_ids = np.asarray(eval_pred.label_ids).astype(int)

    sigmoid_predictions = torch.sigmoid(logits)
    thresholded_predictions = (sigmoid_predictions > threshold).cpu().numpy().astype(int)

    accuracy = accuracy_score(label_ids, thresholded_predictions)
    # zero_division=0 scores a label with no predicted/true positives as 0 (the same value
    # as the default) without emitting a warning on every evaluation.
    f1 = f1_score(label_ids, thresholded_predictions, average="macro", zero_division=0)

    return {"accuracy": accuracy, "f1": f1}


# Training the Model

In [None]:
# Load the pretrained ViT and replace its classification head with one sized for the genre set.
# The label maps use integer class indices (id2label: int -> name, label2id: name -> int),
# so a predicted index can be looked up directly in `model.config.id2label`.
model = ViTForImageClassification.from_pretrained(
    model_name,
    num_labels=len(valid_labels),
    label2id={label: i for i, label in enumerate(valid_labels)},
    id2label={i: label for i, label in enumerate(valid_labels)},
    # multi-label setup: each genre is an independent yes/no decision
    problem_type="multi_label_classification",
    # the checkpoint's classifier head has a different number of outputs; re-initialise it
    ignore_mismatched_sizes=True,
)

In [None]:
# !pip install accelerate -U

In [None]:
# !pip install transformers[torch]

In [None]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run.
# NOTE(review): no per_device_eval_batch_size is set; the evaluation log reports batch size 8.
training_args = TrainingArguments(
  output_dir="/content/Model/Models-Train-34", # directory where checkpoints are written
  per_device_train_batch_size=32, # batch size per device during training
  evaluation_strategy="steps",    # evaluate on a step schedule (see eval_steps)
  num_train_epochs=25,             # total number of training epochs
  # fp16=True,                    # use mixed precision (currently disabled)
  save_steps=4000,                # number of update steps between checkpoint saves
  eval_steps=4000,                # number of update steps between evaluations
  logging_steps=4000,             # number of update steps between log entries
  # Alternative short schedule, currently disabled:
  # save_steps=50,
  # eval_steps=50,
  # logging_steps=50,
  save_total_limit=4,             # limit the total amount of checkpoints on disk
  remove_unused_columns=False,    # keep the raw "image"/"label" columns: the on-the-fly `transform` reads them
  push_to_hub=False,              # do not push the model to the hub
  report_to='tensorboard',        # report metrics to tensorboard
  load_best_model_at_end=True,    # load the best model at the end of training
)


Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


In [None]:
train_dataset

Dataset({
    features: ['image', 'label'],
    num_rows: 31104
})

In [None]:
val_dataset

Dataset({
    features: ['image', 'label'],
    num_rows: 15598
})

In [None]:
from transformers import Trainer

# Wire the model, data pipeline and metrics into the Trainer.
trainer = Trainer(
    model=model,                        # the instantiated 🤗 Transformers model to be trained
    args=training_args,                 # training arguments, defined above
    data_collator=collate_fn,           # stacks per-example "pixel_values" / "labels" into batch tensors
    compute_metrics=compute_metrics,    # the metrics function that will be used for evaluation
    train_dataset=train_dataset,        # training dataset (preprocessed lazily by `transform`)
    eval_dataset=val_dataset,           # evaluation dataset (preprocessed lazily by `transform`)
    tokenizer=image_processor,          # passed so the image processor is saved with each checkpoint; preprocessing itself happens in `transform`
)

In [None]:
# start training
trainer.train()

***** Running training *****
  Num examples = 31,104
  Num Epochs = 25
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 24,300
  Number of trainable parameters = 85,816,343


Step,Training Loss,Validation Loss,Accuracy,F1
4000,0.1979,0.247535,0.134697,0.341829
8000,0.0881,0.336976,0.124567,0.361222
12000,0.0224,0.463562,0.126683,0.353561
16000,0.0035,0.55839,0.123926,0.353041
20000,0.0016,0.608531,0.125016,0.352843


***** Running Evaluation *****
  Num examples = 15598
  Batch size = 8
Saving model checkpoint to /content/Model/Models-Train-34/checkpoint-4000
Configuration saved in /content/Model/Models-Train-34/checkpoint-4000/config.json
Model weights saved in /content/Model/Models-Train-34/checkpoint-4000/pytorch_model.bin
Image processor saved in /content/Model/Models-Train-34/checkpoint-4000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 15598
  Batch size = 8
Saving model checkpoint to /content/Model/Models-Train-34/checkpoint-8000
Configuration saved in /content/Model/Models-Train-34/checkpoint-8000/config.json
Model weights saved in /content/Model/Models-Train-34/checkpoint-8000/pytorch_model.bin
Image processor saved in /content/Model/Models-Train-34/checkpoint-8000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 15598
  Batch size = 8
Saving model checkpoint to /content/Model/Models-Train-34/checkpoint-12000
Configuration saved in /conten

Buffered data was truncated after reaching the output size limit.