# Imports 

In [3]:
from glob import glob
import PIL

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, f1_score, log_loss

import torch
from torchvision import transforms as T
from transformers import AutoModelForImageClassification, AutoImageProcessor

# Hugging Face Hub

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Data

In [4]:
train_df = pd.read_csv('../data/train.csv')

In [5]:
# glob, test folder
test_paths = [pth.split("/")[-1] for pth in glob('../data/test/*')]
test_df = pd.DataFrame(sorted(test_paths, key=lambda x: int(x.split(".")[0])), columns=["id"])

### Splits

In [6]:
seed= 42
train_df, validation_df = train_test_split(
    train_df, test_size=0.1, stratify=train_df["label"].values, random_state=seed
)
validation_df, holdout_df = train_test_split(
    validation_df, test_size=0.5, stratify=validation_df["label"].values, random_state=seed
)

In [7]:
train_df.reset_index(drop=True, inplace=True)
validation_df.reset_index(drop=True, inplace=True)
holdout_df.reset_index(drop=True, inplace=True)

### Data loaders

In [8]:
label2id = {"NO_AI": 0, "AI": 1}
id2label = {0: "NO_AI", 1: "AI"}

class data(torch.utils.data.Dataset):
    def __init__(self, train_labels: pd.DataFrame = train_df, split_name:str = 'train', aug_transforms=None):
        self.train_labels = train_labels
        self.index = train_labels.index
        self.split_name = split_name
        self.aug_transforms = aug_transforms
        
    def __len__(self):
        return len(self.index)
    
    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()
            
        try :
            name = self.train_labels.loc[index, "id"]
            if self.split_name == "train":
                label = self.train_labels.loc[index, "label"]    
        except IndexError:
            raise IndexError('Index out of range')
        
        path = f'../data/{self.split_name}/{name}'
        # image = plt.imread(path)
        image = PIL.Image.open(path)
        image = self.aug_transforms(image)
        
        if self.split_name == "train":
            return {"img_path": path, "image": image, "label": label}
        else:
            return {"img_path": path, "image": image}
    
    def info(self):
        print(f'Number of images: {len(self)}')
        print(f'Classes: {self.train_labels["label"].unique()}')
        print(f'Images shape : {self[0][0].shape}')
    

### model

In [9]:
# model_ckpt = "microsoft/swin-tiny-patch4-window7-224"
model_ckpt = "microsoft/beit-base-patch16-224-pt22k-ft22k"
model = AutoModelForImageClassification.from_pretrained(
    model_ckpt, 
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes = True,
)
image_processor = AutoImageProcessor.from_pretrained(model_ckpt)

Some weights of BeitForImageClassification were not initialized from the model checkpoint at microsoft/beit-base-patch16-224-pt22k-ft22k and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([21841, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([21841]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


### transformations

In [10]:
mean, std = image_processor.image_mean, image_processor.image_std
size = image_processor.size["height"]


train_transforms = T.Compose([
    T.RandomResizedCrop(size),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=mean, std=std)
])
test_transforms = T.Compose([
    T.Resize(size),
    T.CenterCrop(size),
    T.ToTensor(),
    T.Normalize(mean=mean, std=std)
])

In [11]:
train_dataset = data(train_df, split_name='train', aug_transforms=train_transforms)
validation_dataset = data(validation_df, split_name='train', aug_transforms=test_transforms)
holdout_dataset = data(holdout_df, split_name='test', aug_transforms=test_transforms)
test_dataset = data(test_df, split_name='test', aug_transforms=test_transforms)

# Training

In [12]:
from transformers import TrainingArguments


model_name = model_ckpt.split("/")[-1]
batch_size = 64

args = TrainingArguments(
    f"{model_name}-aiornot-simple",
    remove_unused_columns=False,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    # learning_rate=5e-5,
    # learning_rate=4e-5,
    learning_rate=7e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    # num_train_epochs=3,
    num_train_epochs=20,
    warmup_ratio=0.1,
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
)


In [13]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    logloss = log_loss(labels, preds)
    return {"accuracy": acc, "f1": f1, "log_loss": logloss}

In [14]:
def collate_fn(examples):
    pixel_values = torch.stack([example["image"] for example in examples])
    try :
        labels = torch.tensor([example["label"] for example in examples])
        return {"pixel_values": pixel_values, "labels": labels}
    except :
        return {"pixel_values": pixel_values}

In [15]:
from transformers import Trainer 

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

/home/studio-lab-user/sagemaker-studiolab-notebooks/projects/hg_airornot/Notebooks/beit-base-patch16-224-pt22k-ft22k-aiornot-simple is already a clone of https://huggingface.co/mustapha/beit-base-patch16-224-pt22k-ft22k-aiornot-simple. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
train_results = trainer.train()

***** Running training *****
  Num examples = 16756
  Num Epochs = 20
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 4
  Total optimization steps = 1300
  Number of trainable parameters = 85763522


Epoch,Training Loss,Validation Loss,Loss,Accuracy,F1,Runtime,Samples Per Second,Steps Per Second
0,0.1811,2.013179,0.143033,0.944146,0.942982,14.1412,65.836,1.061


***** Running Evaluation *****
  Num examples = 931
  Batch size = 64
Saving model checkpoint to beit-base-patch16-224-pt22k-ft22k-aiornot-simple/checkpoint-65
Configuration saved in beit-base-patch16-224-pt22k-ft22k-aiornot-simple/checkpoint-65/config.json
Model weights saved in beit-base-patch16-224-pt22k-ft22k-aiornot-simple/checkpoint-65/pytorch_model.bin
Image processor saved in beit-base-patch16-224-pt22k-ft22k-aiornot-simple/checkpoint-65/preprocessor_config.json
Image processor saved in beit-base-patch16-224-pt22k-ft22k-aiornot-simple/preprocessor_config.json
Deleting older checkpoint [beit-base-patch16-224-pt22k-ft22k-aiornot-simple/checkpoint-1170] due to args.save_total_limit


# Evaluation

In [None]:
trainer.evaluate(holdout_dataset)

In [None]:
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=256, pin_memory=True
)
device = "cuda" if torch.cuda.is_available() else "cpu"
_ = model.to(device)

In [None]:
file_paths = []
# pred_ids = []

for batch in test_loader:
    image_paths = batch["img_path"]
    image_paths = [x.split("/")[-1] for x in image_paths]
    file_paths.extend(image_paths)
    
#     images = batch["image"].to(device)
#     inputs = {"pixel_values": images}
    
#     with torch.no_grad(): 
#         # logits = model(**inputs).logits
#         logits = trainer.predict(**inputs).logits

#     # predictions = logits.argmax(-1).cpu().numpy().tolist()
#     predictions = torch.nn.Softmax(dim=1)(logits)[:,1].cpu().numpy().tolist()
#     pred_ids.extend(predictions)

In [None]:
test_dataset = data(test_df, split_name='test', aug_transforms=test_transforms)

In [None]:
all_predictions = torch.nn.functional.softmax(torch.tensor(trainer.predict(test_dataset).predictions), dim=1)[:,1].cpu().numpy().tolist()

In [None]:
# pred_ids = [1-x for x in pred_ids]

In [None]:
# submission_df = pd.DataFrame({"id": file_paths, "label": pred_ids})
submission_df = pd.DataFrame({"id": file_paths, "label": all_predictions})
submission_df.head()

In [None]:
from datetime import datetime

TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

submission_df.to_csv(f"{TIMESTAMP}.csv", index=False)

In [None]:
trainer.state.log_history

In [None]:
train_results