In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vietnamese-foods/LICENSE
/kaggle/input/vietnamese-foods/Urls/Banh gio.txt
/kaggle/input/vietnamese-foods/Urls/Bun bo Hue.txt
/kaggle/input/vietnamese-foods/Urls/Banh beo.txt
/kaggle/input/vietnamese-foods/Urls/Banh khot.txt
/kaggle/input/vietnamese-foods/Urls/Banh canh.txt
/kaggle/input/vietnamese-foods/Urls/Banh trang nuong.txt
/kaggle/input/vietnamese-foods/Urls/Banh can.txt
/kaggle/input/vietnamese-foods/Urls/Bun rieu.txt
/kaggle/input/vietnamese-foods/Urls/Cao lau.txt
/kaggle/input/vietnamese-foods/Urls/Banh tet.txt
/kaggle/input/vietnamese-foods/Urls/Hu tieu.txt
/kaggle/input/vietnamese-foods/Urls/Bun thit nuong.txt
/kaggle/input/vietnamese-foods/Urls/Ca kho to.txt
/kaggle/input/vietnamese-foods/Urls/Com tam.txt
/kaggle/input/vietnamese-foods/Urls/Pho.txt
/kaggle/input/vietnamese-foods/Urls/Goi cuon.txt
/kaggle/input/vietnamese-foods/Urls/Bun dau mam tom.txt
/kaggle/input/vietnamese-foods/Urls/Xoi xeo.txt
/kaggle/input/vietnamese-foods/Urls/Chao long.txt
/kaggle/inpu

In [14]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import (CenterCrop, Compose, Normalize, RandomHorizontalFlip,
                                    RandomResizedCrop, Resize, ToTensor)
from transformers import ViTForImageClassification, ViTImageProcessor, TrainingArguments, Trainer
import pandas as pd
from PIL import Image
import numpy as np
import os

In [15]:
TRAIN_PATH_30VNFOODS = '/kaggle/input/vietnamese-foods/Images/Train'
VALIDATE_PATH_30VNFOODS = '/kaggle/input/vietnamese-foods/Images/Validate'
TEST_PATH_30VNFOODS = '/kaggle/input/vietnamese-foods/Images/Test'

TRAIN_PATH_CUSTOMDATA = '/kaggle/input/custom-food-dataset/FOOD-DATASET/TRAIN'
VALIDATE_PATH_CUSTOMDATA = '/kaggle/input/custom-food-dataset/FOOD-DATASET/VAL'
TEST_PATH_CUSTOMDATA = '/kaggle/input/custom-food-dataset/FOOD-DATASET/TEST'

In [16]:
def create_dataframe(base_path):
    data = []
    for class_name in os.listdir(base_path):
        class_path = os.path.join(base_path, class_name)
        if os.path.isdir(class_path):
            for img_name in os.listdir(class_path):
                img_path = os.path.join(class_path, img_name)
                if os.path.isfile(img_path):
                    data.append([img_path, class_name])
    return pd.DataFrame(data, columns=['filename', 'class'])

In [17]:
# Merge DataFrames
df_train = pd.concat([create_dataframe(TRAIN_PATH_30VNFOODS), create_dataframe(TRAIN_PATH_CUSTOMDATA)])
df_validate = pd.concat([create_dataframe(VALIDATE_PATH_30VNFOODS), create_dataframe(VALIDATE_PATH_CUSTOMDATA)])
df_test = pd.concat([create_dataframe(TEST_PATH_30VNFOODS), create_dataframe(TEST_PATH_CUSTOMDATA)])

# Reset indices
df_train.reset_index(drop=True, inplace=True)
df_validate.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [18]:
# Get unique classes
all_classes = pd.concat([df_train['class'], df_validate['class'], df_test['class']]).unique()
num_labels = len(all_classes)
id2label = {id:label for id, label in enumerate(all_classes)}
label2id = {label:id for id, label in id2label.items()}

In [19]:
model_name = "google/vit-large-patch16-384"
processor = ViTImageProcessor.from_pretrained(model_name)
model = ViTForImageClassification.from_pretrained(model_name,
                                                  num_labels=num_labels,
                                                  id2label=id2label,
                                                  label2id=label2id,
                                                  ignore_mismatched_sizes=True)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-384 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 1024]) in the checkpoint and torch.Size([38, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([38]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from torchvision.transforms import RandomHorizontalFlip

class FoodDataset(Dataset):
    def __init__(self, dataframe, processor, is_train=True):
        self.dataframe = dataframe
        self.processor = processor
        self.is_train = is_train
        self.random_flip = RandomHorizontalFlip(p=0.5) if is_train else None

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['filename']
        image = Image.open(img_path).convert('RGB')

        # Apply random flip for training if enabled
        if self.is_train and self.random_flip:
            image = self.random_flip(image)

        # Use processor for preprocessing
        inputs = self.processor(images=image, return_tensors="pt")
        inputs = {k: v.squeeze() for k, v in inputs.items()}

        label = label2id[self.dataframe.iloc[idx]['class']]
        inputs['labels'] = torch.tensor(label)

        return inputs

In [21]:
# Create datasets
train_dataset = FoodDataset(df_train, processor, is_train=True)
val_dataset = FoodDataset(df_validate, processor, is_train=False)

In [22]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

In [23]:
training_args = TrainingArguments(
    output_dir="./vit-l-16-food-classifier",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
    report_to='none',
    gradient_accumulation_steps=4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [24]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.428266,0.893362
2,0.798200,0.344351,0.908596
3,0.798200,0.319963,0.915851


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=879, training_loss=0.49830239461957393, metrics={'train_runtime': 14698.733, 'train_samples_per_second': 3.827, 'train_steps_per_second': 0.06, 'total_flos': 4.534902001025306e+19, 'train_loss': 0.49830239461957393, 'epoch': 3.0})

In [25]:
!pip install evaluate

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [26]:
import evaluate

metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    
    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "precision": precision["precision"],
        "recall": recall["recall"]
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [27]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value}")

# Get predictions on the validation set
predictions = trainer.predict(val_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluation Results:
eval_loss: 0.3199634552001953
eval_accuracy: 0.9158505622052956
eval_runtime: 268.3379
eval_samples_per_second: 10.274
eval_steps_per_second: 0.645
epoch: 3.0


In [28]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

In [29]:
# Generate classification report
class_names = list(label2id.keys())
report = classification_report(y_true, y_pred, target_names=class_names)
print("\nClassification Report:")
print(report)

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')

# Use the actual class names for tick labels
plt.xticks(np.arange(len(class_names)) + 0.5, class_names, rotation=45, ha='right')
plt.yticks(np.arange(len(class_names)) + 0.5, class_names, rotation=0)

plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.close()

print("Confusion matrix saved as 'confusion_matrix.png'")

# If you want to print the confusion matrix values
print("\nConfusion Matrix:")
print(cm)


Classification Report:
                  precision    recall  f1-score   support

       Banh cuon       0.92      0.94      0.93       114
        Mi quang       0.92      0.98      0.95        89
Banh trang nuong       0.94      0.94      0.94        80
        Goi cuon       0.92      0.94      0.93        85
        Banh gio       0.88      0.91      0.89        64
         Bun mam       0.95      0.90      0.92        77
       Canh chua       0.88      0.96      0.92        83
        Nem chua       0.90      0.85      0.88        54
        Bun rieu       0.94      0.83      0.88       115
        Banh duc       0.80      0.80      0.80        66
        Banh pia       0.98      0.98      0.98        45
       Banh canh       0.83      0.82      0.83        97
       Banh khot       0.90      0.95      0.92        84
    Banh bot loc       0.92      0.84      0.88        73
        Banh can       0.96      0.92      0.94        75
  Bun thit nuong       0.92      0.95      0.93