In [1]:
# Mount Google Drive at /content/drive; later cells read the fine-tuned checkpoint
# from it and write the submission file back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import sys
import numpy as np
import pandas as pd
import torch
import random
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from PIL import Image
from pathlib import Path

In [3]:
# Environment setup: silence warnings, expose only GPU 0, and pick the compute device.
warnings.filterwarnings("ignore")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Release any cached GPU memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()
print("✅ Using device:", device)

✅ Using device: cuda


In [4]:
!git clone https://github.com/Meituan-AutoML/MobileVLM.git

Cloning into 'MobileVLM'...
remote: Enumerating objects: 212, done.[K
remote: Counting objects: 100% (84/84), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 212 (delta 61), reused 41 (delta 41), pack-reused 128 (from 1)[K
Receiving objects: 100% (212/212), 474.57 KiB | 4.06 MiB/s, done.
Resolving deltas: 100% (97/97), done.


In [5]:
# Work from the cloned repository and make its `mobilevlm` package importable.
repo_dir = '/content/MobileVLM'
os.chdir(repo_dir)
if repo_dir not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.append(repo_dir)

In [None]:
# Folder that the relative `img_path` values in the CSV are resolved against.
img_folder = '/path/to/lib'

# Test questions: each row carries an image path, a question and the options A-D.
test_dataset = pd.read_csv('/path/to/lib/test.csv')

In [7]:
# Peek at the first test row to check the column layout.
test_dataset.iloc[0]

Unnamed: 0,0
ID,TEST_000
img_path,./test_input_images/TEST_000.jpg
Question,What types of fruits are visible in the image?
A,Bananas and grapes placed in baskets
B,Apples and oranges displayed on the counter
C,Peaches and plums in a wooden crate
D,Pears and lemons arranged neatly


In [8]:
# Instruction text prepended to every question; it asks the model to reply with a
# single option letter so the decoded output can be used directly as the answer.
system_message = """You are a multimodal assistant specialized in answering visual multiple-choice questions.
Given an image and a related question with options A, B, C, and D, select the single best answer based solely on the visual content and question.
Answer only with one of: A, B, C, or D. Do not provide explanations unless explicitly requested."""

In [9]:
# LoRA fine-tuned checkpoint directory (alternative: the base model "mtgv/MobileVLM-3B").
model_path = "/content/drive/MyDrive/dacon/SCPC/MobileVLM/mobilevlm-2.finetune-lora"

# Preview the image path and the text prompt for one sample row (index 3).
sample_row = test_dataset.iloc[3]

# img_path looks like './dir/file.jpg'; stripping the dots leaves '/dir/file.jpg',
# which is appended to the image folder.
image_file = img_folder + sample_row['img_path'].strip('.')

option_lines = []
for letter in ['A', 'B', 'C', 'D']:
    option_lines.append(f"{letter}. {sample_row[letter]}")

prompt_str = system_message + "\n" + f"Question: {sample_row['Question']}\n" + "\n".join(option_lines) + "\nAnswer:"

print(image_file)
print(prompt_str)

/content/drive/MyDrive/dacon/SCPC/test_input_images/TEST_003.jpg
You are a multimodal assistant specialized in answering visual multiple-choice questions.
Given an image and a related question with options A, B, C, and D, select the single best answer based solely on the visual content and question.
Answer only with one of: A, B, C, or D. Do not provide explanations unless explicitly requested.
Question: What type of clothing are the people wearing, and what does it suggest about the climate?
A. Light clothing, suggesting a warm and sunny climate
B. Swimwear, suggesting a tropical and beach-like climate
C. Heavy coats, suggesting a cold and possibly snowy climate
D. Raincoats, suggesting rainy and wet weather conditions
Answer:


In [10]:
from mobilevlm.model.mobilevlm import load_pretrained_model
from mobilevlm.conversation import conv_templates, SeparatorStyle
from mobilevlm.utils import disable_torch_init, process_images, tokenizer_image_token, KeywordsStoppingCriteria
from mobilevlm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

In [11]:
from transformers import AutoTokenizer, AutoConfig, BitsAndBytesConfig
from mobilevlm.model.mobilellama import MobileLlamaForCausalLM
from peft import PeftModel

def load_pretrained_model(model_path, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", model_base="mtgv/MobileVLM-3B"):
    """Load the base MobileVLM model, apply the LoRA adapter in ``model_path`` and merge it.

    Args:
        model_path: Directory with the LoRA fine-tune output (the adapter and,
            optionally, ``non_lora_trainables.bin``).
        load_8bit: Load the base model with ``load_in_8bit=True``.
        load_4bit: Load the base model in 4-bit NF4; only used when ``load_8bit`` is False.
        device_map: Forwarded to ``from_pretrained`` as ``device_map``.
        device: Device the vision tower is moved to.
        model_base: Hub id or local path of the base checkpoint and tokenizer.

    Returns:
        Tuple ``(tokenizer, model, image_processor, context_len)``.
    """
    kwargs = {"device_map": device_map}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    # NOTE(review): this config is loaded but never passed to from_pretrained below;
    # confirm whether the base model should be built with it.
    lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
    model = MobileLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
    token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
    if model.lm_head.weight.shape[0] != token_num:
        model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
        model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))

    # Weights trained outside the LoRA adapter are optional. Everything that touches
    # them stays inside this branch so a checkpoint without the file still loads.
    non_lora_path = os.path.join(model_path, 'non_lora_trainables.bin')
    if os.path.exists(non_lora_path):
        # torch.load unpickles the file; only point model_path at checkpoints you trust.
        non_lora_trainables = torch.load(non_lora_path, map_location='cpu')
        # Drop the 'base_model.' prefix, then one leading 'model.' when keys are
        # nested as 'model.model.*', so the names match this model's state dict.
        non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
        if any(k.startswith('model.model.') for k in non_lora_trainables):
            non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
        model.load_state_dict(non_lora_trainables, strict=False)
    else:
        print(f'No non_lora_trainables.bin found in {model_path}; skipping non-LoRA weights.')

    model = PeftModel.from_pretrained(model, model_path)
    print('Merging LoRA weights...')
    model = model.merge_and_unload()
    print('Model is loaded...')

    # Register the image placeholder tokens requested by the model config.
    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
    if mm_use_im_patch_token:
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
    if mm_use_im_start_end:
        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
    model.resize_token_embeddings(len(tokenizer))

    vision_tower = model.get_vision_tower()
    if 'v2' in getattr(model.config, "mm_projector_type", "ldpnet"):
        vision_tower.load_image_processor()
    elif not vision_tower.is_loaded:
        vision_tower.load_model()
    vision_tower.to(device=device, dtype=torch.float16)
    image_processor = vision_tower.image_processor

    # Fall back to 2048 when the config does not declare a maximum sequence length.
    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, image_processor, context_len

In [12]:
def _build_prompt(row):
    """Format one dataset row as text: system message, question, options A-D, 'Answer:'."""
    option_lines = "\n".join(f"{letter}. {row[letter]}" for letter in ['A', 'B', 'C', 'D'])
    return system_message + "\n" + f"Question: {row['Question']}\n" + option_lines + "\nAnswer:"


def inference(args):
    """Run the model over every row of the global ``test_dataset`` and collect its answers.

    Reads the notebook-level ``test_dataset`` DataFrame and ``system_message`` string.

    Args:
        args: Object with attributes ``model_path``, ``load_8bit``, ``load_4bit``,
            ``image_folder``, ``conv_mode``, ``temperature``, ``top_p``,
            ``num_beams`` and ``max_new_tokens``.

    Returns:
        List of decoded, stripped output strings, one per dataset row, in row order.
    """
    disable_torch_init()
    tokenizer, model, image_processor, _ = load_pretrained_model(args.model_path, args.load_8bit, args.load_4bit)

    # Temperature is only forwarded when sampling is enabled, so greedy runs
    # (temperature == 0) never hand generate() a zero temperature, while a
    # non-zero temperature is actually applied instead of being ignored.
    do_sample = args.temperature > 0
    sampling_kwargs = {"temperature": args.temperature} if do_sample else {}

    results = []
    for _, row in tqdm(test_dataset.iterrows(), total=len(test_dataset)):
        # img_path looks like './dir/file.jpg'; stripping the dots leaves '/dir/file.jpg',
        # which is appended to the image folder.
        image_path = args.image_folder + row['img_path'].strip('.')
        # convert() returns a fully loaded copy, so the file handle can be closed right away.
        with Image.open(image_path) as image_file:
            images = [image_file.convert("RGB")]
        images_tensor = process_images(images, image_processor, model.config).to(model.device, dtype=torch.float16)

        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + _build_prompt(row))
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        # Input: keep the token ids on the same device as the model and the image tensor.
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)
        stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
        # Inference
        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=images_tensor,
                do_sample=do_sample,
                top_p=args.top_p,
                num_beams=args.num_beams,
                max_new_tokens=args.max_new_tokens,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                **sampling_kwargs,
            )
        # Result-Decode: the output is expected to start with the prompt tokens;
        # warn when it does not, then decode only the newly generated part.
        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids")
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[: -len(stop_str)]
        results.append(outputs.strip())

    return results


In [13]:
# Inference settings, exposed as attributes of a throwaway `Args` instance.
arg_values = dict(
    model_path=model_path,
    image_folder=img_folder,
    conv_mode="v1",
    temperature=0,  # 0 makes inference() turn sampling off
    top_p=None,
    num_beams=1,
    max_new_tokens=10,
    load_8bit=False,
    load_4bit=False,
)
args = type('Args', (), arg_values)()

# Load the model and answer every test question.
results = inference(args)

tokenizer_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.07G [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.07G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Merging LoRA weights...
Model is loaded...


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

MobileLlamaForCausalLM(
  (model): MobileLlamaModel(
    (embed_tokens): Embedding(32000, 2560, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear(in_features=2560, out_features=2560, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2560,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((2560,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((2560,), 

852it [05:48,  2.45it/s]


In [14]:
# Raw decoded model outputs, one string per test row.
results

['B',
 'A',
 'B',
 'C',
 'A',
 'C',
 'A',
 'D',
 'A',
 'B',
 'B',
 'B',
 'A',
 'C',
 'A',
 'A',
 'B',
 'A',
 'A',
 'C',
 'A',
 'C',
 'A',
 'A',
 'A',
 'A',
 'B',
 'C',
 'A',
 'C',
 'A',
 'C',
 'C',
 'A',
 'A',
 'D',
 'A',
 'D',
 'A',
 'A',
 'C',
 'A',
 'A',
 'C',
 'B',
 'B',
 'D',
 'C',
 'A',
 'C',
 'B',
 'D',
 'C',
 'A',
 'C',
 'C',
 'C',
 'B',
 'A',
 'C',
 'C',
 'C',
 'C',
 'C',
 'A',
 'B',
 'A',
 'A',
 'B',
 'D',
 'C',
 'C',
 'A',
 'C',
 'A',
 'A',
 'A',
 'A',
 'A',
 'C',
 'C',
 'D',
 'A',
 'D',
 'A',
 'D',
 'A',
 'A',
 'B',
 'D',
 'A',
 'D',
 'C',
 'B',
 'A',
 'A',
 'A',
 'A',
 'B',
 'D',
 'A',
 'D',
 'C',
 'A',
 'C',
 'A',
 'B',
 'D',
 'D',
 'C',
 'C',
 'B',
 'C',
 'D',
 'D',
 'C',
 'C',
 'A',
 'A',
 'A',
 'B',
 'A',
 'C',
 'B',
 'D',
 'A',
 'D',
 'D',
 'A',
 'D',
 'D',
 'A',
 'A',
 'C',
 'D',
 'D',
 'B',
 'C',
 'D',
 'A',
 'C',
 'A',
 'D',
 'A',
 'A',
 'A',
 'B',
 'C',
 'C',
 'A',
 'C',
 'B',
 'A',
 'D',
 'C',
 'C',
 'D',
 'A',
 'C',
 'B',
 'D',
 'A',
 'C',
 'A',
 'B',
 'A',
 'B'

In [15]:
# Keep only exact option letters. Anything else (multi-character text, an empty
# output, or a single character outside A-D) becomes '?' so invalid answers are
# easy to spot in the submission.
valid_answers = {'A', 'B', 'C', 'D'}
results = [answer if answer in valid_answers else '?' for answer in results]

In [16]:
# Show the answers again after invalid outputs were replaced with '?'.
results

['B',
 'A',
 'B',
 'C',
 'A',
 'C',
 'A',
 'D',
 'A',
 'B',
 'B',
 'B',
 'A',
 'C',
 'A',
 'A',
 'B',
 'A',
 'A',
 'C',
 'A',
 'C',
 'A',
 'A',
 'A',
 'A',
 'B',
 'C',
 'A',
 'C',
 'A',
 'C',
 'C',
 'A',
 'A',
 'D',
 'A',
 'D',
 'A',
 'A',
 'C',
 'A',
 'A',
 'C',
 'B',
 'B',
 'D',
 'C',
 'A',
 'C',
 'B',
 'D',
 'C',
 'A',
 'C',
 'C',
 'C',
 'B',
 'A',
 'C',
 'C',
 'C',
 'C',
 'C',
 'A',
 'B',
 'A',
 'A',
 'B',
 'D',
 'C',
 'C',
 'A',
 'C',
 'A',
 'A',
 'A',
 'A',
 'A',
 'C',
 'C',
 'D',
 'A',
 'D',
 'A',
 'D',
 'A',
 'A',
 'B',
 'D',
 'A',
 'D',
 'C',
 'B',
 'A',
 'A',
 'A',
 'A',
 'B',
 'D',
 'A',
 'D',
 'C',
 'A',
 'C',
 'A',
 'B',
 'D',
 'D',
 'C',
 'C',
 'B',
 'C',
 'D',
 'D',
 'C',
 'C',
 'A',
 'A',
 'A',
 'B',
 'A',
 'C',
 'B',
 'D',
 'A',
 'D',
 'D',
 'A',
 'D',
 'D',
 'A',
 'A',
 'C',
 'D',
 'D',
 'B',
 'C',
 'D',
 'A',
 'C',
 'A',
 'D',
 'A',
 'A',
 'A',
 'B',
 'C',
 'C',
 'A',
 'C',
 'B',
 'A',
 'D',
 'C',
 'C',
 'D',
 'A',
 'C',
 'B',
 'D',
 'A',
 'C',
 'A',
 'B',
 'A',
 'B'

In [None]:
# Make sure the output folder exists. exist_ok keeps this cell safe to re-run when
# the folder is already there, and makedirs also creates missing parent folders.
os.makedirs('/content/drive/MyDrive/dacon/SCPC/MobileVLM', exist_ok=True)

In [17]:
# Fill the sample submission's `answer` column with the predictions and save it.
sample_submission_path = '/content/drive/MyDrive/dacon/SCPC/sample_submission.csv'
submission_path = '/content/drive/MyDrive/dacon/SCPC/MobileVLM/MobileVLM_submit3.csv'

submission = pd.read_csv(sample_submission_path).assign(answer=results)
submission.to_csv(submission_path, index=False)
print("✅ Done.")

✅ Done.
