In [1]:
import torch
import torchvision.transforms as transforms
from transformers import AutoTokenizer
from PIL import Image
import pandas as pd

# Load project dataset
df = pd.read_csv("/content/sample_data/multimodal_dataset.csv")

# Define relevant columns based on the dataset
TEXT_COLUMN = "text"
IMAGE_COLUMN = "image_path"
# Kept as a list of column names: the model cell sizes its numeric input layer
# with len(NUMERICAL_COLUMNS), and len() of a bare string would count
# characters (15 for "numeric_feature") instead of columns (1).
NUMERICAL_COLUMNS = ["numeric_feature"]
LABEL_COLUMN = "label"

# Tokenize text data
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_text(text):
    """Encode ``text`` with the notebook-level ``tokenizer``.

    Args:
        text: The string (or batch of strings) to encode.

    Returns:
        The tokenizer output holding PyTorch tensors (for the BERT tokenizer
        used here: ``input_ids``, ``token_type_ids`` and ``attention_mask``).
    """
    encoding_options = {
        "return_tensors": "pt",  # PyTorch tensors rather than Python lists
        "padding": True,
        "truncation": True,
    }
    return tokenizer(text, **encoding_options)

# Preprocess images
def preprocess_image(image_path):
    """Load an image file and convert it into a single-item image batch.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A tensor of shape ``(1, 3, 224, 224)``: the image converted to RGB,
        resized to 224x224, passed through ``ToTensor`` and given a leading
        batch dimension.
    """
    # NOTE(review): no mean/std normalisation is applied here; pretrained
    # torchvision backbones are usually fed normalised inputs - confirm whether
    # that is intended before training.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    # Open inside a context manager so the file handle is released right away;
    # convert() returns an independent image that stays valid after closing.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    return transform(image).unsqueeze(0)

# Example usage: run both preprocessing helpers on the first dataset row
sample_text = df[TEXT_COLUMN].iat[0]
sample_image_path = df[IMAGE_COLUMN].iat[0]

text_tokens = tokenize_text(sample_text)
image_tensor = preprocess_image(sample_image_path)

# Show the image batch shape and the encoded text side by side
print(f"Image Shape: {image_tensor.shape} Text Tokens: {text_tokens}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Image Shape: torch.Size([1, 3, 224, 224]) Text Tokens: {'input_ids': tensor([[ 101, 2023, 2003, 1037, 3861, 1997, 1037, 2482,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [2]:
from transformers import AutoModel, VisionEncoderDecoderModel
import torch
import torch.nn as nn

class MultiModalModel(nn.Module):
    """Classifier that fuses text, image and numeric inputs.

    A Hugging Face text encoder, a torchvision image backbone (with its final
    ``fc`` layer replaced by an identity) and a linear projection of the
    numeric features are concatenated and passed to one linear classifier.

    Args:
        text_model_name: Hugging Face model id passed to ``AutoModel``.
        image_model_name: Entry-point name in the ``pytorch/vision`` hub repo.
            The backbone must expose its classification head as a Linear
            layer named ``fc`` (as ResNet-style models do).
        numerical_input_size: Number of numeric features per sample.
        output_classes: Number of target classes (size of the logits).

    Raises:
        ValueError: If the image backbone has no Linear ``fc`` head.
    """

    # Width of the projected numeric features fed into the classifier.
    NUMERIC_EMBEDDING_SIZE = 128

    def __init__(self, text_model_name, image_model_name, numerical_input_size, output_classes):
        super().__init__()
        # Select appropriate text model
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        # Read the width from the encoder config instead of hard-coding 768,
        # so other text models work as well.
        text_feature_size = self.text_encoder.config.hidden_size

        # Select appropriate image model
        self.image_encoder = torch.hub.load("pytorch/vision", image_model_name, pretrained=True)
        classifier_head = getattr(self.image_encoder, "fc", None)
        if not isinstance(classifier_head, nn.Linear):
            raise ValueError(
                f"Image model '{image_model_name}' has no Linear 'fc' head; "
                "only ResNet-style backbones are supported."
            )
        # Capture the pooled feature width (512 for resnet18) before the head
        # is dropped, instead of hard-coding it.
        image_feature_size = classifier_head.in_features
        self.image_encoder.fc = nn.Identity()

        # Numerical feature processing
        self.fc_numeric = nn.Linear(numerical_input_size, self.NUMERIC_EMBEDDING_SIZE)
        # Final classifier
        self.fc_combined = nn.Linear(
            text_feature_size + image_feature_size + self.NUMERIC_EMBEDDING_SIZE,
            output_classes,
        )

    def forward(self, text_tokens, image_tensor, numerical_data):
        """Return class logits for a batch.

        Args:
            text_tokens: Mapping of tokenizer outputs, unpacked into the text
                encoder as keyword arguments.
            image_tensor: Image batch passed to the image backbone.
            numerical_data: Float tensor whose last dimension equals
                ``numerical_input_size``.

        Returns:
            Tensor of logits with ``output_classes`` columns.
        """
        # Hidden state of the first token summarises each text sequence
        text_features = self.text_encoder(**text_tokens).last_hidden_state[:, 0, :]
        image_features = self.image_encoder(image_tensor)
        numeric_features = self.fc_numeric(numerical_data)
        combined = torch.cat((text_features, image_features, numeric_features), dim=1)
        return self.fc_combined(combined)

# Create the model instance with project-relevant choices
# NUMERICAL_COLUMNS may hold one column name or a list of names. Normalise it
# to a list first: len() of a bare string counts characters, which would size
# the numeric input layer at 15 for "numeric_feature" instead of 1.
numeric_column_names = (
    [NUMERICAL_COLUMNS] if isinstance(NUMERICAL_COLUMNS, str) else list(NUMERICAL_COLUMNS)
)

model = MultiModalModel(
    text_model_name="bert-base-uncased",
    image_model_name="resnet18",
    numerical_input_size=len(numeric_column_names),
    # nunique() counts distinct labels and ignores missing values, so a NaN
    # label is not turned into an extra output class.
    output_classes=df[LABEL_COLUMN].nunique(),
)

print("Model Ready:", model)



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading: "https://github.com/pytorch/vision/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 147MB/s]

Model Ready: MultiModalModel(
  (text_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),




In [3]:
import os
import torch

# Persist the model's current weights (its state dict) to disk.
# NOTE(review): no training step is visible in the cells above, so these are
# presumably the freshly initialised / pretrained weights - confirm.
checkpoint_path = "multi_modal_model.pth"
torch.save(model.state_dict(), checkpoint_path)
print("Model saved successfully")

# Confirm the checkpoint file is actually present on disk
checkpoint_found = os.path.exists(checkpoint_path)
print(
    "Model file found"
    if checkpoint_found
    else "Model file is missing. Train and save it again."
)



Model saved successfully
Model file found


In [4]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.0-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[

In [28]:
%%writefile app.py
import streamlit as st
import torch
from PIL import Image
import torchvision.transforms as transforms
from transformers import AutoTokenizer

# Import or define your MultiModalModel
from model import MultiModalModel  # Replace with actual model import if needed

# Define necessary variables (Replace with actual values)
NUMERICAL_COLUMNS = "numeric_feature"
LABEL_COLUMN = "label"
df = None  # Replace with actual dataframe if used

st.title("Multi-Modal Prediction")

# Load trained model
try:
    model = MultiModalModel(
        text_model_name="bert-base-uncased",
        image_model_name="resnet18",
        numerical_input_size=len(NUMERICAL_COLUMNS),
        output_classes=2  # Replace with actual number of classes
    )
    model.load_state_dict(torch.load("multi_modal_model.pth", map_location=torch.device("cpu")))
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
except Exception as e:
    st.error(f"Error loading model: {e}")
    st.stop()

# Input fields
symptoms = st.text_area("Enter Symptoms")
heart_rate = st.number_input("Heart Rate", min_value=50, max_value=200, value=80)
temperature = st.number_input("Body Temperature", min_value=30.0, max_value=45.0, value=37.0)
image_file = st.file_uploader("Upload Image", type=["jpg", "png"])

if st.button("Predict"):
    if symptoms and image_file:
        try:
            image_path = "uploaded_image.jpg"
            with open(image_path, "wb") as f:
                f.write(image_file.read())

            # Process input
            text_tokens = tokenizer(symptoms, return_tensors="pt", padding=True, truncation=True)
            image = Image.open(image_path).convert("RGB")
            transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor()
            ])
            image_tensor = transform(image).unsqueeze(0)
            numerical_data = torch.tensor([[heart_rate, temperature]], dtype=torch.float32)

            # Model Prediction
            with torch.no_grad():
                prediction = model(text_tokens, image_tensor, numerical_data)
                diagnosis = "Positive" if torch.argmax(prediction, dim=1).item() == 1 else "Negative"

            st.success(f"Prediction: {diagnosis}")
        except Exception as e:
            st.error(f"Error processing input: {e}")
    else:
        st.warning("Please enter symptoms and upload an image")


Overwriting app.py


In [29]:
ls

app.py  multi_modal_model.pth  [0m[01;34msample_data[0m/


In [None]:
!npx localtunnel --port 8501


[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K

In [None]:
!pip install pyngrok
