In [None]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from transformers import BertModel, BertTokenizer
import IPython
import matplotlib
import matplotlib.pyplot as plt
import requests
import torchaudio
import os

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): presumably the dataset directories processed below live on
# Drive -- confirm against the cell that calls process_training_set.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline

In [None]:
# Lazily populated cache holding the single shared image model. Building the
# network once guarantees every image is projected by the SAME output head.
_IMAGE_MODEL_CACHE = {}


def _get_image_model():
    """Build (once) and return the ResNet-18 feature extractor in eval mode."""
    if 'model' not in _IMAGE_MODEL_CACHE:
        # NOTE: `pretrained=True` is kept for compatibility with older
        # torchvision releases; newer ones prefer the `weights=` argument.
        model = models.resnet18(pretrained=True)
        num_features = model.fc.in_features
        head = torch.nn.Linear(num_features, 5)

        # The 5-way head is never trained here, so its weights are random.
        # Draw them from a private, fixed-seed generator so the projection is
        # the same in every kernel session without touching the global RNG.
        # The range matches the U(-1/sqrt(in_features), 1/sqrt(in_features))
        # initialisation documented for nn.Linear.
        generator = torch.Generator().manual_seed(0)
        bound = num_features ** -0.5
        with torch.no_grad():
            head.weight.uniform_(-bound, bound, generator=generator)
            head.bias.uniform_(-bound, bound, generator=generator)

        model.fc = head
        model.eval()
        _IMAGE_MODEL_CACHE['model'] = model
    return _IMAGE_MODEL_CACHE['model']


def process_image(image_path):
    """Extract a 5-dimensional feature vector from an image file.

    The image is resized to 224x224, normalised with the ImageNet mean/std
    and passed through an ImageNet-pretrained ResNet-18 whose final layer is
    replaced by an untrained 5-unit linear head.

    Args:
        image_path: Path to an image file readable by PIL.

    Returns:
        A tensor of shape (1, 5) computed without gradient tracking.
    """
    model = _get_image_model()

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # The context manager closes the underlying file. convert('RGB') forces
    # three channels so grayscale, RGBA and palette images do not break the
    # 3-channel Normalize above.
    with Image.open(image_path) as image:
        input_tensor = transform(image.convert('RGB'))
    input_batch = input_tensor.unsqueeze(0)

    with torch.no_grad():
        output = model(input_batch)

    return output

In [None]:
# Lazily populated cache of (tokenizer, model) pairs keyed by model name, so
# the BERT weights are loaded once instead of on every call.
_TEXT_MODEL_CACHE = {}


def _get_text_model(model_name):
    """Load (once) and return the (tokenizer, model) pair for `model_name`."""
    if model_name not in _TEXT_MODEL_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)
        model.eval()  # inference only: make sure dropout is disabled
        _TEXT_MODEL_CACHE[model_name] = (tokenizer, model)
    return _TEXT_MODEL_CACHE[model_name]


def process_text(text_path):
    """Reduce the text in a file to a 5-dimensional feature vector.

    The file content is tokenised (truncated to 512 tokens), encoded with
    `bert-base-uncased`, and mean-pooled over the token axis. The pooled
    vector is then split into 5 contiguous sections and each section is
    averaged; the last section absorbs any remainder.

    Args:
        text_path: Path to a UTF-8 text file.

    Returns:
        A 1-D tensor with 5 elements. All zeros if the pooled vector is empty
        or shorter than 5 elements.
    """
    tokenizer, model = _get_text_model('bert-base-uncased')

    # Explicit encoding so the result does not depend on the platform default.
    with open(text_path, 'r', encoding='utf-8') as file:
        text = file.read()

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state

    # Mean over the token axis, then drop only the batch axis (size 1 here).
    features = torch.mean(last_hidden_states, dim=1).squeeze(0)

    n_features = 5
    if features.numel() == 0 or features.shape[0] < n_features:
        return torch.zeros(n_features)

    section_length = features.shape[0] // n_features
    reduced_vector = torch.zeros(n_features)
    for i in range(n_features):
        start_index = i * section_length
        # The final section runs to the end so no trailing elements are lost.
        if i == n_features - 1:
            end_index = features.shape[0]
        else:
            end_index = start_index + section_length
        reduced_vector[i] = torch.mean(features[start_index:end_index])

    return reduced_vector

In [None]:
def process_training_set(directory, output_file_path):
    """Write one line of combined image+text features per sample folder.

    Each sub-directory of `directory` is treated as one sample. A sample is
    used only if it contains a file starting with 'photo', a file starting
    with 'text', and a '.txt' file whose name contains 'label'; incomplete
    folders and non-directory entries are skipped silently.

    Each output line holds the image features followed by the text features,
    space separated, and ends with the stripped content of the label file.

    Args:
        directory: Path of the folder that contains the sample sub-folders.
        output_file_path: Path of the text file to (over)write.

    Raises:
        OSError: If `directory` cannot be listed. The output file is left
            untouched in that case.
    """
    # List the input first so a bad `directory` fails before the output file
    # is truncated. Sorting makes the row order reproducible, since
    # os.listdir returns entries in arbitrary order.
    folder_names = sorted(os.listdir(directory))

    with open(output_file_path, 'w') as out_file:
        for folder_name in folder_names:
            folder_path = os.path.join(directory, folder_name)
            if not os.path.isdir(folder_path):
                continue

            # One sorted listing per folder: deterministic choice when several
            # files match, and a single directory scan instead of three.
            entries = sorted(os.listdir(folder_path))
            image_file = next((f for f in entries if f.startswith('photo')), None)
            text_file = next((f for f in entries if f.startswith('text')), None)
            label_file = next((f for f in entries if f.endswith('.txt') and 'label' in f), None)

            if not (image_file and text_file and label_file):
                continue

            image_path = os.path.join(folder_path, image_file)
            text_path = os.path.join(folder_path, text_file)
            label_path = os.path.join(folder_path, label_file)

            # The features are serialised to text straight away, so keep them
            # on the CPU; copying them to a GPU first only adds a round trip.
            image_features = process_image(image_path).detach().cpu()
            text_features = process_text(text_path).detach().cpu()

            combined_features = torch.cat((image_features.squeeze(0), text_features.squeeze(0)), dim=0)

            with open(label_path, 'r') as label_handle:
                label = label_handle.read().strip()

            feature_string = ' '.join(str(value) for value in combined_features.tolist())
            out_file.write(f"{feature_string} {label}\n")
