In [None]:
!nvidia-smi
!nvcc --version

Thu Nov  7 19:08:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preparation

In [None]:
import os
import random
import re

import pandas as pd

# Seed intents, each paired with its (Canny, Mask2Former, DepthMap) target weights
_WEIGHT_COLUMNS = ("Canny Weight", "Mask2Former Weight", "DepthMap Weight")
_SEED_ROWS = (
    ("I need a sharp edge image of a rock", (0.8, 0.15, 0.05)),
    ("I want to segment different objects", (0.2, 0.7, 0.1)),
    ("Generate a depth map for this scene", (0.1, 0.2, 0.7)),
    ("Create an image with clear boundaries", (0.75, 0.2, 0.05)),
    ("Highlight regions of interest", (0.3, 0.6, 0.1)),
    ("Show the 3D structure of the object", (0.1, 0.2, 0.7)),
    ("Detect edges in the image", (0.85, 0.1, 0.05)),
    ("Segment out the background", (0.2, 0.75, 0.05)),
    ("Give me a depth perspective", (0.05, 0.1, 0.85)),
    ("I want accurate outlines of objects", (0.7, 0.25, 0.05)),
)

# One dict per seed row; "Input" comes first, followed by the three weight columns
base_examples = [
    {"Input": intent_text, **dict(zip(_WEIGHT_COLUMNS, intent_weights))}
    for intent_text, intent_weights in _SEED_ROWS
]

# Function to systematically adjust weights based on input patterns
def generate_logical_variations(example, num_variations=100):
    """Create jittered, normalised weight variations for one example intent.

    The intent text in ``example["Input"]`` is matched (case-insensitively)
    against a small set of keyword rules. The first matching rule supplies the
    centre value and jitter half-width for each of the three weights; if no
    rule matches, a generic fallback profile is used. Every variation draws a
    fresh uniform jitter per weight and is then rescaled so the three weights
    sum to 1.

    Differences from the earlier implementation:
      * Matching is case-insensitive, so "Segment out the background" now hits
        the segmentation rule instead of the fallback.
      * The stem "boundar" is used, so "boundaries" matches as well as
        "boundary".
      * The boundary/outline rule is checked before the segment/objects rule,
        so "outlines of objects" is treated as an edge-style (Canny-heavy)
        intent, in line with its base example.

    Args:
        example: dict with at least an "Input" string. Any other keys are
            copied into each variation; the three weight keys are overwritten.
        num_variations: number of variations to generate.

    Returns:
        A list of ``num_variations`` dicts (empty when ``num_variations <= 0``).
    """
    if num_variations <= 0:
        return []

    weight_keys = ("Canny Weight", "Mask2Former Weight", "DepthMap Weight")

    # (keywords, weight centres, jitter half-widths) -- first match wins.
    rules = (
        (("edge", "sharp"), (0.8, 0.15, 0.05), (0.05, 0.05, 0.02)),
        (("boundar", "outline"), (0.75, 0.2, 0.05), (0.05, 0.05, 0.02)),
        (("segment", "objects"), (0.2, 0.7, 0.1), (0.05, 0.05, 0.02)),
        (("depth", "3d"), (0.1, 0.2, 0.7), (0.02, 0.05, 0.05)),
    )
    # Generic fallback when no keyword is present
    centres, jitters = (0.4, 0.4, 0.2), (0.1, 0.1, 0.05)

    # The intent never changes between variations, so classify it once.
    intent = example["Input"].lower()
    for keywords, rule_centres, rule_jitters in rules:
        if any(keyword in intent for keyword in keywords):
            centres, jitters = rule_centres, rule_jitters
            break

    variations = []
    for _ in range(num_variations):
        modified_example = example.copy()
        raw_weights = [
            centre + random.uniform(-jitter, jitter)
            for centre, jitter in zip(centres, jitters)
        ]
        # Normalize weights to sum to 1
        total_weight = sum(raw_weights)
        for key, value in zip(weight_keys, raw_weights):
            modified_example[key] = value / total_weight
        variations.append(modified_example)
    return variations

# Generate 100 examples in total: an equal share of variations per base example
dataset = []
for base_example in base_examples:
    dataset.extend(generate_logical_variations(base_example, num_variations=100 // len(base_examples)))

# Convert to DataFrame
df = pd.DataFrame(dataset)

# Save as CSV. The target folder is created first: to_csv does not create
# missing directories, which is what raised the OSError recorded for this cell.
output_path = "/content/drive/MyDrive/659_Final_Project/LLM_Weights_csv/transformer_weight_suggestions_logical.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

# Reloading the dataset to check and clean any unwanted symbols
df_cleaned = pd.read_csv(output_path)

# Cleaning the weight columns to remove any unwanted symbols like quotes
for column in ["Canny Weight", "Mask2Former Weight", "DepthMap Weight"]:
    df_cleaned[column] = df_cleaned[column].apply(lambda x: float(str(x).replace("'", "").strip()))

# Save the cleaned dataset (same folder as above, so it already exists)
cleaned_output_path = "/content/drive/MyDrive/659_Final_Project/LLM_Weights_csv/transformer_weight_suggestions_cleaned.csv"
df_cleaned.to_csv(cleaned_output_path, index=False)

# Bare expression so the notebook displays the output location
cleaned_output_path

OSError: Cannot save file into a non-existent directory: '/content/drive/MyDrive/659_Final_Project/LLM_Weights_csv'

In [None]:
# Replacement phrases, keyed by the keyword they may stand in for.
# Insertion order is kept because the generator walks the keys in order.
synonyms = dict(
    [
        ("sharp edge", ["precise edge", "clear outline", "sharp boundary"]),
        ("segment", ["divide", "differentiate", "separate"]),
        ("depth map", ["3D representation", "depth information", "depth perspective"]),
        ("boundary", ["outline", "perimeter", "contour"]),
        ("object", ["item", "thing", "entity"]),
    ]
)

# Function to generate synonyms-based inputs and corresponding weights
def generate_synonym_pairs(base_examples, synonyms, num_pairs=100):
    """Build paraphrased inputs by swapping keywords for synonyms.

    For every base example, ``num_pairs // len(base_examples)`` rows are
    produced. In row ``i`` the first occurrence of each keyword found in the
    base input is replaced by synonym number ``i`` (wrapping around the
    synonym list), so successive rows cycle through all the synonyms.

    The earlier implementation always stopped at the first synonym, so every
    row generated from one base example was an identical duplicate. Row 0 of
    this version still uses the first synonym, as before.

    Substitution is done in a single pass over the *original* base input, so a
    synonym that was just inserted (e.g. "sharp boundary") is never itself
    rewritten by another rule (e.g. "boundary").

    Args:
        base_examples: list of dicts with "Input", "Canny Weight",
            "Mask2Former Weight" and "DepthMap Weight" keys.
        synonyms: mapping of keyword -> list of replacement phrases. Keywords
            with an empty replacement list are ignored.
        num_pairs: total number of rows requested across all base examples.

    Returns:
        A list of dicts with the (possibly rewritten) "Input" and the base
        example's three weights, unchanged.
    """
    if not base_examples:
        return []

    # Longest keywords first so a longer phrase wins over a keyword it contains.
    usable_keys = sorted(
        (key for key, words in synonyms.items() if key and words),
        key=len,
        reverse=True,
    )
    pattern = re.compile("|".join(re.escape(key) for key in usable_keys)) if usable_keys else None
    pairs_per_base = num_pairs // len(base_examples)

    generated_data = []
    for base in base_examples:
        base_input = base["Input"]
        weights = {
            "Canny Weight": base["Canny Weight"],
            "Mask2Former Weight": base["Mask2Former Weight"],
            "DepthMap Weight": base["DepthMap Weight"],
        }
        for variant in range(pairs_per_base):
            replaced_keys = set()

            def pick_synonym(match, variant=variant, replaced_keys=replaced_keys):
                # Only the first occurrence of each keyword is replaced.
                key = match.group(0)
                if key in replaced_keys:
                    return key
                replaced_keys.add(key)
                words = synonyms[key]
                return words[variant % len(words)]

            modified_input = pattern.sub(pick_synonym, base_input) if pattern else base_input
            generated_data.append({"Input": modified_input, **weights})
    return generated_data

# Generate 100 new synonym-based pairs
synonym_pairs = generate_synonym_pairs(base_examples, synonyms, num_pairs=100)

# Convert to DataFrame
df_synonym_pairs = pd.DataFrame(synonym_pairs)

# Save the new dataset as CSV. The folder is created if needed, because
# to_csv fails with OSError when the parent directory does not exist.
synonym_output_path = "/content/drive/MyDrive/659_Final_Project/LLM_Weights_csv/transformer_weight_suggestions_synonyms.csv"
os.makedirs(os.path.dirname(synonym_output_path), exist_ok=True)
df_synonym_pairs.to_csv(synonym_output_path, index=False)

# Bare expression so the notebook displays the output location
synonym_output_path


'/content/drive/MyDrive/659_Final_Project/LLM_Weights_csv/transformer_weight_suggestions_synonyms.csv'

In [None]:
from google.colab import drive
import pandas as pd

# Make the Drive folder available under /content/drive
drive.mount('/content/drive')

# Source table (Input + three weight columns) and destination for the reformatted pairs
input_path = '/content/drive/MyDrive/659_Final_Project/LLM_Weights_csv/merged_transformer_weight_suggestions.csv'
output_path = '/content/drive/MyDrive/659_Final_Project/LLM_Weights_csv/formatted_prompt_response.csv'

data = pd.read_csv(input_path)


def _as_response(row):
    """Render one row's three weights as the target text of a training pair."""
    return f"Canny: {row['Canny Weight']}, Mask2Former: {row['Mask2Former Weight']}, DepthMap: {row['DepthMap Weight']}"


# "Prompt" is the raw input text; "Response" is the rendered weight string
prompts = data['Input']
responses = data.apply(_as_response, axis=1)
formatted_data = pd.DataFrame({'Prompt': prompts, 'Response': responses})

# Write the pairs back to Google Drive without the index column
formatted_data.to_csv(output_path, index=False)

print(f"Formatted dataset saved to: {output_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Formatted dataset saved to: /content/drive/MyDrive/659_Final_Project/LLM_Weights_csv/formatted_prompt_response.csv


In [None]:
import os
import random
from PIL import Image

# Source folder with the full image set and target folder for the sampled subset
dataset_dir = "/content/drive/MyDrive/659_Final_Project/659_Data"
output_dir = "/content/drive/MyDrive/659_Final_Project/659_Data/100_images"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# List all image files in the dataset directory (extension match is case-insensitive)
image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tiff")  # Add more formats if needed
image_files = [f for f in os.listdir(dataset_dir) if f.lower().endswith(image_extensions)]

# Randomly select up to 100 images (fewer if the folder holds fewer)
selected_images = random.sample(image_files, min(100, len(image_files)))

# Re-encode the selected images as 1.jpg, 2.jpg, ... in the output folder
for i, image_file in enumerate(selected_images, start=1):
    original_path = os.path.join(dataset_dir, image_file)
    new_name = f"{i}.jpg"
    new_path = os.path.join(output_dir, new_name)

    with Image.open(original_path) as img:
        # Every file is written as JPEG, but PNG/BMP/TIFF sources may carry an
        # alpha channel or a palette, which the JPEG writer rejects with
        # "cannot write mode ... as JPEG". Convert those to RGB first.
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")
        img.save(new_path, format="JPEG")

print(f"Renamed and saved {len(selected_images)} images to {output_dir}")


Renamed and saved 100 images to /content/drive/MyDrive/659_Final_Project/659_Data/100_images


# Extracting Keywords with spaCy

In [None]:
# Install spaCy and the English model
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import spacy
import pandas as pd

# Caption table stored on Google Drive
file_path = "/content/drive/MyDrive/659_Final_Project/image_captions_cleaned.csv"

# spaCy's en_core_web_sm English pipeline, used below for tagging and parsing
nlp = spacy.load("en_core_web_sm")

# Read the captions into a DataFrame
data = pd.read_csv(file_path)

# Function to extract subject words
def extract_subject(caption):
    """Return the subject/noun words of a caption as a comma-separated string.

    A token is kept when its dependency label is ``nsubj`` or ``nsubjpass``,
    or when its part-of-speech tag is ``NOUN``. Tokens are returned in the
    order they appear in the caption.

    Args:
        caption: caption text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) and blank strings are
            treated as having no subject words instead of being passed to
            the spaCy pipeline, which only accepts text.

    Returns:
        The matching token texts joined by ", ", or the literal string
        "None" when nothing matches.
    """
    if not isinstance(caption, str) or not caption.strip():
        return "None"

    doc = nlp(caption)
    # Extract meaningful subject words (nouns, subjects, etc.)
    subjects = [token.text for token in doc if token.dep_ in ("nsubj", "nsubjpass") or token.pos_ == "NOUN"]
    return ", ".join(subjects) if subjects else "None"

# Destination for the caption table once the keyword column has been added
output_path = "/content/drive/MyDrive/659_Final_Project/image_captions_with_subject_words.csv"

# Run the extractor on every caption and store the result next to it
data["Subject_Words"] = data["Caption"].map(extract_subject)

# Persist the augmented table to Google Drive (no index column)
data.to_csv(output_path, index=False)

print(f"Processed file saved at: {output_path}")


Processed file saved at: /content/drive/MyDrive/659_Final_Project/image_captions_with_subject_words.csv


# Code

In [None]:
!pip install datasets

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score
from nltk.translate.bleu_score import sentence_bleu
import torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

# Facebook Bart

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Prompt/response table stored on Google Drive
input_csv = '/content/drive/MyDrive/659_Final_Project/659_Final_Dataset_300.csv'  # Path to the dataset
dataset = pd.read_csv(input_csv)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_texts, val_texts = train_test_split(dataset, test_size=0.2, random_state=42)

# Wrap each split as a Hugging Face Dataset with a fresh 0..n-1 index
hf_datasets = DatasetDict({
    split_name: Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_name, split_frame in (("train", train_texts), ("validation", val_texts))
})

# Tokenizer and model both come from the same pretrained checkpoint; the model is placed on the GPU
bart_checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(bart_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(bart_checkpoint).to("cuda")

# Data preprocessing function
def preprocess_data(examples):
    """Tokenize a batch of prompt/response pairs for seq2seq training.

    Args:
        examples: batch mapping with "Prompt" and "Response" columns, each a
            list of values (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the prompts (padded/truncated to 128 tokens)
        with an added "labels" entry holding the response token ids
        (padded/truncated to 256 tokens).

    Padding positions in the labels are replaced by -100. The tokenizer pads
    with its own pad token id, and without this replacement those padding
    positions would be scored by the loss like real target tokens. Note that
    loss values are therefore not comparable with runs made before this
    change.
    """
    # str() every value so non-string cells are still tokenizable
    input_texts = [f"{prompt}" for prompt in examples["Prompt"]]
    target_texts = [f"{response}" for response in examples["Response"]]
    # Tokenize inputs
    model_inputs = tokenizer(input_texts, max_length=128, truncation=True, padding="max_length")
    # Tokenize targets
    labels = tokenizer(target_texts, max_length=256, truncation=True, padding="max_length")
    # Mask label padding so it is ignored by the loss
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Apply preprocessing to both splits in batches; the original columns of the
# train split are dropped so only the outputs of preprocess_data remain.
tokenized_datasets = hf_datasets.map(preprocess_data, batched=True, remove_columns=hf_datasets["train"].column_names)

# Define Data Collator (batches the tokenized examples for the trainer)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100  # Fill value for label padding added by the collator. NOTE(review): labels arrive already padded to max_length from preprocess_data -- confirm pad positions in the labels are masked as intended
)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/659_Final_Project/Models/finetuned_bart",  # Checkpoints are written under this Drive folder
    evaluation_strategy="epoch",  # Evaluate once per epoch
    save_strategy="epoch",  # Save checkpoint every epoch
    learning_rate=5e-6,  # Deliberately small learning rate for stability
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,  # NOTE(review): the log recorded under this cell lists epochs 1-10 only -- confirm the intended epoch count
    save_total_limit=2,  # Retain the latest 2 checkpoints
    logging_dir="/content/drive/MyDrive/659_Final_Project/Models/logs",  # Logging directory
    logging_steps=50,  # Log every 50 steps (hence "No log" for the first epochs in the output table)
    fp16=True,  # Mixed precision for speed
    predict_with_generate=True,
    weight_decay=0.01,  # Regularization to avoid overfitting
    load_best_model_at_end=True,  # Automatically select the best model
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # Lower eval_loss is better
    report_to=["none"],  # Disable external logging
)

# Initialize Trainer with the tokenized train/validation splits
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,  # Pass tokenizer for Seq2Seq tasks
)

# Start training
trainer.train()

# Save the final fine-tuned model and tokenizer to a new Google Drive directory
final_model_dir = "/content/drive/MyDrive/659_Final_Project/Models/finetuned_bart_final"
os.makedirs(final_model_dir, exist_ok=True)  # Ensure the directory exists

# Save the final model and tokenizer
trainer.save_model(final_model_dir)  # Writes the trainer's model into final_model_dir
tokenizer.save_pretrained(final_model_dir)  # Explicitly writes the tokenizer files into the same folder

print(f"Final model and tokenizer saved to: {final_model_dir}")

# Save a second copy to the "current_checkpoint" folder, which the follow-up
# training cell later loads its tokenizer and model from
checkpoint_dir = "/content/drive/MyDrive/659_Final_Project/Models/current_checkpoint"
os.makedirs(checkpoint_dir, exist_ok=True)  # Ensure the directory exists

# Save the model, tokenizer, and trainer state at the current checkpoint
trainer.save_state()  # NOTE(review): no path is passed here, so the state file presumably lands in training_args.output_dir rather than checkpoint_dir -- confirm
model.save_pretrained(checkpoint_dir)  # Saves the model
tokenizer.save_pretrained(checkpoint_dir)  # Saves the tokenizer
trainer.save_model(checkpoint_dir)  # Saves the trainer's model to the same folder again (likely redundant with the save_pretrained call above -- TODO confirm)

print(f"Checkpoint saved to: {checkpoint_dir}")


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,12.425632
2,No log,11.151276
3,No log,10.100567
4,12.404700,8.7166
5,12.404700,6.255392
6,12.404700,5.127558
7,7.584100,4.546968
8,7.584100,4.155219
9,7.584100,3.861094
10,4.914700,3.624149


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Final model and tokenizer saved to: /content/drive/MyDrive/659_Final_Project/Models/finetuned_bart_final
Checkpoint saved to: /content/drive/MyDrive/659_Final_Project/Models/current_checkpoint


# Test Facebook Bart

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and model from a saved training checkpoint.
# NOTE: this folder is the output_dir of the follow-up training cell further
# down in the notebook, so that cell has to be run before this one.
checkpoint_path = "/content/drive/MyDrive/659_Final_Project/Models/finetuned_bart_resume/checkpoint-150"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path).to("cuda")

# Test input
test_input = "A kitchen with a stove, oven, refrigerator and rug"

# Tokenize and generate output.
# A single sequence needs no padding, and the whole encoding (input_ids AND
# attention_mask) is passed to generate(). Previously the input was padded to
# 256 tokens while only input_ids were passed, so the encoder had no mask
# telling it to ignore the padding positions.
inputs = tokenizer(test_input, return_tensors="pt", truncation=True, max_length=256).to("cuda")
outputs = model.generate(**inputs, max_length=256, num_beams=8, early_stopping=False)

# Decode the generated output
predicted_procedure = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the result
print(f"Input: {test_input}")
print(f"Predicted Procedure: {predicted_procedure}")


Input: A kitchen with a stove, oven, refrigerator and rug
Predicted Procedure: Keywords: kitchen, stove, oven, refrigerator, rug; Canny: 0.89, Segmentation: 1.0, Depth: 0


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Folder holding the weights and tokenizer saved by the first training run
checkpoint_dir = "/content/drive/MyDrive/659_Final_Project/Models/current_checkpoint"

# Restore tokenizer and model from that folder; the model is placed on the GPU
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir).to("cuda")

# Re-read the same prompt/response table (assumed unchanged since the first run)
input_csv = '/content/drive/MyDrive/659_Final_Project/659_Final_Dataset_300.csv'  # Path to the dataset
dataset = pd.read_csv(input_csv)

# Same 80/20 split and seed as the first training cell
train_texts, val_texts = train_test_split(dataset, test_size=0.2, random_state=42)

# Wrap each split as a Hugging Face Dataset with a fresh 0..n-1 index
hf_datasets = DatasetDict({
    split_name: Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_name, split_frame in (("train", train_texts), ("validation", val_texts))
})

# Preprocess the data
def preprocess_data(examples):
    """Tokenize a batch of prompt/response pairs for seq2seq training.

    Args:
        examples: batch mapping with "Prompt" and "Response" columns, each a
            list of values (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the prompts (padded/truncated to 128 tokens)
        with an added "labels" entry holding the response token ids
        (padded/truncated to 256 tokens).

    Padding positions in the labels are replaced by -100. The tokenizer pads
    with its own pad token id, and without this replacement those padding
    positions would be scored by the loss like real target tokens. Note that
    loss values are therefore not comparable with runs made before this
    change.
    """
    # str() every value so non-string cells are still tokenizable
    input_texts = [f"{prompt}" for prompt in examples["Prompt"]]
    target_texts = [f"{response}" for response in examples["Response"]]
    model_inputs = tokenizer(input_texts, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(target_texts, max_length=256, truncation=True, padding="max_length")
    # Mask label padding so it is ignored by the loss
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits in batches; the original columns of the train split are
# dropped so only the outputs of preprocess_data remain.
tokenized_datasets = hf_datasets.map(preprocess_data, batched=True, remove_columns=hf_datasets["train"].column_names)

# Data collator (batches the tokenized examples for the trainer)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100
)

# Training arguments -- same settings as the first run except for the output
# folders and the epoch count
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/659_Final_Project/Models/finetuned_bart_resume",  # New output directory
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-6,  # Keep learning rate low for continued training
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  # Number of additional epochs for this run
    save_total_limit=2,
    logging_dir="/content/drive/MyDrive/659_Final_Project/Models/logs_resume",
    logging_steps=50,
    fp16=True,
    predict_with_generate=True,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=["none"],
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Continue fine-tuning from the loaded weights.
# NOTE(review): train() is called without a resume argument, so this is
# presumably a fresh Trainer run (new optimizer/scheduler state) rather than
# a true resume -- confirm this is intended.
trainer.train()

# Save the model and tokenizer from this run to their own Drive folder
final_model_dir = "/content/drive/MyDrive/659_Final_Project/Models/finetuned_bart_final_resume"
os.makedirs(final_model_dir, exist_ok=True)
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print(f"Resumed model and tokenizer saved to: {final_model_dir}")


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,0.876824
2,No log,0.7908
3,No log,0.717464
4,1.161900,0.65896
5,1.161900,0.613123
6,1.161900,0.575354
7,0.876400,0.547632
8,0.876400,0.527857
9,0.876400,0.51528
10,0.751900,0.509818


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Resumed model and tokenizer saved to: /content/drive/MyDrive/659_Final_Project/Models/finetuned_bart_final_resume


In [None]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the BLIP base processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")

# Load an image from a URL.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into a clear exception instead of
# letting PIL fail later while trying to parse an error page as an image.
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
raw_image = Image.open(response.raw).convert('RGB')

# Conditional image captioning: the caption is generated as a continuation of the text prefix
text = "a photography of"
inputs = processor(raw_image, text, return_tensors="pt").to("cuda")

out = model.generate(**inputs)
print("Conditional Caption:", processor.decode(out[0], skip_special_tokens=True))

# Unconditional image captioning: image only, no text prefix
inputs = processor(raw_image, return_tensors="pt").to("cuda")

out = model.generate(**inputs)
print("Unconditional Caption:", processor.decode(out[0], skip_special_tokens=True))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]



Conditional Caption: a photography of a woman and her dog on the beach
Unconditional Caption: a woman sitting on the beach with her dog


# Earlier Experiments That May Still Be Useful

## Generate Captions for Bedroom Images (Unconditional BLIP Captioning)

In [None]:
import os
import pandas as pd
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

# Input folder with the renamed images and output location for the caption table
renamed_dir = "/content/drive/MyDrive/659_Final_Project/659_Data/100_images"
output_csv = "/content/drive/MyDrive/659_Final_Project/image_captions.csv"

# Load the BLIP (large) captioning processor and model
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = AutoModelForImageTextToText.from_pretrained("Salesforce/blip-image-captioning-large")


def _caption_image(image_path):
    """Open one image, resize it to 512x512 and return the generated caption."""
    with Image.open(image_path) as source:
        resized = source.resize((512, 512))  # Resize for consistent input size

        # No text prompt is passed, so the caption is generated from the image alone
        inputs = processor(images=resized, return_tensors="pt")
        print(f"Inputs processed for {image_path}")  # Debugging

        outputs = model.generate(**inputs)
        print(f"Caption generated for {image_path}")  # Debugging

        return processor.decode(outputs[0], skip_special_tokens=True)


# One {"Image", "Caption"} record per successfully captioned file
results = []

# Images are assumed to be named 1.jpg to 100.jpg
for i in range(1, 101):
    image_name = f"{i}.jpg"
    image_path = os.path.join(renamed_dir, image_name)
    print(f"Processing {image_path}...")  # Log progress

    try:
        caption = _caption_image(image_path)
    except Exception as e:
        # A failing image is reported and skipped so the rest still get captioned
        print(f"Error processing {image_path}: {e}")
    else:
        results.append({"Image": image_name, "Caption": caption})

# Save results to CSV
df = pd.DataFrame(results)
df.to_csv(output_csv, index=False)

print(f"Captions saved to {output_csv}")


Processing /content/drive/MyDrive/659_Final_Project/659_Data/100_images/1.jpg...
Inputs processed for /content/drive/MyDrive/659_Final_Project/659_Data/100_images/1.jpg




Caption generated for /content/drive/MyDrive/659_Final_Project/659_Data/100_images/1.jpg
Processing /content/drive/MyDrive/659_Final_Project/659_Data/100_images/2.jpg...
Inputs processed for /content/drive/MyDrive/659_Final_Project/659_Data/100_images/2.jpg
Caption generated for /content/drive/MyDrive/659_Final_Project/659_Data/100_images/2.jpg
Processing /content/drive/MyDrive/659_Final_Project/659_Data/100_images/3.jpg...
Inputs processed for /content/drive/MyDrive/659_Final_Project/659_Data/100_images/3.jpg
Caption generated for /content/drive/MyDrive/659_Final_Project/659_Data/100_images/3.jpg
Processing /content/drive/MyDrive/659_Final_Project/659_Data/100_images/4.jpg...
Inputs processed for /content/drive/MyDrive/659_Final_Project/659_Data/100_images/4.jpg
Caption generated for /content/drive/MyDrive/659_Final_Project/659_Data/100_images/4.jpg
Processing /content/drive/MyDrive/659_Final_Project/659_Data/100_images/5.jpg...
Inputs processed for /content/drive/MyDrive/659_Final_Pr

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Flan-T5 (small) checkpoint: tokenizer plus seq2seq model, with the model placed on the GPU
model_name = "google/flan-t5-small"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda"),
)

# Caption table to be scored
captions_df = pd.read_csv("/content/drive/MyDrive/659_Data/captions.csv")

# Define a function to generate weights for each caption
def generate_weights(caption):
    """Ask the model for conditioning weights and extract the bracketed list.

    The caption is sent to the model together with an instruction that shows
    the expected ``[a, b, c]`` format. The decoded generation is printed for
    inspection and the first ``[...]`` span in it is returned.

    Args:
        caption: caption text to put in front of the instruction.

    Returns:
        The substring from the first "[" to the next "]" (inclusive), or
        "N/A" when the generated text has no complete bracketed span.
        Previously a "[" without a closing "]" produced an empty string,
        because ``find(...) + 1`` can never equal -1.

    Note:
        In the run recorded below this cell the model only ever produced
        ``[0.0, 0.0, 0.0]`` or the example ``[0.3, 0.5, 0.2]`` from the
        prompt, so the returned values should not be trusted as-is.
    """
    # Construct the input prompt with a specific example format
    prompt = f"{caption}\nGenerate conditioning weights in the format [0.3, 0.5, 0.2]:"

    # Tokenize and generate output
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(inputs["input_ids"], max_length=20)

    # Decode the output and clean up
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Generated text:", generated_text)  # Print for inspection

    # Extract weights if they follow a list format; both brackets must exist
    weight_start = generated_text.find("[")
    if weight_start == -1:
        return "N/A"
    weight_end = generated_text.find("]", weight_start)
    if weight_end == -1:
        return "N/A"
    return generated_text[weight_start:weight_end + 1]

# Query the model once per caption and keep the extracted weight string next to it
captions_df["generated_weights"] = captions_df["caption"].map(generate_weights)

# Write the augmented caption table back to Google Drive (no index column)
captions_df.to_csv("/content/drive/MyDrive/659_Data/captions_with_weights.csv", index=False)
print("Generated weights saved to /content/drive/MyDrive/659_Data/captions_with_weights.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Generated text: [0.0, 0.0, 0.0]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.3, 0.5, 0.2]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.0, 0.0, 0.0]
Generated text: [0.0, 0.0, 0.0]
Generate