In [1]:
import os
from PIL import Image

In [None]:
#*************************************
#STEP 1 – Preprocess Data
#📂 Match Images and Descriptions
#*****************************************

In [2]:
image_dir = "/kaggle/input/monuments-firsttry/train/train"
desc_dir = "/kaggle/input/monuments-firsttry/descriptions (1)"

In [3]:
print("Images:", os.listdir(image_dir)[:3])
print("Descriptions:", os.listdir(desc_dir)[:3])

Images: ['images_1682_jpg.rf.4ad8805be4dd22efbd71fe2c1efe7531.jpg', 'images_68_jpg.rf.9b7e267389d61249fdbfb389461d220c.jpg', 'images_4083_jpg.rf.5cb223b54f1e2c50b1a8eea470233031.jpg']
Descriptions: ['images_4214_jpg.txt', 'images_5519_jpg.txt', 'images_1333_jpg.txt']


In [4]:
paired_data = []

In [5]:
# Extract the prefix from the image filenames
image_prefixes = [img_file.split('.')[0] for img_file in os.listdir(image_dir) if img_file.endswith('.jpg')]

In [6]:
# Extract the prefixes from the description filenames
desc_prefixes = [desc_file.split('.')[0] for desc_file in os.listdir(desc_dir) if desc_file.endswith('.txt')]

In [7]:
# Iterate over image prefixes and find matching descriptions
for img_prefix in image_prefixes:
    # Check if the description with the same prefix exists
    if img_prefix in desc_prefixes:
        # Construct the paths
        img_file = img_prefix + '.jpg'
        desc_file = img_prefix + '.txt'
        
        img_path = os.path.join(image_dir, img_file)
        desc_path = os.path.join(desc_dir, desc_file)
        
        # Read the description
        with open(desc_path, 'r', encoding='utf-8') as f:
            description = f.read()
        
        # Append to paired data
        paired_data.append({
            "image_path": img_path,
            "description": description
        })

In [8]:
print(f"Number of paired data: {len(paired_data)}")

Number of paired data: 1000


In [9]:
print(len(paired_data))

1000


In [None]:
#********************************************
#STEP 2 – Text Chunking (for RAG)
#Use chunking + overlap to enable granular retrieval.
#*******************************************

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [11]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50
)

In [13]:
chunks = []
for item in paired_data:
    for chunk in text_splitter.split_text(item["description"]):
        chunks.append({
            "chunk": chunk,
            "source_img": item["image_path"]
        })

In [14]:
if len(chunks) > 0:
    print(chunks[0])

{'chunk': 'Image: images_1682_jpg.rf.4ad8805be4dd22efbd71fe2c1efe7531.jpg\nDescription: Here is a detailed description of the monument in the image:\n\n**Name of the Monument:** Golghar\n\n**History:** The Golghar is a large granary located in Patna, Bihar, India. It was built by Captain John Garstin of the British East India Company to prevent famines after the devastating famine of 1770.\n\n**Who Built It:** Captain John Garstin on behalf of the British East India Company.', 'source_img': '/kaggle/input/monuments-firsttry/train/train/images_1682_jpg.jpg'}


In [15]:
!pip install faiss-cpu --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [None]:
#**********************************************
#STEP 3 – Embed Chunks for Retrieval (RAG Base)
#Use Sentence Transformers to embed text into a vector store (FAISS).
#****************************************************

In [16]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

2025-04-21 01:10:15.790483: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745197815.990605      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745197816.048447      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [17]:
model = SentenceTransformer("all-MiniLM-L6-v2")
texts = [c["chunk"] for c in chunks]
vectors = model.encode(texts)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/192 [00:00<?, ?it/s]

In [18]:
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(np.array(vectors))

In [None]:
#STEP 4 – Retrieval (RAG Style)
#At inference time, retrieve similar descriptions.

In [19]:
query = "monument with scalloped arches and sandstone material"
query_vec = model.encode([query])
_, indices = index.search(np.array(query_vec), k=5)

retrieved = [texts[i] for i in indices[0]]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
#STEP 5 – LoRA + PEFT Finetuning

In [20]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer

In [23]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
base_model = "mistralai/Mistral-7B-v0.1"
model = AutoModelForCausalLM.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [25]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [26]:
lora_model = get_peft_model(model, peft_config)

In [None]:
#STEP 6 – Generate Image from Caption (Reconstruction)

In [27]:
from diffusers import StableDiffusionPipeline

In [28]:
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe = pipe.to("cuda")

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [29]:
prompt = "A reddish-pink observatory structure with sundials and staircases, based on Rajput architecture."
image = pipe(prompt).images[0]
image.save("reconstructed.jpg")

  0%|          | 0/50 [00:00<?, ?it/s]

In [30]:
prompt = "A majestic stone castle on a cliff overlooking the sea, with a drawbridge and turrets, under a cloudy sky."
image = pipe(prompt).images[0]
image.save("new_reconstructed.jpg")

  0%|          | 0/50 [00:00<?, ?it/s]

In [32]:
# New theme-based prompt
prompt = "A majestic ancient monument with intricate carvings and large stone columns, partially reconstructed with modern restoration techniques, blending historical architecture with modern elements."

# Generate the image with the new prompt
image = pipe(prompt).images[0]

# Save the reconstructed image
image.save("reconstructed_monument.jpg")


  0%|          | 0/50 [00:00<?, ?it/s]

In [33]:
# New theme-based prompt
prompt = "An ancient Roman temple with grand arches, marble statues, and detailed engravings, being carefully reconstructed using advanced techniques to restore its former glory while preserving the historical integrity of the structure."

# Generate the image with the new prompt
image = pipe(prompt).images[0]

# Save the reconstructed image
image.save("reconstructed_roman_temple.jpg")


  0%|          | 0/50 [00:00<?, ?it/s]

In [35]:
# New theme-based prompt
prompt = "A partially ruined ancient Buddhist temple nestled in the mountains, with tiered pagoda roofs, stone lion statues, and faded golden ornaments, being digitally reconstructed to visualize its original sacred structure and peaceful ambiance"

# Generate the image with the new prompt
image = pipe(prompt).images[0]

# Save the reconstructed image
image.save("reconstructed_Buddhist_temple.jpg")


  0%|          | 0/50 [00:00<?, ?it/s]