In [1]:
pip install transformers sentence-transformers faiss-cpu torch torchvision


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.meta

In [4]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Correct model for BLIP-2 (OPT version)
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
    device_map="auto"
)


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [6]:
from PIL import Image
import torch

# Function to generate caption from image
def generate_caption(image_path):
    raw_image = Image.open(image_path).convert('RGB')
    inputs = processor(images=raw_image, return_tensors="pt").to("cuda", torch.float16)
    generated_ids = model.generate(**inputs)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return caption


In [7]:
from sentence_transformers import SentenceTransformer
import faiss

# Example context documents
context_docs = [
    "This device is used to regulate gas flow in pipelines.",
    "Positioners are used to control actuators by receiving feedback.",
    "AI helps predict valve failure before it happens.",
    "Industrial actuators adjust based on process variables.",
    "Sensor feedback is essential in process automation."
]

# Load Sentence Transformer and create FAISS index
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
doc_embeddings = sentence_model.encode(context_docs, convert_to_numpy=True)

index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

# Function to retrieve top-k similar context docs
def retrieve_context(query, k=3):
    query_vector = sentence_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_vector, k)
    return [context_docs[i] for i in indices[0]]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load Falcon-7B-Instruct
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
llm = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b-instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)

# Function to generate final answer from image, context, and question
def generate_answer(question, caption, retrieved_docs):
    context = "\n".join(retrieved_docs)
    prompt = f"Image: {caption}\nContext:\n{context}\nQuestion: {question}\nAnswer:"

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = llm.generate(**inputs, max_new_tokens=200)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]



In [17]:
from google.colab import files

# Upload a file from your computer
uploaded = files.upload()


Saving AdobeStock_1359903951_Preview.jpeg to AdobeStock_1359903951_Preview.jpeg


In [18]:
# Get the name of the uploaded image
image_path = list(uploaded.keys())[0]
print("✅ Image uploaded and ready:", image_path)


✅ Image uploaded and ready: AdobeStock_1359903951_Preview.jpeg


In [19]:
caption = generate_caption(image_path)


In [20]:
# Retrieve relevant context from documents using caption + question
question = "What does this device do?"  # You can change the question
query = caption + " " + question
retrieved = retrieve_context(query)

print("📄 Retrieved Context:")
for i, ctx in enumerate(retrieved, 1):
    print(f"{i}. {ctx}")


📄 Retrieved Context:
1. This device is used to regulate gas flow in pipelines.
2. AI helps predict valve failure before it happens.
3. Positioners are used to control actuators by receiving feedback.


In [21]:
# Generate answer using Falcon-7B-Instruct (or the LLM you're using)
answer = generate_answer(question, caption, retrieved)

print("\n🖼️ Caption:", caption)
print("💬 Answer:", answer)


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



🖼️ Caption: a drawing of a metal pipe with a valve on it

💬 Answer: Image: a drawing of a metal pipe with a valve on it

Context:
This device is used to regulate gas flow in pipelines.
AI helps predict valve failure before it happens.
Positioners are used to control actuators by receiving feedback.
Question: What does this device do?
Answer: This device is a valve. It is used to control the flow of gas in pipelines. The valve is closed when the pressure inside the pipe is higher than the pressure outside the pipe. When the pressure inside the pipe is lower than the pressure outside the pipe, the valve opens and gas can flow through it. The valve is also used to control the position of actuators in order to open and close the valve.
