In [None]:
!pip install pdfplumber faiss-cpu transformers torchvision pillow


In [None]:
from google.colab import files
# Ask Colab for a file upload and keep whatever `files.upload()` returns in
# `uploaded` (presumably a mapping of uploaded file names to contents — TODO confirm).
# NOTE(review): the captured log for this cell shows the PDF being saved under a
# "(1)" suffix, i.e. a copy with the same name already existed. The extraction
# cell below hard-codes the un-suffixed path, so it reads that earlier copy.
uploaded = files.upload()


Saving Res2Net Forgery detection approach.pdf to Res2Net Forgery detection approach (1).pdf


In [None]:
import pdfplumber
from PIL import Image
import os

# Cropped figures are written here; created up front so `save` cannot fail on a
# missing directory.
os.makedirs("images", exist_ok=True)

pdf_path = "/content/Res2Net Forgery detection approach.pdf"
# Resolution (dots per inch) used when rasterising a page before cropping figures.
RENDER_DPI = 150

text_chunks = []   # one entry per page that yielded any text
image_paths = []   # paths of the PNG crops written to ./images

with pdfplumber.open(pdf_path) as pdf:
    for i, page in enumerate(pdf.pages):

        # --- text: one chunk per page, skipping pages without extractable text ---
        text = page.extract_text()
        if text:
            text_chunks.append(text)

        # --- images: rasterise the page once, then crop each embedded image ---
        embedded_images = page.images
        if not embedded_images:
            # Nothing to crop, so skip the comparatively expensive render.
            continue

        page_image = page.to_image(resolution=RENDER_DPI).original

        # pdfplumber reports positions in PDF points while `page_image` is in
        # pixels, so derive the points->pixels factor from the actual bitmap size.
        scale_x = page_image.width / float(page.width)
        scale_y = page_image.height / float(page.height)
        # Origin of the page's bounding box; object coordinates are relative to it.
        page_x0, page_top = float(page.bbox[0]), float(page.bbox[1])

        for j, img in enumerate(embedded_images):
            # `top`/`bottom` are already measured from the TOP of the page, which
            # matches PIL's top-left origin — no vertical flip is required.
            left = round((float(img['x0']) - page_x0) * scale_x)
            upper = round((float(img['top']) - page_top) * scale_y)
            right = round((float(img['x1']) - page_x0) * scale_x)
            lower = round((float(img['bottom']) - page_top) * scale_y)

            # Clamp to the rendered bitmap; images may spill past the page edge.
            left, upper = max(0, left), max(0, upper)
            right = min(page_image.width, right)
            lower = min(page_image.height, lower)
            if right <= left or lower <= upper:
                # Degenerate or fully off-page image: an empty crop cannot be saved.
                continue

            cropped = page_image.crop((left, upper, right, lower))
            img_path = f"images/page_{i}_img_{j}.png"
            cropped.save(img_path)
            image_paths.append(img_path)

print(f"✅ Extracted {len(text_chunks)} text chunks and {len(image_paths)} images.")




✅ Extracted 5 text chunks and 1 images.


In [None]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch


# Single source of truth for the CLIP checkpoint so the model weights and the
# processor (tokenizer + image preprocessing) always come from the same release.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)


def embed_texts(texts):
    """Encode a batch of strings with the CLIP text tower.

    Args:
        texts: Strings to embed. They are tokenised together with padding, and
            truncation is enabled, so over-long inputs are cut to the
            tokenizer's limit rather than rejected.

    Returns:
        The tensor returned by ``clip_model.get_text_features`` for the batch,
        computed with gradient tracking disabled.
    """
    encoded = clip_processor(
        text=texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    with torch.no_grad():
        text_features = clip_model.get_text_features(**encoded)
    return text_features

def embed_images(paths):
    """Encode image files with the CLIP vision tower.

    Args:
        paths: File paths of the images to embed.

    Returns:
        The tensor returned by ``clip_model.get_image_features`` for the batch,
        computed with gradient tracking disabled.
    """
    images = []
    for path in paths:
        # `convert` returns an independent in-memory copy, so the file handle
        # can be released immediately instead of lingering until garbage collection.
        with Image.open(path) as handle:
            images.append(handle.convert("RGB"))

    # `padding` is a tokenizer option and has no meaning for image-only input,
    # so it is not forwarded here.
    inputs = clip_processor(images=images, return_tensors="pt")
    with torch.no_grad():
        return clip_model.get_image_features(**inputs)


In [None]:
import faiss
import numpy as np

# Width of the vectors produced by the CLIP checkpoint used above; only needed
# to shape the empty placeholder when a modality has no items.
EMBEDDING_DIM = 512

# Compute embeddings, falling back to an empty float32 matrix when a modality is
# missing. The explicit dtype matters: a float64 placeholder would upcast the
# concatenated matrix, whereas FAISS expects float32 input.
text_embeddings = (
    embed_texts(text_chunks).cpu().numpy()
    if text_chunks
    else np.zeros((0, EMBEDDING_DIM), dtype=np.float32)
)
image_embeddings = (
    embed_images(image_paths).cpu().numpy()
    if image_paths
    else np.zeros((0, EMBEDDING_DIM), dtype=np.float32)
)

combined_embeddings = np.concatenate([text_embeddings, image_embeddings], axis=0)

# Scale every row to unit length. With unit-length database vectors, ranking by
# L2 distance is identical to ranking by cosine similarity for any query, so the
# results no longer depend on the raw magnitude of each embedding.
row_norms = np.linalg.norm(combined_embeddings, axis=1, keepdims=True)
combined_embeddings = combined_embeddings / np.maximum(row_norms, 1e-12)
combined_embeddings = np.ascontiguousarray(combined_embeddings, dtype=np.float32)

# Row i of `combined_embeddings` describes `combined_metadata[i]`
# (all text chunks first, then all image paths).
combined_metadata = text_chunks + image_paths


if combined_embeddings.shape[0] > 0:
    # Take the index dimension from the data instead of hard-coding it.
    index = faiss.IndexFlatL2(combined_embeddings.shape[1])
    index.add(combined_embeddings)
    print(f"FAISS index created with {len(combined_metadata)} items.")
else:
    index = None
    print("No embeddings to index.")


FAISS index created with 6 items.


In [None]:
def search(query, top_k=3):
    """Return the indexed items closest to a text query.

    Args:
        query: Free-text query; it is embedded with ``embed_texts``.
        top_k: Maximum number of results to return.

    Returns:
        A list of entries from ``combined_metadata`` (text chunks and/or image
        paths), nearest first. It holds at most ``top_k`` items and fewer when
        the index is smaller. If no index was built, a single-element list with
        a warning message is returned instead.
    """
    if index is None:
        return ["⚠️ No indexed data"]

    # Never ask for more neighbours than exist: FAISS pads missing results with
    # the id -1, which Python would silently resolve to the *last* metadata item.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    # Embedding a one-element batch already yields a (1, dim) matrix.
    query_embedding = embed_texts([query]).cpu().numpy()
    _, neighbor_ids = index.search(query_embedding, k)
    # Defensive filter in case the index still reports padded (-1) ids.
    return [combined_metadata[i] for i in neighbor_ids[0] if i >= 0]


# Smoke-test the retriever with a sample query and show what came back.
sample_query = "spectrogram diagram"
results = search(sample_query)
print("🔍 Search Results:", results)


🔍 Search Results: ['Table 1. The overall model architectures of ResNet34, ResNet50, Res2Net50 and SE-Res2Net50. The type of a residual block and the\nnumberofchannelsarespecifiedinsidethebrackets,whiletherepeattimesofeachblockononestagearespecifiedoutsidethebrackets.“2-d\nfc”denotesafullyconnectedlayerwith2outputunits.\nStage ResNet34 ResNet50 Res2Net50 SE-Res2Net50\nconv2d,7×7,16,stride=2\nConv1 [conv2d,3×3,16,stride=1]×3\nmaxpool,3×3,stride=2\nConv2 [BasicBLK,16]×3 [BottleneckBLK,16]×3 [Res2NetBLK,16]×3 [SE-Res2NetBLK,16]×3\nConv3 [BasicBLK,32]×4 [BottleneckBLK,32]×4 [Res2NetBLK,32]×4 [SE-Res2NetBLK,32]×4\nConv4 [BasicBLK,64]×6 [BottleneckBLK,64]×6 [Res2NetBLK,64]×6 [SE-Res2NetBLK,64]×6\nConv5 [BasicBLK,128]×3 [BottleneckBLK,128]×3 [Res2NetBLK,128]×3 [SE-Res2NetBLK,128]×3\nglobalaveragepool,2-dfc,softmax\nThe contributions of this work include: 1) Leveraging the\nRes2Netmodelarchitectureintoanti-spoofingandverifyingitssig-\nnificant improvements over ResNet34 and ResNet50 models; 2)\

In [None]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

# One checkpoint id shared by the processor and the model so they cannot drift apart.
BLIP_CHECKPOINT = "Salesforce/instructblip-vicuna-7b"

blip_processor = InstructBlipProcessor.from_pretrained(BLIP_CHECKPOINT)
# `device_map="auto"` leaves device placement of the weights to the library.
# The download log for this cell lists four weight shards totalling roughly 32 GB.
blip_model = InstructBlipForConditionalGeneration.from_pretrained(
    BLIP_CHECKPOINT,
    device_map="auto",
)

def answer_with_image(image_path, question):
    """Ask InstructBLIP a question about an image file.

    Args:
        image_path: Path of the image to load.
        question: Prompt passed to the model together with the image.

    Returns:
        The first decoded generation, with special tokens removed.
    """
    # Load into memory and release the file handle right away.
    with Image.open(image_path) as handle:
        image = handle.convert("RGB")

    # Send the inputs to wherever the model actually lives. The model is placed
    # by `device_map="auto"`, so hard-coding "cuda" breaks on CPU-only runtimes.
    inputs = blip_processor(images=image, text=question, return_tensors="pt").to(blip_model.device)
    outputs = blip_model.generate(**inputs)
    return blip_processor.batch_decode(outputs, skip_special_tokens=True)[0]


# Only image hits (stored as .png paths) can be sent to the vision-language
# model; text hits are skipped here.
image_hits = [hit for hit in results if hit.endswith(".png")]
for image_hit in image_hits:
    caption = answer_with_image(image_hit, "What does this image show?")
    print("🤖 Answer:", caption)


config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/104k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/9.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]



preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/549 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

In [None]:
def pretty_print_text(text, file):
    """Write a text chunk to ``file`` one trimmed line at a time.

    Literal backslash-n sequences are turned into real newlines, then each
    pair of consecutive newlines is collapsed into one (a single pass). Every
    resulting line is stripped of surrounding whitespace and written with a
    trailing newline, and one extra blank line is written after the chunk.

    Args:
        text: The chunk to write.
        file: A writable text stream (anything exposing ``write``).
    """
    normalized = text.replace('\\n', '\n')
    normalized = normalized.replace('\n\n', '\n')
    pieces = normalized.split('\n')
    file.write(''.join(piece.strip() + '\n' for piece in pieces))
    # Blank separator line after the chunk.
    file.write('\n')

# Re-run the retrieval and persist a readable report to answer.txt:
# image hits get a model-generated description, text hits are written verbatim.
results = search("spectrogram diagram")
divider = '-' * 40

with open("answer.txt", "w", encoding="utf-8") as report:
    report.write("🔍 Search Results:\n\n")
    for hit in results:
        if not hit.endswith(".png"):
            # Plain text chunk: header, trimmed lines, then the divider.
            report.write("📄 Text Result:\n")
            pretty_print_text(hit, report)
            report.write(divider + '\n\n')
            continue

        # Image hit: ask the vision-language model to describe it.
        description = answer_with_image(hit, "What does this image show?")
        report.write(f"🖼️ Image: {hit}\n🤖 Answer:\n{description}\n{divider}\n\n")
