<a href="https://colab.research.google.com/github/LucioJuniorMachado/Trabalho_Redes_Neurais/blob/main/2_modelos_multimodais.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Zero-shot object detection**

# Instalando as bibliotecas necessárias

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git

In [None]:
!pip install Pillow
!pip install matplotlib
!pip install opencv-python
import cv2
import skimage
import numpy as np
from PIL import Image
import requests
from io import BytesIO
from IPython.display import display # Import display if not already imported in the first cell

In [None]:
from transformers.utils import send_example_telemetry

# Send the example-usage telemetry record for this notebook.
# NOTE(review): the example name passed here refers to "grounding-dino-base",
# while the cells below load the "google/owlvit-base-patch32" checkpoint —
# confirm which example name is intended.
send_example_telemetry("IDEA-Research/grounding-dino-base", framework="pytorch")

# Carregando modelo pré-treinado e processador

In [None]:
from transformers import AutoProcessor, OwlViTForObjectDetection, OwlViTProcessor

# Load the pre-trained OWL-ViT detection model from the
# "google/owlvit-base-patch32" checkpoint. Only the checkpoint name is passed;
# no download timeout is configured here.
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

# Load the processor from the same checkpoint.
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")

In [None]:
from transformers import AutoProcessor, OwlViTForObjectDetection, OwlViTProcessor

# NOTE(review): this cell issues the same two from_pretrained calls as the
# previous cell and rebinds `model` and `processor` to freshly loaded objects
# from the same "google/owlvit-base-patch32" checkpoint — confirm whether both
# cells are needed.
# Only the checkpoint name is passed: no request_timeout and no revision.
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

# Processor loaded from the same checkpoint, again with the checkpoint name only.
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")

# Fazendo o download da imagem

In [None]:
# URL of the picture the text queries will be searched in.
url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRVnXoO798Cv_gfLX_zxvO3Lm4_CaK32J51NQ&s"

# Download the picture exactly once. The timeout keeps the cell from hanging
# forever, and raise_for_status() stops here on an HTTP error instead of
# letting PIL fail later while trying to decode an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Decode from the in-memory bytes (response.content has the transfer encoding
# already undone, unlike response.raw) and normalise to 3-channel RGB.
image = Image.open(BytesIO(response.content)).convert("RGB")

# `image_array` and `im` are kept because earlier versions of this cell
# defined them; both are now derived from the single download above.
image_array = np.asarray(image)
im = Image.fromarray(image_array)

# Show the picture in the notebook.
display(im)

# Text queries to search the image for
text_queries = ["human face", "pipe", "chair", "books"]

In [None]:
import torch

# Run on the CUDA device when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Run the text queries and the image through the processor, asking for
# PyTorch tensors, then move the resulting batch to the selected device.
inputs = processor(text=text_queries, images=image, return_tensors="pt")
inputs = inputs.to(device)

# List every entry of the batch together with its tensor shape.
for name, tensor in inputs.items():
    print(f"{name}: {tensor.shape}")


In [None]:
# Put the model on the selected device and switch it to evaluation mode.
model = model.to(device)
model.eval()

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Top-level entries first; the two nested sub-outputs are reported separately below.
nested_keys = {"text_model_output", "vision_model_output"}
for name, value in outputs.items():
    if name in nested_keys:
        continue
    print(f"{name}: shape of {value.shape}")

# Same report for each nested sub-output, preceded by its heading.
for heading, sub_output in (
    ("\nText model outputs", outputs.text_model_output),
    ("\nVision model outputs", outputs.vision_model_output),
):
    print(heading)
    for name, value in sub_output.items():
        print(f"{name}: shape of {value.shape}")

# Detectando objetos na imagem e desenhando as predições

In [None]:
import matplotlib.pyplot as plt

from transformers.image_utils import ImageFeatureExtractionMixin
mixin = ImageFeatureExtractionMixin()

# Resize the picture to the vision config's image_size for plotting.
# The resized copy gets its own name so that `image` still refers to the
# picture that was downloaded: re-running the preprocessing cell after this
# one then feeds the processor the same input as on the first run.
image_size = model.config.vision_config.image_size
image_resized = mixin.resize(image, image_size)
# Scale pixel values to floats by dividing by 255 for imshow.
input_image = np.asarray(image_resized).astype(np.float32) / 255.0

# Threshold to eliminate low probability predictions
score_threshold = 0.1

# torch.max over the last dimension returns a (values, indices) pair:
# per predicted box, the highest logit and the index of the query it belongs to.
logits = torch.max(outputs["logits"][0], dim=-1)
# Sigmoid of the best logit is used as the box score.
scores = torch.sigmoid(logits.values).cpu().detach().numpy()

# Index into text_queries for each box, and the predicted boxes of the first image.
labels = logits.indices.cpu().detach().numpy()
boxes = outputs["pred_boxes"][0].cpu().detach().numpy()

In [None]:
def plot_predictions(input_image, text_queries, scores, boxes, labels, score_threshold=0.1):
    """Draw the predicted boxes that pass the score threshold on top of the image.

    Args:
        input_image: Image data accepted by ``Axes.imshow``; it is stretched
            over the unit square so boxes can be drawn in the same coordinates.
        text_queries: Sequence of query strings, indexed by the entries of ``labels``.
        scores: One score per box.
        boxes: One ``(cx, cy, w, h)`` entry per box (centre, width, height),
            expressed in the unit-square coordinates used for the image.
        labels: One index into ``text_queries`` per box.
        score_threshold: Boxes whose score is below this value are not drawn.
            Passed explicitly so the function no longer depends on a
            notebook-level variable of the same name.

    Returns:
        The matplotlib Axes the image and boxes were drawn on.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 8))
    # extent=(0, 1, 1, 0) maps the image onto [0, 1] x [0, 1] with y growing downwards.
    ax.imshow(input_image, extent=(0, 1, 1, 0))
    ax.set_axis_off()

    for score, box, label in zip(scores, boxes, labels):
        if score < score_threshold:
            continue

        # Convert centre/size to the four edges of the rectangle.
        cx, cy, w, h = box
        left, right = cx - w / 2, cx + w / 2
        top, bottom = cy - h / 2, cy + h / 2

        # Closed outline: the first corner is repeated at the end.
        ax.plot([left, right, right, left, left],
                [top, top, bottom, bottom, top], "r")
        # Caption placed just under the bottom-left corner of the box.
        ax.text(
            left,
            bottom + 0.015,
            f"{text_queries[label]}: {score:1.2f}",
            ha="left",
            va="top",
            color="red",
            bbox={
                "facecolor": "white",
                "edgecolor": "red",
                "boxstyle": "square,pad=.3"
            })

    return ax

# The trailing semicolon keeps the notebook from printing the returned Axes repr.
plot_predictions(input_image, text_queries, scores, boxes, labels, score_threshold=score_threshold);

# **Zero-shot classification para classificação de imagens em diferentes categorias**

# Instalando as bibliotecas

In [None]:
! pip install -q "transformers[torch]" pillow
import cv2
import skimage
import numpy as np
from PIL import Image
import requests
from io import BytesIO
from IPython.display import display # Import display if not already imported in the first cell

# Instanciando a pipeline



In [None]:
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification, pipeline

# Checkpoint shared by the pipeline and by the standalone model/processor below.
checkpoint = "openai/clip-vit-base-patch32"

# Ready-to-use zero-shot image classification pipeline built on that checkpoint.
detector = pipeline(task="zero-shot-image-classification", model=checkpoint)

# Standalone model and processor loaded from the same checkpoint.
# Note: these rebind `model` and `processor`, which the object-detection
# section above bound to the OWL-ViT objects.
model = AutoModelForZeroShotImageClassification.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)

# Carregando a imagem

In [None]:
# URL of the picture to classify.
url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2pzAJyx7WdXZpkBpz389qQCAFcX45_G1ccg&s"

# Download the picture exactly once. The timeout keeps the cell from hanging
# forever, and raise_for_status() stops here on an HTTP error instead of
# letting PIL fail later while trying to decode an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Decode from the in-memory bytes (response.content has the transfer encoding
# already undone, unlike response.raw) and normalise to 3-channel RGB.
image = Image.open(BytesIO(response.content)).convert("RGB")

# `image_array` and `im` are kept because earlier versions of this cell
# defined them; both are now derived from the single download above.
image_array = np.asarray(image)
im = Image.fromarray(image_array)

# Show the picture in the notebook.
display(im)

# Fazendo as predições por classes

In [None]:
# Score the image against each candidate label. The labels are handed to the
# pipeline as text, so they are spelled as the intended English words
# ("motorcycle", not "motocycle").
predictions = detector(image, candidate_labels=["human being", "road", "car", "motorcycle"])
# Last expression of the cell: shows the pipeline's result in the notebook.
predictions