In [35]:
import os
import glob
from PIL import Image

# Check if the dataset is already downloaded
if not os.path.exists("./Flickr8k_Dataset"):
    print("Dataset not found. Downloading...")
    # Download the Kaggle dataset
    !kaggle datasets download -d adityajn105/flickr8k -p ./Flickr8k_Dataset --unzip
else:
    print("Dataset already exists. Skipping download.")

# Path to Kaggle Flickr8k images
dataset_path = "./Flickr8k_Dataset/Images"

# Make sure the dataset path exists
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Expected dataset path '{dataset_path}' not found.")

# Take 10 images
image_files = sorted(glob.glob(os.path.join(dataset_path, "*.jpg")))[:10]

# Print the selected files
print(f"Found {len(image_files)} images. Selected files:")
print(image_files)

Dataset already exists. Skipping download.
Found 10 images. Selected files:
['./Flickr8k_Dataset/Images/1000268201_693b08cb0e.jpg', './Flickr8k_Dataset/Images/1001773457_577c3a7d70.jpg', './Flickr8k_Dataset/Images/1002674143_1b742ab4b8.jpg', './Flickr8k_Dataset/Images/1003163366_44323f5815.jpg', './Flickr8k_Dataset/Images/1007129816_e794419615.jpg', './Flickr8k_Dataset/Images/1007320043_627395c3d8.jpg', './Flickr8k_Dataset/Images/1009434119_febe49276a.jpg', './Flickr8k_Dataset/Images/1012212859_01547e3f17.jpg', './Flickr8k_Dataset/Images/1015118661_980735411b.jpg', './Flickr8k_Dataset/Images/1015584366_dfcec3c85a.jpg']


In [36]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import os
import glob

# Caption the first 10 Flickr8k images with the BLIP base captioning model.
# Produces `captions`, a list of strings in the same order as `image_files`.

# Single source of truth for the checkpoint so processor and model cannot drift
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Load BLIP model and processor
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# Using CPU
device = torch.device("cpu")
model.to(device)

# Path to Kaggle Flickr8k images
dataset_path = "./Flickr8k_Dataset/Images"

# Ensure the dataset path exists
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Expected dataset path '{dataset_path}' not found.")

# Take exactly 10 images
image_files = sorted(glob.glob(os.path.join(dataset_path, "*.jpg")))[:10]

# An empty selection would otherwise yield zero captions without any error
if not image_files:
    raise FileNotFoundError(f"No images found in {dataset_path}")

# Print the selected files
print(f"Found {len(image_files)} images. Selected files:")
print(image_files)

# Generate captions for the selected images
captions = []
for i, img_path in enumerate(image_files):
    # Open via a context manager so the file handle is released right away;
    # convert() returns a separate in-memory image that stays valid afterwards.
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")

    # Preprocess to PyTorch tensors and move them to the model's device
    inputs = processor(images=image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    # out[0] is the generated token sequence for the single input image
    caption = processor.decode(out[0], skip_special_tokens=True)
    captions.append(caption)
    print(f"Image {i} caption: {caption}")

Found 10 images. Selected files:
['./Flickr8k_Dataset/Images/1000268201_693b08cb0e.jpg', './Flickr8k_Dataset/Images/1001773457_577c3a7d70.jpg', './Flickr8k_Dataset/Images/1002674143_1b742ab4b8.jpg', './Flickr8k_Dataset/Images/1003163366_44323f5815.jpg', './Flickr8k_Dataset/Images/1007129816_e794419615.jpg', './Flickr8k_Dataset/Images/1007320043_627395c3d8.jpg', './Flickr8k_Dataset/Images/1009434119_febe49276a.jpg', './Flickr8k_Dataset/Images/1012212859_01547e3f17.jpg', './Flickr8k_Dataset/Images/1015118661_980735411b.jpg', './Flickr8k_Dataset/Images/1015584366_dfcec3c85a.jpg']
Image 0 caption: a little girl in a pink dress
Image 1 caption: two dogs playing on the road
Image 2 caption: a child sitting in the grass
Image 3 caption: a woman laying on a bench
Image 4 caption: man wearing a hat
Image 5 caption: a young girl climbing on a rope
Image 6 caption: a dog running in a field
Image 7 caption: a dog playing with a ball
Image 8 caption: a young boy is walking down the street
Image 9 c

In [37]:
# NOTE: this whole cell is a single triple-quoted string literal, so none of
# the CLIP code below is executed. Because the string is the cell's only
# expression, the notebook echoes its repr as the cell output.
# The text inside references `torch` and `device` without importing/defining
# them, so if it is ever re-enabled it will rely on those names already
# existing in the kernel from an earlier cell.
# NOTE(review): the progress message inside hard-codes "/10" rather than
# using len(image_files) — worth revisiting if this is re-enabled.
'''
from transformers import CLIPProcessor, CLIPModel
import glob
import os
from PIL import Image

# Load CLIP
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
clip_model.to(device)

# Get the actual image files from your dataset
dataset_path = "./Flickr8k_Dataset/Images"
image_files = sorted(glob.glob(os.path.join(dataset_path, "*.jpg")))[:10]

# Verify we have images
if len(image_files) == 0:
    raise FileNotFoundError(f"No images found in {dataset_path}")

print(f"Processing {len(image_files)} images...")

clip_embeddings = []
for i, img_path in enumerate(image_files):
    print(f"Processing image {i+1}/10: {os.path.basename(img_path)}")
    image = Image.open(img_path).convert("RGB")
    
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        emb = clip_model.get_image_features(**inputs)
    clip_embeddings.append(emb.cpu().numpy())
    print(f"Image {i} CLIP embedding shape: {emb.shape}")

print("CLIP embeddings generated successfully!")
'''

'\nfrom transformers import CLIPProcessor, CLIPModel\nimport glob\nimport os\nfrom PIL import Image\n\n# Load CLIP\nclip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")\nclip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")\nclip_model.to(device)\n\n# Get the actual image files from your dataset\ndataset_path = "./Flickr8k_Dataset/Images"\nimage_files = sorted(glob.glob(os.path.join(dataset_path, "*.jpg")))[:10]\n\n# Verify we have images\nif len(image_files) == 0:\n    raise FileNotFoundError(f"No images found in {dataset_path}")\n\nprint(f"Processing {len(image_files)} images...")\n\nclip_embeddings = []\nfor i, img_path in enumerate(image_files):\n    print(f"Processing image {i+1}/10: {os.path.basename(img_path)}")\n    image = Image.open(img_path).convert("RGB")\n\n    inputs = clip_processor(images=image, return_tensors="pt").to(device)\n    with torch.no_grad():\n        emb = clip_model.get_image_features(**inputs)\n    clip_embeddings

In [38]:
import cv2

# Count frontal faces in each selected image with OpenCV's Haar cascade.
# Produces `haar_features`, a list of face counts aligned with `image_files`.

# Load Haar cascade
cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(cascade_path)
# CascadeClassifier does not raise on a bad path; it just ends up empty, and
# the failure would otherwise surface later inside detectMultiScale.
if face_cascade.empty():
    raise RuntimeError(f"Failed to load Haar cascade from '{cascade_path}'")

haar_features = []
for i, img_path in enumerate(image_files):  # Use image_files directly
    img = cv2.imread(img_path)
    # cv2.imread signals an unreadable/missing file by returning None rather
    # than raising; check here so the error names the offending file.
    if img is None:
        raise OSError(f"cv2.imread could not read image '{img_path}'")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Same detector settings as before, spelled out by keyword for clarity
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)
    haar_features.append(len(faces))
    print(f"Image {i} number of faces detected: {len(faces)}")

Image 0 number of faces detected: 0
Image 1 number of faces detected: 0
Image 2 number of faces detected: 0
Image 3 number of faces detected: 0
Image 4 number of faces detected: 0
Image 5 number of faces detected: 1
Image 6 number of faces detected: 0
Image 7 number of faces detected: 0
Image 8 number of faces detected: 1
Image 9 number of faces detected: 0


In [39]:
import pandas as pd

# Combine the per-image results gathered in earlier cells into one table
# (one row per image) and write it to disk without the index column.
df = pd.DataFrame(
    dict(
        image_path=image_files,  # paths of the selected images
        caption=captions,        # generated caption for each image
        num_faces=haar_features, # face count for each image
    )
)

df.to_csv(path_or_buf="friday_output.csv", index=False)
print("Saved CSV to friday_output.csv")

Saved CSV to friday_output.csv
