In [None]:
from datasets import load_dataset
# Load the "jxie/flickr8k" dataset through the `datasets` library.
# NOTE(review): none of the later cells visible in this notebook reference
# `dataset`; they read from the local ./flickr8k_images and ./flickr8k_text
# folders instead, and no visible cell creates those folders — confirm how
# they are meant to be produced on a fresh kernel.
dataset = load_dataset("jxie/flickr8k")


In [None]:
# Move everything inside the nested Flicker8k_Dataset folder up one level into
# ./flickr8k_images/ so the images sit directly in that directory.
# NOTE(review): not idempotent — once the nested folder has been emptied or
# removed (see the following cell) the glob matches nothing and `mv` errors.
!mv ./flickr8k_images/Flicker8k_Dataset/* ./flickr8k_images/


In [None]:
# Delete the nested Flicker8k_Dataset folder (recursively).
# NOTE(review): `rm -r` without `-f` reports an error if the folder is already
# gone, e.g. when this cell is run a second time.
!rm -r ./flickr8k_images/Flicker8k_Dataset


In [None]:
import os
# Read the directory once and sort the entries: os.listdir returns names in
# arbitrary order, so an unsorted "first 10" preview is not reproducible.
image_dir_entries = sorted(os.listdir('./flickr8k_images'))
print("Number of images:", len(image_dir_entries))
print("First 10 images:", image_dir_entries[:10])


In [None]:
captions_file = './flickr8k_text/Flickr8k.token.txt'

# Build a mapping: image file name -> list of its captions.
# Every non-empty line is parsed as "<image>#<suffix>\t<caption>"; the part of
# the first field after '#' is discarded so all captions of one image share a key.
image_captions = {}
# Explicit encoding so parsing does not depend on the platform's default locale.
with open(captions_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Split on the first tab only: a caption that itself contains a tab is
        # kept whole instead of breaking the two-value unpacking. A line with
        # no tab at all still raises ValueError, as before.
        img_cap, caption = line.split('\t', 1)
        img_name = img_cap.split('#', 1)[0]
        image_captions.setdefault(img_name, []).append(caption)

# Cell output: peek at the first two entries.
list(image_captions.items())[:2]


In [None]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus (nltk.download is called on every run of
# this cell), then collect the English stop words into a set for fast
# membership tests during caption cleaning.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

def preprocess_caption(text):
    """Normalise a caption string for text vectorisation.

    The caption is lower-cased, every character other than ``a``-``z`` or
    whitespace is deleted (digits and punctuation disappear without leaving
    a space behind), tokens present in the module-level ``stop_words`` set
    are dropped, and the remaining tokens are joined with single spaces.

    Args:
        text: Raw caption string.

    Returns:
        The cleaned caption; an empty string if no token survives.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_tokens = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Clean the captions of EVERY image, not just the first five: later cells
# build the TF-IDF corpus from all of image_captions, so a partial pass would
# mix cleaned and raw captions in the same corpus.
# Re-running is harmless: cleaning an already-cleaned caption returns it unchanged.
for img in list(image_captions):
    image_captions[img] = [preprocess_caption(c) for c in image_captions[img]]

# Print only a handful of entries to keep the cell output short.
for img, caps in list(image_captions.items())[:5]:
    print(img, caps)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One document per image: all captions of that image joined into one string,
# in image_captions' iteration order.
all_captions = list(map(' '.join, image_captions.values()))

# Fit TF-IDF on the whole corpus with the vocabulary capped at 1000 terms,
# then convert the sparse result into a dense array.
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_sparse = vectorizer.fit_transform(all_captions)
X_text = tfidf_sparse.toarray()

print("TF-IDF feature shape:", X_text.shape)


In [None]:
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
import numpy as np

# ResNet50 with ImageNet weights, used below only through predict():
# include_top=False drops the classification head, and pooling='avg' applies
# global average pooling to the last convolutional output, so each image
# yields a single feature vector.
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

def get_image_features(img_path):
    """Return the feature vector of the image stored at ``img_path``.

    The image is loaded at 224x224, converted to an array, wrapped into a
    batch of one, passed through ``preprocess_input`` and then through the
    module-level ``model``. The prediction is flattened to one dimension.

    Args:
        img_path: Path of the image file to load.

    Returns:
        1-D array holding the model output for this image.
    """
    pil_img = image.load_img(img_path, target_size=(224,224))
    batch = np.expand_dims(image.img_to_array(pil_img), axis=0)
    batch = preprocess_input(batch)
    return model.predict(batch, verbose=0).flatten()

# Only the first 100 captioned images (in image_captions' iteration order)
# are run through the feature extractor.
image_files = list(image_captions)[:100]

# One feature vector per image, stacked into a 2-D array (one row per image).
image_paths = ['./flickr8k_images/' + name for name in image_files]
X_image = np.array([get_image_features(path) for path in image_paths])
print("Image feature shape:", X_image.shape)


In [None]:

# Keep just the TF-IDF rows that belong to the images with extracted features.
# NOTE(review): this relies on X_text rows and image_files both following
# image_captions' iteration order — confirm nothing reorders either of them.
n_images = len(image_files)
X_text_subset = X_text[:n_images]

# Put text and image features side by side: one row per image, text columns
# first, image columns after.
X_combined = np.concatenate((X_text_subset, X_image), axis=1)
print("Combined feature shape:", X_combined.shape)


In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Reduce the combined features to 50 principal components before clustering.
# random_state is fixed because PCA's default 'auto' solver can select a
# randomized SVD for wide inputs; without a seed the projection — and so the
# cluster assignment below — may differ from run to run.
pca = PCA(n_components=50, random_state=42)
X_reduced = pca.fit_transform(X_combined)

# n_init is spelled out so the number of k-means restarts does not depend on
# the default of whichever scikit-learn version is installed.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_reduced)

print("Cluster labels for first 10 images:", clusters[:10])


In [None]:
import matplotlib.pyplot as plt
import random
from PIL import Image

# Group image file names by the cluster label assigned to them.
cluster_to_images = {}
for img_name, label in zip(image_files, clusters):
    cluster_to_images.setdefault(label, []).append(img_name)

# Number of example images per cluster; also the column count of each figure.
max_samples = 5
# Dedicated, seeded generator: the same examples are drawn on every run and
# the global `random` state is left untouched.
sample_rng = random.Random(42)

# Walk the clusters in ascending label order so the output order is stable.
for cluster_label in sorted(cluster_to_images):
    imgs = cluster_to_images[cluster_label]
    print(f"\nCluster {cluster_label}:")
    sample_imgs = sample_rng.sample(imgs, min(max_samples, len(imgs)))
    plt.figure(figsize=(15,3))
    for i, img_name in enumerate(sample_imgs):
        img_path = './flickr8k_images/' + img_name
        img = Image.open(img_path)
        plt.subplot(1, max_samples, i+1)
        plt.imshow(img)
        plt.axis('off')
        # Title each image with (up to) its first two captions.
        plt.title('\n'.join(image_captions[img_name][:2]), fontsize=8)
    plt.show()


In [None]:
import matplotlib.pyplot as plt
from PIL import Image
import math

# Grid figure: one row per cluster, up to `images_per_cluster` images per row.
images_per_cluster = 5
cluster_labels = sorted(set(clusters))
n_rows = len(cluster_labels)

plt.figure(figsize=(20, n_rows*4))

for row, cluster_label in enumerate(cluster_labels):
    # Images assigned to this cluster, in image_files order.
    members = []
    for name, lbl in zip(image_files, clusters):
        if lbl == cluster_label:
            members.append(name)

    # Show only the leading members, not a random sample.
    shown = members[:images_per_cluster]

    for col, img_name in enumerate(shown):
        # Subplot positions are 1-based and counted row by row.
        cell = row*images_per_cluster + col + 1
        plt.subplot(n_rows, images_per_cluster, cell)
        picture = Image.open('./flickr8k_images/' + img_name)
        plt.imshow(picture)
        plt.axis('off')
        # Title: (up to) the first two captions of the image, one per line.
        caption_title = '\n'.join(image_captions[img_name][:2])
        plt.title(caption_title, fontsize=8)

plt.suptitle("Flickr8k Image Clusters", fontsize=20)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the combined features onto two principal components for plotting.
# This PCA is fitted on X_combined directly, independently of the 50-component
# projection used for clustering. random_state is fixed because the default
# 'auto' solver can select a randomized SVD for wide inputs, which would make
# the scatter positions vary between runs.
pca_2d = PCA(n_components=2, random_state=42)
X_2d = pca_2d.fit_transform(X_combined)

plt.figure(figsize=(10,8))
for cluster_label in sorted(set(clusters)):
    # Boolean row mask selecting the points assigned to this cluster.
    in_cluster = clusters == cluster_label
    plt.scatter(X_2d[in_cluster, 0], X_2d[in_cluster, 1], label=f'Cluster {cluster_label}', alpha=0.6)

plt.title("Scatter Plot of Image + Caption Clusters (2D)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.show()
