In [4]:
import os, re, json, pickle
from tqdm import tqdm
import numpy as np


In [1]:
from PIL import Image
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input as inception_preprocess
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input as resnet_preprocess

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


ImportError: Traceback (most recent call last):
  File "C:\Users\uamaha01\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [None]:
# cell: choose encoder
encoder_name = 'inceptionv3'   # change to 'resnet50' if you prefer

if encoder_name.lower() == 'inceptionv3':
    cnn_model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')
    preprocess_fn = inception_preprocess
    target_size = (299, 299)
elif encoder_name.lower() == 'resnet50':
    cnn_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
    preprocess_fn = resnet_preprocess
    target_size = (224, 224)
else:
    raise ValueError("encoder_name must be 'inceptionv3' or 'resnet50'")

# This model returns a fixed-size vector per image (e.g. 2048)
print("Encoder ready. Output shape:", cnn_model.output_shape)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 0us/step
Encoder ready. Output shape: (None, 2048)


In [None]:
# cell: test feature extraction on one image
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np
# ...existing code...
sample_img_path = r"C:\Users\uamaha01\Downloads\AI project\Images\99171998_7cc800ceef.jpg"  # <- change to an actual path
# ...existing code... 

img = load_img(sample_img_path, target_size=target_size)
x  = img_to_array(img)
x  = np.expand_dims(x, axis=0)
x  = preprocess_fn(x)            # model-specific preprocessing
feat = cnn_model.predict(x, verbose=0)
print("feature shape:", feat.shape)   # expect (1, 2048) or (1, 512) depending on model


feature shape: (1, 2048)


In [None]:
def extract_features_from_dir(image_dir, out_file='features_inception.npz'):
    files = [f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg','.jpeg','.png'))]
    files.sort()
    features = {}
    for fname in tqdm(files, desc="Extracting"):
        img_id = os.path.splitext(fname)[0]
        p = os.path.join(image_dir, fname)
        img = load_img(p, target_size=target_size)
        x = img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_fn(x)
        feat = cnn_model.predict(x, verbose=0).squeeze()
        features[img_id] = feat
    # save compressed
    np.savez_compressed(out_file, **features)
    print("Saved features to:", out_file)

In [None]:
# cell: load captions (adapt path/format to your captions file)
import re, string

def clean_caption_text(txt):
    txt = txt.lower()
    txt = re.sub(r"[^a-z0-9\s]", "", txt)   # keep only alphanumerics + spaces
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt

def load_captions_file(captions_file):
    descriptions = {}
    with open(captions_file, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    for line in lines:
        if not line: 
            continue
        # supports common formats: "img.jpg,caption..." or "img.jpg\tcaption..."
        if '\t' in line:
            img_part, caption = line.split('\t',1)
        else:
            parts = line.split(',', 1)
            if len(parts) < 2:
                continue
            img_part, caption = parts
        img_id = os.path.basename(img_part).split('.')[0]
        caption = clean_caption_text(caption)
        caption = "startseq " + caption + " endseq"
        descriptions.setdefault(img_id, []).append(caption)
    return descriptions

# Usage
# descriptions = load_captions_file("/path/to/Flickr8k/captions.txt")
# print("Example:", list(descriptions.items())[:1])


In [None]:
# cell: tokenizer + stats
from tensorflow.keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions, num_words=None, oov_token='unk'):
    all_caps = [c for caps in descriptions.values() for c in caps]
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(all_caps)
    return tokenizer

def get_max_length(descriptions):
    all_caps = [c for caps in descriptions.values() for c in caps]
    return max(len(c.split()) for c in all_caps)

# Example usage (after descriptions built)
# tokenizer = create_tokenizer(descriptions, num_words=10000)
# vocab_size = min(len(tokenizer.word_index) + 1, 10000)  # if num_words set, cap vocab
# max_length = get_max_length(descriptions)
# print("Vocab size:", vocab_size)
# print("Max caption length:", max_length)


In [None]:
captions_file = r"C:\Users\uamaha01\Downloads\AI project\captions.txt\captions.txt"  # update path if needed
descriptions = load_captions_file(captions_file)
print("Loaded captions for", len(descriptions), "images")

Loaded captions for 8092 images


In [None]:
# Extract features from all images in the folder and save to .npz
extract_features_from_dir(
    image_dir=r"C:\Users\uamaha01\Downloads\AI project\Images",
    out_file=r"C:\Users\uamaha01\Downloads\AI project\flickr8k_inception_features.npz"
)

# Load the features file
data = np.load(r"C:\Users\uamaha01\Downloads\AI project\flickr8k_inception_features.npz", allow_pickle=True)
print("sample feature keys:", data.files[:5])

# Print a sample caption (make sure 'descriptions' is loaded)
sample_id = data.files[0]
print("sample captions:", descriptions.get(sample_id, [])[:3])

Extracting:   0%|          | 0/8091 [00:00<?, ?it/s]

Extracting: 100%|██████████| 8091/8091 [23:37<00:00,  5.71it/s]


Saved features to: C:\Users\uamaha01\Downloads\AI project\flickr8k_inception_features.npz
sample feature keys: ['1000268201_693b08cb0e', '1001773457_577c3a7d70', '1002674143_1b742ab4b8', '1003163366_44323f5815', '1007129816_e794419615']
sample captions: ['startseq a child in a pink dress is climbing up a set of stairs in an entry way endseq', 'startseq a girl going into a wooden building endseq', 'startseq a little girl climbing into a wooden playhouse endseq']


after that need to be confirmd

In [None]:
# Create tokenizer and get vocabulary size
tokenizer = create_tokenizer(descriptions, num_words=10000)
vocab_size = min(len(tokenizer.word_index) + 1, 10000)
max_length = get_max_length(descriptions)
print("Vocab size:", vocab_size)
print("Max caption length:", max_length)

Vocab size: 8833
Max caption length: 38


In [None]:
# Save tokenizer and max_length for Streamlit app
import pickle

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('max_length.txt', 'w') as f:
    f.write(str(max_length))

print("Saved tokenizer as tokenizer.pkl")
print("Saved max_length as max_length.txt")

Saved tokenizer as tokenizer.pkl
Saved max_length as max_length.txt


In [None]:
from tensorflow.keras.utils import to_categorical

def create_sequences(tokenizer, max_length, descriptions, features, vocab_size):
    X1, X2, y = [], [], []
    for img_id, caps in descriptions.items():
        feature = features[img_id]
        for cap in caps:
            seq = tokenizer.texts_to_sequences([cap])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

# Usage example:
# features = dict(np.load(r"C:\Users\uamaha01\Downloads\AI project\flickr8k_inception_features.npz", allow_pickle=True))
# X1, X2, y = create_sequences(tokenizer, max_length, descriptions, features, vocab_size)
# print(X1.shape, X2.shape, y.shape)

In [None]:
# Prepare training data (skip missing features)
def create_sequences(tokenizer, max_length, descriptions, features, vocab_size):
    X1, X2, y = [], [], []
    for img_id, caps in descriptions.items():
        if img_id not in features:
            continue  # skip if image feature is missing
        feature = features[img_id]
        for cap in caps:
            seq = tokenizer.texts_to_sequences([cap])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(feature)
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

# Load features as a dictionary
features = dict(np.load(r"C:\Users\uamaha01\Downloads\AI project\flickr8k_inception_features.npz", allow_pickle=True))

# Prepare training data
X1, X2, y = create_sequences(tokenizer, max_length, descriptions, features, vocab_size)

print("Image features shape:", X1.shape)
print("Input sequence shape:", X2.shape)
print("Output (y) shape:", y.shape)

Image features shape: (476960, 2048)
Input sequence shape: (476960, 38)
Output (y) shape: (476960, 8833)


In [None]:
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.models import Model

# Image feature extractor model
inputs1 = Input(shape=(X1.shape[1],))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Sequence model
inputs2 = Input(shape=(X2.shape[1],))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder (combine)
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.summary()

In [None]:
# Train the model (this may take a long time depending on your data and hardware)
model.fit([X1, X2], y, epochs=20, batch_size=256, verbose=1)

Epoch 1/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m548s[0m 291ms/step - loss: 3.8248
Epoch 2/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m542s[0m 291ms/step - loss: 3.0763
Epoch 3/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m577s[0m 299ms/step - loss: 2.8230
Epoch 4/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 277ms/step - loss: 2.6590
Epoch 5/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m573s[0m 307ms/step - loss: 2.5326
Epoch 6/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m593s[0m 318ms/step - loss: 2.4352
Epoch 7/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m705s[0m 378ms/step - loss: 2.3551
Epoch 8/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m697s[0m 374ms/step - loss: 2.2910
Epoch 9/20
[1m1864/1864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m646s[0m 346ms/step - loss: 2.2382
Epoch 10/20
[1m1864/1864[0m [32m━━

<keras.src.callbacks.history.History at 0x1d863bc7250>

In [None]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.9.1-cp311-cp311-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 4.2 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 3.4 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 3.4 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 3.4 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 3.4 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 1.1 MB/s  0:00:01
Downloading regex-2025.9.1-cp311-cp311-win_amd64.whl (276 kB)
Installing collected packages: regex, nltk

   ---------------------------------------- 0/2 [regex]
   ---------------------------------------- 0/2 [regex]
   -------------------- 

In [None]:
from nltk.translate.bleu_score import corpus_bleu

def generate_caption(model, tokenizer, photo_feature, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo_feature, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = None
        for w, idx in tokenizer.word_index.items():
            if idx == yhat:
                word = w
                break
        if word is None or word == 'endseq':
            break
        in_text += ' ' + word
    return in_text.replace('startseq', '').replace('endseq', '').strip()

# Prepare references and predictions
actual, predicted = [], []
for img_id in descriptions.keys():
    if img_id not in features:
        continue
    photo_feature = features[img_id]
    if len(photo_feature.shape) == 1:
        photo_feature = photo_feature.reshape((1, -1))
    y_pred = generate_caption(model, tokenizer, photo_feature, max_length)
    references = [cap.split() for cap in descriptions[img_id]]
    actual.append(references)
    predicted.append(y_pred.split())

# Calculate BLEU scores
print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.33, 0.33, 0.33, 0)))
print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

BLEU-1: 0.545751
BLEU-2: 0.374499
BLEU-3: 0.255068
BLEU-4: 0.169695


In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Caption generation function
def generate_caption(model, tokenizer, photo_feature, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo_feature, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = None
        for w, idx in tokenizer.word_index.items():
            if idx == yhat:
                word = w
                break
        if word is None or word == 'endseq':
            break
        in_text += ' ' + word
    return in_text.replace('startseq', '').replace('endseq', '').strip()

# --- EVALUATION ON A SMALL SUBSET FOR SPEED ---
import time
start = time.time()

actual, predicted = [], []
img_ids = list(descriptions.keys())[:50]  # Evaluate on first 50 images for speed

for idx, img_id in enumerate(img_ids):
    if img_id not in features:
        continue
    photo_feature = features[img_id]
    if len(photo_feature.shape) == 1:
        photo_feature = photo_feature.reshape((1, -1))
    y_pred = generate_caption(model, tokenizer, photo_feature, max_length)
    references = [cap.split() for cap in descriptions[img_id]]
    actual.append(references)
    predicted.append(y_pred.split())
    if (idx+1) % 10 == 0:
        print(f"Evaluated {idx+1}/{len(img_ids)} images...")

print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.33, 0.33, 0.33, 0)))
print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
print("Evaluation time (seconds):", time.time() - start)

# --- GENERATE CAPTION FOR A NEW IMAGE ---
# Example: change the path to your test image
img_path = r"C:\Users\uamaha01\Downloads\AI project\activities-for-younger-kids_narrow.jpg"
img = load_img(img_path, target_size=target_size)
x = img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_fn(x)
photo_feature = cnn_model.predict(x, verbose=0)
caption = generate_caption(model, tokenizer, photo_feature, max_length)
print("Generated caption for your image:", caption)

# --- SAVE YOUR MODEL ---
model.save("caption_model.h5")
print("Model saved as caption_model.h5")

Evaluated 10/50 images...
Evaluated 20/50 images...
Evaluated 30/50 images...
Evaluated 40/50 images...
Evaluated 50/50 images...
BLEU-1: 0.535276
BLEU-2: 0.375026
BLEU-3: 0.250480
BLEU-4: 0.147616
Evaluation time (seconds): 67.23022937774658




Generated caption for your image: a boy in a blue shirt kicking a soccer ball
Model saved as caption_model.h5


In [None]:
model.save(r"C:\Users\uamaha01\Downloads\AI project\caption_model.h5")

NameError: name 'model' is not defined