<a href="https://colab.research.google.com/github/Kausarfatima186/CODSOFT/blob/main/Codsoft_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install tensorflow numpy matplotlib




In [3]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.utils import to_categorical
import numpy as np
import string

# Function to preprocess the image
def preprocess_image(image_path):
    image = load_img(image_path, target_size=(224, 224))  # Load the image and resize to 224x224
    image = img_to_array(image)  # Convert to array
    image = preprocess_input(image)  # Preprocess for ResNet50
    return np.expand_dims(image, axis=0)  # Add batch dimension

# Function to extract features using ResNet50
def extract_features(image_path):
    resnet = ResNet50(weights="imagenet", include_top=False, pooling='avg')  # Load ResNet50 model
    image = preprocess_image(image_path)  # Preprocess the image
    features = resnet.predict(image)  # Extract features
    return features

# Path to the single image
image_path = "/content/cat.jpg"  # Replace this with the path to your image

# Single caption for the image
caption = "A cat sitting on a sofa."  # Replace this with your desired caption

# Vocabulary for tokenizing the caption
vocab = {"<start>": 0, "<end>": 1, "A": 2, "cat": 3, "sitting": 4, "on": 5, "a": 6, "sofa": 7}

# Preprocess the caption (remove punctuation and convert to tokens)
caption_cleaned = caption.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
caption_tokens = [vocab["<start>"]] + [vocab[word] for word in caption_cleaned.split()] + [vocab["<end>"]]
caption_tokens = np.array(caption_tokens)  # Convert to numpy array

# Extract features for the single image
image_features = extract_features(image_path)

# Define the RNN model
embedding_dim = 50
vocab_size = len(vocab)

model = Sequential([
    Dense(embedding_dim, input_shape=(image_features.shape[1],), activation='relu'),  # Embed image features
    RepeatVector(len(caption_tokens) - 1),  # Repeat the vector for each word in the caption
    LSTM(256, return_sequences=True),  # LSTM layer to generate sequences
    TimeDistributed(Dense(vocab_size, activation='softmax'))  # Output layer with vocab_size dimensions
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Prepare data for training
image_features = np.expand_dims(image_features[0], axis=0)  # Add batch dimension to image features
caption_input = caption_tokens[:-1].reshape((1, -1))  # Input caption (excluding last token)

# One-hot encode the output caption
caption_output = caption_tokens[1:].reshape((-1, 1))  # Output caption (excluding first token)
caption_output = to_categorical(caption_output, num_classes=vocab_size).reshape((1, len(caption_output), vocab_size))  # One-hot encode

# Train the model
model.fit([image_features, caption_input], caption_output, epochs=100, verbose=1)

# Function to generate a caption
def generate_caption(model, image_features, vocab):
    reverse_vocab = {v: k for k, v in vocab.items()}  # Reverse vocabulary for decoding
    generated = [vocab["<start>"]]  # Start token

    for _ in range(20):  # Max caption length
        predictions = model.predict([image_features, np.array([generated])])
        next_word = np.argmax(predictions[0, len(generated)-1, :])  # Get the word with the highest probability
        generated.append(next_word)
        if next_word == vocab["<end>"]:  # Stop if end token is generated
            break

    return " ".join([reverse_vocab[token] for token in generated[1:-1]])  # Convert tokens back to words

# Test the model with the same image
predicted_caption = generate_caption(model, image_features, vocab)
print("Generated Caption:", predicted_caption)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 2.0630
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 1.9419
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 1.8725
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 1.8169
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 1.7475
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 1.6763
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 1.6040
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 1.5253
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - loss: 1.4398
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - loss: 1.3504
Epoch 11/100
[1m1/1[0