In [None]:
!pip install nltk

In [1]:
import os
import json
import math
import functools, operator
import joblib
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# For BLEU/METEOR scoring if needed
import nltk
nltk.download('punkt')  # ensure necessary nltk data is downloaded

# -------------------------------
# Configuration / Hyperparameters (adjust as needed)
# -------------------------------
# Model dimensions.
# The encoder is bidirectional, so the state it hands to the decoder is 2 * latent_dim wide.
latent_dim = 256
num_encoder_tokens = 2560   # width of one feature vector fed to the encoder
num_decoder_tokens = 1500   # decoder vocabulary size
time_steps_encoder = 20     # feature vectors per scene

# Decoding options.
search_type = 'greedy'      # or 'beam'
max_probability = -1        # for beam search tracking (if used)

# Locations of the saved model artifacts and the test data, relative to the working directory.
save_model_path = os.path.join(os.getcwd(), "model_final_2")
test_path = os.path.join(os.getcwd(), "testing_data")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -------------------------------
# Define the BiLSTM Encoder for Inference
# -------------------------------
class BiLSTMEncoder(nn.Module):
    """Bidirectional LSTM encoder that summarises a feature sequence.

    Args:
        input_size: Width of each input feature vector.
        latent_dim: Hidden size per direction; the returned states are
            ``2 * latent_dim`` wide because both directions are concatenated.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, latent_dim, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size, latent_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=True)

    def forward(self, x):
        """Encode ``x`` and return the final (hidden, cell) states.

        Args:
            x: Tensor of shape (batch, time_steps, input_size).

        Returns:
            Tuple ``(h, c)``, each of shape (1, batch, 2 * latent_dim), ready
            to be used as the initial state of a single-layer decoder LSTM.
        """
        _, (h, c) = self.lstm(x)
        # h and c have shape (num_layers * 2, batch, latent_dim), ordered layer
        # by layer with forward before backward. The last two entries therefore
        # belong to the final layer; for num_layers == 1 these are h[0] and h[1].
        h_cat = torch.cat((h[-2], h[-1]), dim=-1)  # (batch, 2 * latent_dim)
        c_cat = torch.cat((c[-2], c[-1]), dim=-1)
        # Add a leading layer dimension so the result looks like a
        # single-layer LSTM state.
        return h_cat.unsqueeze(0), c_cat.unsqueeze(0)

# -------------------------------
# Define the Decoder LSTM for Inference
# -------------------------------
class Decoder(nn.Module):
    """Token-level LSTM decoder: embedding -> LSTM -> vocabulary projection.

    Args:
        vocab_size: Number of tokens; index 0 is the padding index.
        embed_size: Width of the token embeddings.
        hidden_size: Hidden size of the LSTM (and input width of the projection).
        num_layers: Number of stacked LSTM layers.
        dropout: Accepted for signature compatibility; this class does not use it.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, dropout=0.5):
        super().__init__()
        # Attribute names (and creation order) are kept so saved state dicts load unchanged.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        """Run the decoder over token indices ``x`` starting from ``(hidden, cell)``.

        Args:
            x: Token indices of shape (batch, seq_len).
            hidden: Initial LSTM hidden state.
            cell: Initial LSTM cell state.

        Returns:
            ``(logits, h, c)`` where ``logits`` has shape
            (batch, seq_len, vocab_size) and ``h``/``c`` are the updated states.
        """
        token_vectors = self.embedding(x)  # (batch, seq_len, embed_size)
        lstm_out, next_state = self.lstm(token_vectors, (hidden, cell))
        next_hidden, next_cell = next_state
        return self.fc(lstm_out), next_hidden, next_cell

# -------------------------------
# Inference Wrapper Class
# -------------------------------
class VideoDescriptionInference(object):
    """Generate captions for pre-extracted video features with a saved model.

    Typical use: construct, call ``load_inference_models()`` once, then call
    ``test()`` to caption everything found under ``test_path``.

    Args:
        latent_dim: Per-direction hidden size of the encoder.
        num_encoder_tokens: Width of each input feature vector.
        num_decoder_tokens: Decoder vocabulary size.
        time_steps_encoder: Number of feature vectors per scene.
        max_probability: Stored for beam search tracking; not read by this class.
        save_model_path: Directory holding the tokenizer and the model weights.
        test_path: Directory holding the ``video`` and ``feat_test`` folders.
        search_type: ``'greedy'`` selects greedy decoding; any other value
            selects ``beam_search``.
    """

    def __init__(self, latent_dim, num_encoder_tokens, num_decoder_tokens,
                 time_steps_encoder, max_probability, save_model_path, test_path, search_type):
        self.latent_dim = latent_dim
        self.num_encoder_tokens = num_encoder_tokens
        self.num_decoder_tokens = num_decoder_tokens
        self.time_steps_encoder = time_steps_encoder
        self.max_probability = max_probability
        self.save_model_path = save_model_path
        self.test_path = test_path
        self.search_type = search_type

        # Populated by load_inference_models().
        self.inf_encoder_model = None
        self.inf_decoder_model = None
        self.tokenizer = None

    def load_inference_models(self):
        """Load the tokenizer and the encoder/decoder weights from ``save_model_path``.

        Both models are moved to the module-level ``device`` and put in eval mode.
        """
        # NOTE(security): joblib.load and torch.load unpickle their input, which
        # can execute arbitrary code. Only load artifacts from a trusted source.
        tokenizer_path = os.path.join(self.save_model_path, "tokenizer1500")
        with open(tokenizer_path, 'rb') as file:
            self.tokenizer = joblib.load(file)

        # Create model instances with matching architecture. The decoder hidden
        # size is 2 * latent_dim because the encoder concatenates both directions.
        self.inf_encoder_model = BiLSTMEncoder(self.num_encoder_tokens, self.latent_dim).to(device)
        self.inf_decoder_model = Decoder(self.num_decoder_tokens, embed_size=256,
                                         hidden_size=self.latent_dim * 2).to(device)

        encoder_weights = torch.load(os.path.join(self.save_model_path, "encoder_model_LSTM_LSTM.pth"),
                                     map_location=device)
        decoder_weights = torch.load(os.path.join(self.save_model_path, "decoder_model_LSTM_LSTM.pth"),
                                     map_location=device)
        self.inf_encoder_model.load_state_dict(encoder_weights)
        self.inf_decoder_model.load_state_dict(decoder_weights)
        self.inf_encoder_model.eval()
        self.inf_decoder_model.eval()
        print("Inference models loaded.")

    def index_to_word(self):
        """Return the inverse of ``tokenizer.word_index`` (index -> word).

        Indices 1 and 2 are mapped to ``'bos'`` / ``'eos'`` only when the
        tokenizer vocabulary does not define those words itself. Overwriting
        them unconditionally would replace real words whenever the tokenizer
        keeps its special tokens at other indices, and a prediction of the
        word at index 2 would then be mistaken for end-of-sentence.
        """
        word_index = self.tokenizer.word_index
        inv_map = {index: word for word, index in word_index.items()}
        if 'bos' not in word_index:
            inv_map[1] = 'bos'
        if 'eos' not in word_index:
            inv_map[2] = 'eos'
        return inv_map

    def greedy_search(self, feature, max_len=15):
        """Generate a caption for one scene by always taking the top-scoring token.

        Args:
            feature: Array of shape (time_steps_encoder, num_encoder_tokens).
            max_len: Maximum number of decoder steps to run.

        Returns:
            The generated words joined by single spaces (possibly empty).
        """
        # Add a batch dimension of 1: (1, T, D).
        f_tensor = torch.tensor(feature, dtype=torch.float).unsqueeze(0).to(device)

        inv_map = self.index_to_word()
        bos_index = self.tokenizer.word_index.get('bos', 1)
        target_seq = torch.tensor([[bos_index]], dtype=torch.long).to(device)  # (1, 1)

        words = []
        with torch.no_grad():
            hidden, cell = self.inf_encoder_model(f_tensor)
            for _ in range(max_len):
                output, hidden, cell = self.inf_decoder_model(target_seq, hidden, cell)
                # output: (1, 1, vocab_size) -> pick the best token id.
                y_hat = output.squeeze(1).argmax(dim=-1).item()
                if y_hat == 0:
                    # Padding predicted: emit nothing and feed the previous token
                    # again. The recurrent state has already advanced.
                    continue
                word = inv_map.get(y_hat)
                if word is None or word == 'eos':
                    break
                words.append(word)
                # The predicted token becomes the next decoder input.
                target_seq = torch.tensor([[y_hat]], dtype=torch.long).to(device)
        return " ".join(words).strip()

    def beam_search(self, feature):
        """Placeholder for beam search; currently delegates to ``greedy_search``."""
        # A real implementation would keep several hypotheses per step.
        return self.greedy_search(feature)

    def get_test_data(self):
        """Load the scene features for every video under ``test_path/video``.

        For a video ``<id>.<ext>`` the scene features are read from
        ``test_path/feat_test/<id>.npy``, ``<id>_2.npy``, ``<id>_3.npy``, ...
        stopping at the first file that does not exist. Videos without any
        feature file are skipped. Videos are processed in sorted filename
        order so the output is reproducible across runs and platforms.

        Returns:
            ``(X_test, X_test_ids)``: a list with one list of numpy arrays per
            video, and the list of matching video ids.
        """
        X_test = []
        X_test_ids = []
        feat_dir = os.path.join(self.test_path, 'feat_test')
        # os.listdir returns entries in arbitrary order; sort for determinism.
        for filename in sorted(os.listdir(os.path.join(self.test_path, 'video'))):
            video_id = os.path.splitext(filename)[0]
            featList = []
            scene_idx = 1
            while True:
                # The first scene has no numeric suffix; later ones use _2, _3, ...
                scene_name = f"{video_id}.npy" if scene_idx == 1 else f"{video_id}_{scene_idx}.npy"
                path = os.path.join(feat_dir, scene_name)
                if not os.path.exists(path):
                    break
                featList.append(np.load(path))
                scene_idx += 1
            if featList:
                X_test.append(featList)
                X_test_ids.append(video_id)
        return X_test, X_test_ids

    def test(self):
        """Caption every scene of every test video and write the results to disk.

        Writes ``test_path/test_predictions.txt`` with one line per video in
        the form ``video_id,[caption1, caption2, ...]`` (the Python list repr).
        """
        X_test, X_test_ids = self.get_test_data()
        # Pick the decoding routine once instead of once per scene.
        search = self.greedy_search if self.search_type == 'greedy' else self.beam_search
        output_path = os.path.join(self.test_path, "test_predictions.txt")
        with open(output_path, 'w') as out_file:
            for video_id, scene_feats in zip(X_test_ids, X_test):
                captions = [
                    search(feat.reshape(self.time_steps_encoder, self.num_encoder_tokens))
                    for feat in scene_feats
                ]
                # Write one line per video: video_id, [caption1, caption2, ...]
                out_file.write(f"{video_id},{captions}\n")
        print(f"Predictions saved to {output_path}")

if __name__ == "__main__":
    # Build the inference wrapper from the module-level configuration,
    # load the saved models, and caption the test set.
    inference = VideoDescriptionInference(
        latent_dim, num_encoder_tokens, num_decoder_tokens, time_steps_encoder,
        max_probability, save_model_path, test_path, search_type,
    )
    inference.load_inference_models()
    inference.test()

    # Read the predictions back and print them as {video_id: captions_text}.
    test_result = {}
    pred_file = os.path.join(test_path, "test_predictions.txt")
    with open(pred_file, "r") as f:
        for line in f:
            # Everything before the first comma is the video id; the remainder
            # (which may itself contain commas) is the caption list text.
            vid, sep, pred = line.strip().partition(",")
            if sep:
                test_result[vid] = pred
    print(test_result)


ModuleNotFoundError: No module named 'nltk'