# Install dependencies

In [None]:
# Clone the WLASL recognition repository. Later cells read the class list and
# the I3D checkpoints from /content/WLASL-Recognition-and-Translation.
!git clone https://github.com/alanjeremiah/WLASL-Recognition-and-Translation.git
# Third-party packages. NOTE(review): python-dotenv is not imported by any cell
# visible in this notebook, and keytotext is only used in commented-out code.
!pip install gradio
!pip install opencv-python-headless
!pip install fer
!pip install python-dotenv
!pip install keytotext
!pip install boto3

Cloning into 'WLASL-Recognition-and-Translation'...
remote: Enumerating objects: 108, done.
remote: Counting objects: 100% (10/10), done.
remote: Compressing objects: 100% (9/9), done.
remote: Total 108 (delta 1), reused 0 (delta 0), pack-reused 98
Receiving objects: 100% (108/108), 366.47 MiB | 25.94 MiB/s, done.
Resolving deltas: 100% (20/20), done.
Updating files: 100% (48/48), done.
Collecting gradio
  Downloading gradio-4.39.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.1-py3-none-any.whl.metadata (26 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... done
Collecting gradio-client==1.1.1 (from gradio)
  Downloading gradio_client-1.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3

Collecting keytotext
  Downloading keytotext-2.3.2-py3-none-any.whl.metadata (6.4 kB)
Collecting wandb (from keytotext)
  Downloading wandb-0.17.5-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting pytorch_lightning (from keytotext)
  Downloading pytorch_lightning-2.3.3-py3-none-any.whl.metadata (21 kB)
Collecting datasets (from keytotext)
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting keybert (from keytotext)
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Collecting pyarrow>=15.0.0 (from datasets->keytotext)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->keytotext)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets->keytotext)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets->keytotext)
  Do

# Pytorch_I3D

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import os
import sys
from collections import OrderedDict

class MaxPool3dSamePadding(nn.MaxPool3d):
    """3D max pooling with input-dependent "same" padding.

    Before pooling, the input is zero-padded (``F.pad`` default) along the
    time, height and width axes so that every output axis has
    ``ceil(size / stride)`` elements. ``kernel_size`` and ``stride`` must be
    3-element sequences because they are indexed per axis.
    """

    def compute_pad(self, dim, s):
        """Return the total padding for axis ``dim`` given an input length ``s``.

        Args:
            dim: Axis index into ``kernel_size`` / ``stride`` (0=t, 1=h, 2=w).
            s: Input length along that axis.

        Returns:
            Non-negative number of elements to add (front + back combined).
        """
        remainder = s % self.stride[dim]
        # When the stride tiles the axis exactly the last window overhangs by
        # kernel - stride; otherwise it overhangs by kernel - remainder.
        covered = self.stride[dim] if remainder == 0 else remainder
        return max(self.kernel_size[dim] - covered, 0)

    def forward(self, x):
        """Pad ``x`` (a 5-D tensor: batch, channel, t, h, w) and max-pool it."""
        _, _, t, h, w = x.size()
        pad = []
        # F.pad takes (front, back) pairs starting from the LAST dimension,
        # hence the w, h, t order. Odd totals put the extra element at the back.
        for dim, size in ((2, w), (1, h), (0, t)):
            total = self.compute_pad(dim, size)
            front = total // 2
            pad.extend((front, total - front))
        x = F.pad(x, pad)
        return super().forward(x)

class Unit3D(nn.Module):
    """Conv3d followed by optional BatchNorm3d and an optional activation.

    The input is padded dynamically in ``forward`` so that each of the
    time/height/width axes of the output has ``ceil(size / stride)`` elements.

    Args:
        in_channels: Number of input channels.
        output_channels: Number of output channels.
        kernel_shape: 3-element kernel size (t, h, w).
        stride: 3-element stride (t, h, w).
        padding: Stored on ``self.padding`` but NOT applied. The convolution
            is always built with ``padding=0`` and ``forward`` computes its
            own padding, so this value has no effect on the output.
        activation_fn: Callable applied last, or ``None`` to skip it.
        use_batch_norm: Whether to add ``BatchNorm3d`` after the convolution.
        use_bias: Whether the convolution has a bias term.
        name: Descriptive label stored on ``self.name``.
    """

    def __init__(self, in_channels,
                 output_channels,
                 kernel_shape=(1, 1, 1),
                 stride=(1, 1, 1),
                 padding=0,
                 activation_fn=F.relu,
                 use_batch_norm=True,
                 use_bias=False,
                 name='unit_3d'):
        super().__init__()
        self._output_channels = output_channels
        self._kernel_shape = kernel_shape
        self._stride = stride
        self._use_batch_norm = use_batch_norm
        self._activation_fn = activation_fn
        self._use_bias = use_bias
        self.name = name
        self.padding = padding
        # Attribute names `conv3d` and `bn` are part of the state_dict keys;
        # do not rename them or existing checkpoints stop loading.
        self.conv3d = nn.Conv3d(in_channels=in_channels,
                                out_channels=self._output_channels,
                                kernel_size=self._kernel_shape,
                                stride=self._stride,
                                padding=0,  # padding is applied manually in forward()
                                bias=self._use_bias)
        if self._use_batch_norm:
            self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)

    def compute_pad(self, dim, s):
        """Return the total padding for axis ``dim`` given an input length ``s``."""
        remainder = s % self._stride[dim]
        # Exact tiling overhangs by kernel - stride, otherwise kernel - remainder.
        covered = self._stride[dim] if remainder == 0 else remainder
        return max(self._kernel_shape[dim] - covered, 0)

    def forward(self, x):
        """Apply padding, convolution, batch norm and activation to a 5-D tensor."""
        _, _, t, h, w = x.size()
        pad = []
        # F.pad takes (front, back) pairs starting from the LAST dimension,
        # hence the w, h, t order. Odd totals put the extra element at the back.
        for dim, size in ((2, w), (1, h), (0, t)):
            total = self.compute_pad(dim, size)
            front = total // 2
            pad.extend((front, total - front))
        x = self.conv3d(F.pad(x, pad))
        if self._use_batch_norm:
            x = self.bn(x)
        if self._activation_fn is not None:
            x = self._activation_fn(x)
        return x

class InceptionModule(nn.Module):
    """Four-branch Inception block whose branch outputs are concatenated on the channel axis.

    ``out_channels`` is indexed at positions 0-5:
    0 -> branch 0 (1x1x1), 1 -> branch 1 reduce (1x1x1), 2 -> branch 1 (3x3x3),
    3 -> branch 2 reduce (1x1x1), 4 -> branch 2 (3x3x3),
    5 -> branch 3 (1x1x1 after a 3x3x3 max pool).
    """

    def __init__(self, in_channels, out_channels, name):
        super().__init__()

        def pointwise(channels_in, channels_out, suffix):
            # 1x1x1 projection unit
            return Unit3D(in_channels=channels_in, output_channels=channels_out,
                          kernel_shape=[1, 1, 1], padding=0, name=name + suffix)

        def cubic(channels_in, channels_out, suffix):
            # 3x3x3 convolution unit
            return Unit3D(in_channels=channels_in, output_channels=channels_out,
                          kernel_shape=[3, 3, 3], name=name + suffix)

        # Attribute names are state_dict keys; creation order is kept as-is.
        self.b0 = pointwise(in_channels, out_channels[0], '/Branch_0/Conv3d_0a_1x1')
        self.b1a = pointwise(in_channels, out_channels[1], '/Branch_1/Conv3d_0a_1x1')
        self.b1b = cubic(out_channels[1], out_channels[2], '/Branch_1/Conv3d_0b_3x3')
        self.b2a = pointwise(in_channels, out_channels[3], '/Branch_2/Conv3d_0a_1x1')
        self.b2b = cubic(out_channels[3], out_channels[4], '/Branch_2/Conv3d_0b_3x3')
        self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3],
                                        stride=(1, 1, 1), padding=0)
        self.b3b = pointwise(in_channels, out_channels[5], '/Branch_3/Conv3d_0b_1x1')
        self.name = name

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them along dim 1."""
        branch_outputs = [
            self.b0(x),
            self.b1b(self.b1a(x)),
            self.b2b(self.b2a(x)),
            self.b3b(self.b3a(x)),
        ]
        return torch.cat(branch_outputs, dim=1)

class InceptionI3d(nn.Module):
    """Inception-v1 I3D video network built from Unit3D, InceptionModule and
    MaxPool3dSamePadding stages, optionally followed by a pooled logits head.

    Args:
        num_classes: Number of output channels of the logits head.
        spatial_squeeze: If true, ``forward`` squeezes dims 3 and 4 of the
            logits (they are removed only when their size is 1).
        final_endpoint: Last stage to build; must be in ``VALID_ENDPOINTS``.
            With ``'Logits'`` or ``'Predictions'`` the full backbone and the
            head are built (no softmax is applied for ``'Predictions'``). With
            any earlier name the network is truncated after that stage and no
            head is created.
        name: Prefix used for the ``name`` labels of the sub-units.
        in_channels: Channel count of the input video tensor.
        dropout_keep_prob: Probability of KEEPING an activation before the
            logits layer.

    Raises:
        ValueError: If ``final_endpoint`` is not a valid endpoint name.
    """

    VALID_ENDPOINTS = (
        'Conv3d_1a_7x7',
        'MaxPool3d_2a_3x3',
        'Conv3d_2b_1x1',
        'Conv3d_2c_3x3',
        'MaxPool3d_3a_3x3',
        'Mixed_3b',
        'Mixed_3c',
        'MaxPool3d_4a_3x3',
        'Mixed_4b',
        'Mixed_4c',
        'Mixed_4d',
        'Mixed_4e',
        'Mixed_4f',
        'MaxPool3d_5a_2x2',
        'Mixed_5b',
        'Mixed_5c',
        'Logits',
        'Predictions',
    )

    def __init__(self, num_classes=400, spatial_squeeze=True,
                 final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5):
        if final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError('Unknown final endpoint %s' % final_endpoint)

        super().__init__()
        self._num_classes = num_classes
        self._spatial_squeeze = spatial_squeeze
        self._final_endpoint = final_endpoint
        # Head modules stay None for a truncated network; forward() and
        # extract_features() then return the backbone output directly.
        self.avg_pool = None
        self.dropout = None
        self.logits = None

        # Backbone stages in execution order. Each entry is built lazily so a
        # truncated network only constructs (and randomly initialises) the
        # stages it actually keeps.
        stages = (
            ('Conv3d_1a_7x7', lambda ep: Unit3D(
                in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7],
                stride=(2, 2, 2), padding=(3, 3, 3), name=name + ep)),
            ('MaxPool3d_2a_3x3', lambda ep: MaxPool3dSamePadding(
                kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0)),
            ('Conv3d_2b_1x1', lambda ep: Unit3D(
                in_channels=64, output_channels=64, kernel_shape=[1, 1, 1],
                padding=0, name=name + ep)),
            ('Conv3d_2c_3x3', lambda ep: Unit3D(
                in_channels=64, output_channels=192, kernel_shape=[3, 3, 3],
                padding=1, name=name + ep)),
            ('MaxPool3d_3a_3x3', lambda ep: MaxPool3dSamePadding(
                kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0)),
            ('Mixed_3b', lambda ep: InceptionModule(
                192, [64, 96, 128, 16, 32, 32], name + ep)),
            ('Mixed_3c', lambda ep: InceptionModule(
                256, [128, 128, 192, 32, 96, 64], name + ep)),
            ('MaxPool3d_4a_3x3', lambda ep: MaxPool3dSamePadding(
                kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0)),
            ('Mixed_4b', lambda ep: InceptionModule(
                128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + ep)),
            ('Mixed_4c', lambda ep: InceptionModule(
                192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + ep)),
            ('Mixed_4d', lambda ep: InceptionModule(
                160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + ep)),
            ('Mixed_4e', lambda ep: InceptionModule(
                128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + ep)),
            ('Mixed_4f', lambda ep: InceptionModule(
                112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + ep)),
            ('MaxPool3d_5a_2x2', lambda ep: MaxPool3dSamePadding(
                kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0)),
            ('Mixed_5b', lambda ep: InceptionModule(
                256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], name + ep)),
            ('Mixed_5c', lambda ep: InceptionModule(
                256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + ep)),
        )

        self.end_points = {}
        for end_point, make_stage in stages:
            self.end_points[end_point] = make_stage(end_point)
            if end_point == self._final_endpoint:
                break
        else:
            # The loop finished without hitting final_endpoint, so the caller
            # asked for 'Logits' or 'Predictions': build the classification head.
            # NOTE(review): the [2, 7, 7] window presumably assumes 224x224
            # input frames - confirm against callers.
            self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7],
                                         stride=(1, 1, 1))
            # nn.Dropout takes the probability of ZEROING an element, so the
            # keep probability has to be converted. With the default 0.5 the
            # two coincide.
            self.dropout = nn.Dropout(1.0 - dropout_keep_prob)
            self.logits = self._make_logits()

        # Always register the stages, including for a truncated network;
        # otherwise their parameters are invisible to state_dict(), .cuda()
        # and optimizers, and forward() cannot find them in self._modules.
        self.build()

    def _make_logits(self):
        """Build the 1x1x1 classification layer for ``self._num_classes`` outputs."""
        return Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes,
                      kernel_shape=[1, 1, 1],
                      padding=0,
                      activation_fn=None,
                      use_batch_norm=False,
                      use_bias=True,
                      name='logits')

    def replace_logits(self, num_classes):
        """Replace the classification layer with a freshly initialised one."""
        self._num_classes = num_classes
        self.logits = self._make_logits()

    def build(self):
        """Register every entry of ``self.end_points`` as a named sub-module."""
        for end_point, module in self.end_points.items():
            self.add_module(end_point, module)

    def forward(self, x, pretrained=False, n_tune_layers=-1):
        """Run the network on ``x``.

        Args:
            x: Input tensor; the stages expect a 5-D (batch, channel, t, h, w)
                layout.
            pretrained: If true, the leading endpoints run under
                ``torch.no_grad()`` and only the trailing ``n_tune_layers``
                entries of ``VALID_ENDPOINTS`` run with gradients enabled.
            n_tune_layers: Number of trailing ``VALID_ENDPOINTS`` entries to
                fine-tune. Must be >= 0 when ``pretrained`` is true; 0 freezes
                every backbone stage.

        Returns:
            The logits (with dims 3 and 4 squeezed when ``spatial_squeeze`` is
            set), or the output of the last stage if the network was built
            without a head.
        """
        if pretrained:
            assert n_tune_layers >= 0
            # Slice by an explicit index: `[:-0]` would be empty and `[-0:]`
            # the whole tuple, i.e. n_tune_layers=0 would freeze nothing.
            split = max(len(self.VALID_ENDPOINTS) - n_tune_layers, 0)
            freeze_endpoints = self.VALID_ENDPOINTS[:split]
            tune_endpoints = self.VALID_ENDPOINTS[split:]
        else:
            freeze_endpoints = ()
            tune_endpoints = self.VALID_ENDPOINTS

        with torch.no_grad():
            for end_point in freeze_endpoints:
                if end_point in self.end_points:
                    x = self._modules[end_point](x)
        for end_point in tune_endpoints:
            if end_point in self.end_points:
                x = self._modules[end_point](x)

        if self.avg_pool is None or self.dropout is None or self.logits is None:
            return x  # truncated network: no classification head

        x = self.logits(self.dropout(self.avg_pool(x)))
        if self._spatial_squeeze:
            x = x.squeeze(3).squeeze(3)
        return x

    def extract_features(self, x):
        """Run all built stages and return the average-pooled feature map.

        For a network built without a head (no ``avg_pool``) the output of the
        last stage is returned unpooled.
        """
        for end_point in self.VALID_ENDPOINTS:
            if end_point in self.end_points:
                x = self._modules[end_point](x)
        if self.avg_pool is None:
            return x
        return self.avg_pool(x)

# Inference

In [None]:
import math
import os
import argparse
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import cv2
from itertools import chain

# Enumerate CUDA devices in PCI bus order and expose only device 0 to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# NOTE(review): the parser is only declared; no cell visible in this notebook
# calls parser.parse_args(), so these three options are never read.
parser = argparse.ArgumentParser()
parser.add_argument('-mode', type=str, help='rgb or flow')
parser.add_argument('-save_model', type=str)
parser.add_argument('-root', type=str)

def create_WLASL_dictionary(path='/content/WLASL-Recognition-and-Translation/WLASL/I3D/preprocess/wlasl_class_list.txt'):
    """Load the class-index -> gloss table and publish it as the global ``wlasl_dict``.

    Each non-blank line must be ``<integer index> <gloss words...>`` separated
    by whitespace. All words after the index are kept and joined with single
    spaces, so glosses of any length are preserved.

    Args:
        path: Location of the class list file.

    Returns:
        The mapping that was stored in ``wlasl_dict``.

    Raises:
        ValueError: If a line has an index but no gloss, or a non-integer index.
    """
    global wlasl_dict
    mapping = {}
    with open(path, encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            tokens = line.split()
            if not tokens:
                continue  # tolerate blank lines, e.g. a trailing newline
            if len(tokens) < 2:
                raise ValueError(
                    'Line %d of %s has no gloss: %r' % (line_number, path, line))
            mapping[int(tokens[0])] = " ".join(tokens[1:])
    # Publish only after the whole file parsed, so a failure never leaves a
    # half-filled table behind.
    wlasl_dict = mapping
    return mapping

def run_on_tensor(ip_tensor, threshold=0.25):
    """Classify one clip with the global ``i3d`` model and return its gloss.

    Relies on two module-level globals: ``i3d`` (the loaded model) and
    ``wlasl_dict`` (class index -> gloss).

    Args:
        ip_tensor: Clip tensor without a batch axis; a batch axis is added
            here and ``shape[2]`` of the result is used as the clip length.
            # assumes (channel, t, h, w) layout - TODO confirm with callers
        threshold: Minimum softmax probability of the top class at the first
            time step for a gloss to be reported.

    Returns:
        The gloss string of the top class, or ``" "`` when its probability is
        below ``threshold``.
    """
    ip_tensor = ip_tensor[None, :]
    t = ip_tensor.shape[2]
    # Tensor.cuda()/to() return a new tensor; the result must be kept.
    # Following the model's own device also lets this run without a GPU.
    ip_tensor = ip_tensor.to(next(i3d.parameters()).device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        per_frame_logits = i3d(ip_tensor)
        predictions = F.interpolate(per_frame_logits, t, mode='linear')

    # Only the first interpolated time step of the single batch item is used
    # for both the label and its confidence.
    first_step = predictions[0, :, 0].cpu()
    top_class = int(np.argsort(first_step.numpy())[-1])
    confidence = float(F.softmax(first_step, dim=0).max())
    if confidence >= threshold:
        return wlasl_dict[top_class]
    return " "


def load_rgb_frames_from_video(path, offset, batch, overlap=32):
    """Slide a fixed-size window over a video and collect the recognised glosses.

    Every frame is resized to 224x224 and rescaled from [0, 255] to [-1, 1].
    The most recent ``batch`` frames are kept; whenever the running frame
    count reaches ``batch`` and then every ``overlap`` frames after that, the
    window is classified with ``run_on_tensor``.

    Args:
        path: Video file path passed to ``cv2.VideoCapture``.
        offset: Initial value of the running frame counter.
        batch: Window length in frames (truncated to int, must be >= 1).
        overlap: Number of frames between two consecutive classifications
            (truncated to int, must be >= 1). Despite its name this is the
            hop between windows, not the number of shared frames.

    Returns:
        List of gloss strings in order of appearance. Blank predictions and
        immediate repeats of the previous gloss are dropped.

    Raises:
        ValueError: If ``batch`` or ``overlap`` is smaller than 1.
    """
    from collections import deque

    window = int(batch)
    hop = int(overlap)
    if window < 1 or hop < 1:
        raise ValueError('batch and overlap must both be >= 1')

    # deque(maxlen=...) discards the oldest frame in O(1) once the window is full.
    frames = deque(maxlen=window)
    text_list = []
    previous_text = ''

    vidcap = cv2.VideoCapture(path)
    try:
        while True:
            ret, raw_frame = vidcap.read()
            if not ret:
                break
            offset += 1

            # frame.shape is (rows, cols, channels); the target size is fixed,
            # so pass it directly instead of deriving per-axis scale factors.
            resized = cv2.resize(raw_frame, dsize=(224, 224))
            frames.append(((resized / 255.) * 2 - 1).astype(np.float32))

            if offset >= window and (offset - window) % hop == 0:
                # (t, h, w, c) -> (c, t, h, w)
                clip = np.stack(list(frames)).transpose([3, 0, 1, 2])
                text = run_on_tensor(torch.from_numpy(clip))
                if text != " " and text != previous_text:
                    previous_text = text
                    text_list.append(text)
            # No cv2.waitKey()/destroyAllWindows() here: this function never
            # opens a window, so a key press could never be delivered.
    finally:
        vidcap.release()
    return text_list

def load_model(weights, num_classes):
    """Build the I3D classifier, load a checkpoint and publish it as global ``i3d``.

    The model is created with a 400-class head, the head is replaced by one
    with ``num_classes`` outputs, and the checkpoint is then loaded into it.
    The result is wrapped in ``nn.DataParallel`` and switched to eval mode.

    Args:
        weights: Path to the checkpoint file (a saved state_dict).
        num_classes: Number of classes of the checkpoint's logits layer.

    Returns:
        The loaded model, which is also stored in ``i3d``.
    """
    global i3d
    # Use the GPU when present, otherwise fall back to CPU instead of failing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = InceptionI3d(400, in_channels=3)
    model.replace_logits(num_classes)
    # SECURITY: torch.load unpickles the file, which can execute arbitrary
    # code. Only load checkpoints from a trusted source.
    # map_location makes GPU-saved checkpoints loadable on a CPU-only host.
    model.load_state_dict(torch.load(weights, map_location=device))
    model.to(device)
    model = nn.DataParallel(model)
    model.eval()

    i3d = model
    return i3d

In [None]:
# Pick the checkpoint to use; num_classes must match the checkpoint's head.
num_classes = 1000
weights = '/content/WLASL-Recognition-and-Translation/WLASL/I3D/archived/asl1000/FINAL_nslt_1000_iters=5104_top1=47.33_top5=76.44_top10=84.33.pt'
# Alternative checkpoints (set num_classes to 100 / 300 respectively):
# weights = '/content/WLASL-Recognition-and-Translation/WLASL/I3D/archived/asl100/FINAL_nslt_100_iters=896_top1=65.89_top5=84.11_top10=89.92.pt'
# weights = '/content/WLASL-Recognition-and-Translation/WLASL/I3D/archived/asl300/FINAL_nslt_300_iters=2997_top1=56.14_top5=79.94_top10=86.98.pt'
# Both calls populate module-level globals (wlasl_dict and i3d) used by run_on_tensor.
create_WLASL_dictionary()
load_model(weights, num_classes)

In [None]:
# Run recognition on one video: a 64-frame window, counter starting at 0,
# using the function's default hop between classifications.
video_path = '/content/GoToSeeDoctor.mp4'
offset = 0
batch = 64
text_list = load_rgb_frames_from_video(video_path, offset, batch)
# An empty list means no window produced a gloss above the confidence threshold
# (or no frame could be read from video_path).
print(text_list)

[]


In [None]:
# from keytotext import pipeline

# nlp = pipeline("k2t-new")
# params = {"do_sample":True, "num_beams": 5, "no_repeat_ngram_size":2, "early_stopping":True}

In [None]:
# num_word = 5
# overlap = 2

# for index in range(0, len(text_list) - num_word + 1, num_word - overlap):
#     text_chunk = text_list[index:index + num_word]
#     result = nlp(text_chunk, **params)
#     print(result)

In [None]:
# Use the Converse API to send a text message to Claude 3.5 Sonnet.

import boto3
from botocore.exceptions import ClientError

# Create a Bedrock Runtime client in the AWS Region you want to use.
# Credentials are resolved by boto3's default provider chain (for example the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables, the shared
# credentials file, or an attached IAM role).
# SECURITY: an access key pair used to be hard-coded on this line. It has been
# exposed in source and must be revoked and rotated.
client = boto3.client("bedrock-runtime", region_name="us-east-1")

# Bedrock model ID (Anthropic Claude 3.5 Sonnet).
model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"

# Start a conversation with the user message.
user_message = f"""
Your task is to create sentences based on sign language gloss, considering user emotion and past conversation as context, create the sentences that captures all relevant context and accurately represent the meaning of the sign language glosses:
Glosses:  {text_list}
context of the previous conversation: ' '
Detected user emotion: Neutral
If there's any sign that doesn't make sense, you can ignore it.
Do not add any information that is not stated in the signs
"""
conversation = [
    {
        "role": "user",
        "content": [{"text": user_message}],
    }
]

try:
    # Send the message to the model, using a basic inference configuration.
    response = client.converse(
        modelId=model_id,  # single source of truth, also used in the error message
        messages=conversation,
        inferenceConfig={"maxTokens":2048,"stopSequences":["\n\nHuman:"],"temperature":0.5,"topP":1},
        additionalModelRequestFields={"top_k":250}
    )

    # Extract and print the response text.
    response_text = response["output"]["message"]["content"][0]["text"]
    print(response_text)

except Exception as e:
    # ClientError is a subclass of Exception, so one clause covers both.
    # This is the top-level boundary of the cell: report and stop.
    print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
    raise SystemExit(1)


I apologize, but there are no sign language glosses provided in your input. Without any signs to interpret, I cannot generate a sentence that accurately represents their meaning. Additionally, there is no context from a previous conversation given. If you'd like me to create a sentence based on sign language glosses, please provide the glosses and any relevant context, and I'll be happy to assist you.


In [None]:
def call_llm(text_list, emotion, context):
    """Ask Claude 3.5 Sonnet on Amazon Bedrock to turn glosses into sentences.

    Args:
        text_list: Recognised sign-language glosses; interpolated into the
            prompt with ``str()`` formatting.
        emotion: Detected user emotion, interpolated into the prompt.
        context: Text of the previous conversation, interpolated into the prompt.

    Returns:
        The text of the first content block of the model's reply. The reply is
        also printed.

    Raises:
        RuntimeError: If the Bedrock call fails or the response cannot be
            parsed; the original exception is chained as the cause.
    """
    # Credentials are resolved by boto3's default provider chain (environment
    # variables, shared credentials file, or an IAM role).
    # SECURITY: an access key pair used to be hard-coded here. It has been
    # exposed in source and must be revoked and rotated.
    client = boto3.client("bedrock-runtime", region_name="us-east-1")

    # Bedrock model ID (Anthropic Claude 3.5 Sonnet).
    model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"

    # The prompt text is kept byte-for-byte, including the two-space indent of
    # every line. NOTE(review): the lines starting with '#' are inside the
    # string, so they ARE sent to the model - confirm that is intended.
    user_message = f"""
  Your task is to create sentences based on sign language gloss, considering user emotion and past conversation as context, create the sentences that captures all relevant context and accurately represent the meaning of the sign language glosses:
  Glosses:  {text_list}
  context of the previous conversation: '{context}'
  Detected user emotion: {emotion}
  #If there's any sign that doesn't make sense or the context is unclear, NEVER Include them in the sentence.
  Do not add any information that is not stated in the signs.
  Give priority to the context rather than the sign itself, since they're detected by a model and are prone to error.
  for example, if the context is about past action, the sentence should reflect that.
  #Do not explain what you omit.
  """
    conversation = [
        {
            "role": "user",
            "content": [{"text": user_message}],
        }
    ]

    try:
        # Send the message to the model, using a basic inference configuration.
        response = client.converse(
            modelId=model_id,
            messages=conversation,
            inferenceConfig={"maxTokens":2048,"stopSequences":["\n\nHuman:"],"temperature":0,"topP":1},
            additionalModelRequestFields={"top_k":250}
        )
        response_text = response["output"]["message"]["content"][0]["text"]
    except Exception as e:
        print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
        # Raise instead of exit(1): this function runs inside a web callback,
        # where terminating the interpreter would take the whole app down.
        raise RuntimeError(f"Can't invoke '{model_id}'") from e

    print(response_text)
    return response_text

# Deployment

In [1]:
import math
import os
import argparse
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import cv2
from itertools import chain
import gradio as gr
from fer import FER
import tempfile
from collections import Counter

# from keytotext import pipeline

# Enumerate CUDA devices in PCI bus order and expose only device 0 to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# NOTE(review): the parser is only declared; no cell visible in this notebook
# calls parser.parse_args(), so these three options are never read.
parser = argparse.ArgumentParser()
parser.add_argument('-mode', type=str, help='rgb or flow')
parser.add_argument('-save_model', type=str)
parser.add_argument('-root', type=str)

def create_WLASL_dictionary(path='/content/WLASL-Recognition-and-Translation/WLASL/I3D/preprocess/wlasl_class_list.txt'):
    """Load the class-index -> gloss table and publish it as the global ``wlasl_dict``.

    Each non-blank line must be ``<integer index> <gloss words...>`` separated
    by whitespace. All words after the index are kept and joined with single
    spaces, so glosses of any length are preserved.

    Args:
        path: Location of the class list file.

    Returns:
        The mapping that was stored in ``wlasl_dict``.

    Raises:
        ValueError: If a line has an index but no gloss, or a non-integer index.
    """
    global wlasl_dict
    mapping = {}
    with open(path, encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            tokens = line.split()
            if not tokens:
                continue  # tolerate blank lines, e.g. a trailing newline
            if len(tokens) < 2:
                raise ValueError(
                    'Line %d of %s has no gloss: %r' % (line_number, path, line))
            mapping[int(tokens[0])] = " ".join(tokens[1:])
    # Publish only after the whole file parsed, so a failure never leaves a
    # half-filled table behind.
    wlasl_dict = mapping
    return mapping

def run_on_tensor(ip_tensor, threshold=0.25):
    """Classify one clip with the global ``i3d`` model and return its gloss.

    Relies on two module-level globals: ``i3d`` (the loaded model) and
    ``wlasl_dict`` (class index -> gloss).

    Args:
        ip_tensor: Clip tensor without a batch axis; a batch axis is added
            here and ``shape[2]`` of the result is used as the clip length.
            # assumes (channel, t, h, w) layout - TODO confirm with callers
        threshold: Minimum softmax probability of the top class at the first
            time step for a gloss to be reported.

    Returns:
        The gloss string of the top class, or ``" "`` when its probability is
        below ``threshold``.
    """
    ip_tensor = ip_tensor[None, :]
    t = ip_tensor.shape[2]
    # Tensor.cuda()/to() return a new tensor; the result must be kept.
    # Following the model's own device also lets this run without a GPU.
    ip_tensor = ip_tensor.to(next(i3d.parameters()).device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        per_frame_logits = i3d(ip_tensor)
        predictions = F.interpolate(per_frame_logits, t, mode='linear')

    # Only the first interpolated time step of the single batch item is used
    # for both the label and its confidence.
    first_step = predictions[0, :, 0].cpu()
    top_class = int(np.argsort(first_step.numpy())[-1])
    confidence = float(F.softmax(first_step, dim=0).max())
    if confidence >= threshold:
        return wlasl_dict[top_class]
    return " "

def load_rgb_frames_from_video(path, offset, batch, overlap=12):
    """Slide a fixed-size window over a video and collect the recognised glosses.

    Every frame is resized to 224x224 and rescaled from [0, 255] to [-1, 1].
    The most recent ``batch`` frames are kept; whenever the running frame
    count reaches ``batch`` and then every ``overlap`` frames after that, the
    window is classified with ``run_on_tensor``.

    Args:
        path: Video file path passed to ``cv2.VideoCapture``.
        offset: Initial value of the running frame counter.
        batch: Window length in frames (truncated to int, must be >= 1).
        overlap: Number of frames between two consecutive classifications
            (truncated to int, must be >= 1). Despite its name this is the
            hop between windows, not the number of shared frames.

    Returns:
        List of gloss strings in order of appearance. Blank predictions and
        immediate repeats of the previous gloss are dropped.

    Raises:
        ValueError: If ``batch`` or ``overlap`` is smaller than 1.
    """
    from collections import deque

    window = int(batch)
    hop = int(overlap)
    if window < 1 or hop < 1:
        raise ValueError('batch and overlap must both be >= 1')

    # deque(maxlen=...) discards the oldest frame in O(1) once the window is full.
    frames = deque(maxlen=window)
    text_list = []
    previous_text = ''

    vidcap = cv2.VideoCapture(path)
    try:
        while True:
            ret, raw_frame = vidcap.read()
            if not ret:
                break
            offset += 1

            # frame.shape is (rows, cols, channels); the target size is fixed,
            # so pass it directly instead of deriving per-axis scale factors.
            resized = cv2.resize(raw_frame, dsize=(224, 224))
            frames.append(((resized / 255.) * 2 - 1).astype(np.float32))

            if offset >= window and (offset - window) % hop == 0:
                # (t, h, w, c) -> (c, t, h, w)
                clip = np.stack(list(frames)).transpose([3, 0, 1, 2])
                text = run_on_tensor(torch.from_numpy(clip))
                if text != " " and text != previous_text:
                    previous_text = text
                    text_list.append(text)
            # No cv2.waitKey()/destroyAllWindows() here: this function never
            # opens a window, so a key press could never be delivered.
    finally:
        vidcap.release()
    return text_list

def load_model(weights, num_classes):
    """Build the I3D classifier, load a checkpoint and publish it as global ``i3d``.

    The model is created with a 400-class head, the head is replaced by one
    with ``num_classes`` outputs, and the checkpoint is then loaded into it.
    The result is wrapped in ``nn.DataParallel`` and switched to eval mode.

    Args:
        weights: Path to the checkpoint file (a saved state_dict).
        num_classes: Number of classes of the checkpoint's logits layer.

    Returns:
        The loaded model, which is also stored in ``i3d``.
    """
    global i3d
    # Use the GPU when present, otherwise fall back to CPU instead of failing.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = InceptionI3d(400, in_channels=3)
    model.replace_logits(num_classes)
    # SECURITY: torch.load unpickles the file, which can execute arbitrary
    # code. Only load checkpoints from a trusted source.
    # map_location makes GPU-saved checkpoints loadable on a CPU-only host.
    model.load_state_dict(torch.load(weights, map_location=device))
    model.to(device)
    model = nn.DataParallel(model)
    model.eval()

    i3d = model
    return i3d

def process_video(video_path, context, overlap, batch):
    """Recognise the signs in a video and have the LLM phrase them as sentences.

    Args:
        video_path: Path of the uploaded video file.
        context: Free-text context of the previous conversation.
        overlap: Number of frames between consecutive classifications.
        batch: Window length in frames.

    Returns:
        The text returned by ``call_llm``.

    Raises:
        ValueError: If no video path is given.
    """
    if not video_path:
        raise ValueError("No video was provided.")

    offset = 0
    # UI sliders may deliver floats; frame counts must be integers.
    # (The call previously ended with a stray comma, which wrapped the result
    # in a 1-tuple and put "([...],)" into the prompt.)
    text_list = load_rgb_frames_from_video(video_path, offset, int(batch), int(overlap))

    # NOTE(review): the emotion is fixed; no emotion detection runs here even
    # though FER is imported by this cell.
    most_common_emotion = 'Neutral'
    return call_llm(text_list, most_common_emotion, context)

def main(num_classes=1000):
    """Load the recogniser and serve the Gradio demo.

    Args:
        num_classes: Size of the WLASL subset to use. Selects the matching
            archived checkpoint; supported values are 100, 300 and 1000.

    Raises:
        ValueError: If there is no known checkpoint for ``num_classes``.
    """
    archive = '/content/WLASL-Recognition-and-Translation/WLASL/I3D/archived/'
    checkpoints = {
        100: archive + 'asl100/FINAL_nslt_100_iters=896_top1=65.89_top5=84.11_top10=89.92.pt',
        300: archive + 'asl300/FINAL_nslt_300_iters=2997_top1=56.14_top5=79.94_top10=86.98.pt',
        1000: archive + 'asl1000/FINAL_nslt_1000_iters=5104_top1=47.33_top5=76.44_top10=84.33.pt',
    }
    try:
        weights = checkpoints[num_classes]
    except KeyError:
        # A checkpoint whose head size differs from num_classes cannot be
        # loaded, so fail early with a clear message.
        raise ValueError(
            'No checkpoint for num_classes=%r; choose one of %s'
            % (num_classes, sorted(checkpoints))) from None

    create_WLASL_dictionary()
    load_model(weights, num_classes)

    iface = gr.Interface(
        fn=process_video,
        inputs=[
            gr.Video(autoplay=True, loop=True),
            gr.Textbox(),
            # step=1: both sliders are frame counts and must be whole numbers.
            gr.Slider(1, 64, value=32, step=1, label="Overlap", info=""),
            gr.Slider(1, 128, value=64, step=1, label="batch"),
        ],
        outputs=gr.Textbox(),
        title="Sign Language Detection and Translation",
        description="Upload a video to detect sign language and generate text."
    )

    iface.launch(debug=True)


if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'gradio'