In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%cd /content/drive/MyDrive/assessmentanalysis/

/content/drive/MyDrive/assessmentanalysis


In [3]:
#Install the required packages for this project
!pip install --upgrade pip
!pip install --upgrade transformers datasets[audio] accelerate
!pip install ffmpeg-python
!pip install firebase-admin

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.3.1
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting datasets[audio]
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets[audio])
  Downloading multiprocess-0.70.16-py310-none-an

## Data Extration

In [4]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline , AutoProcessor, AutoModelForCausalLM
import cv2
import os
import ffmpeg
from PIL import Image

In [5]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import storage
from firebase_admin import firestore

cred = credentials.Certificate('videoanalysis-d5eb4-firebase-adminsdk-u5vqo-754ba234c2.json')
firebase_admin.initialize_app(cred, {
    'storageBucket': 'videoanalysis-d5eb4.firebasestorage.app'
})

bucket = storage.bucket()
db = firestore.client()

In [6]:
file_name="AngularTutorial10.mp4"

In [7]:
if os.path.exists(file_name):
  print(f"The file '{file_name}' exists.")
else:
  video_path_in_firebase = "videos/"+file_name  # Path
  local_video_path = file_name  # Local path to download
  blob = bucket.blob(video_path_in_firebase)  # Get blob
  blob.download_to_filename(local_video_path)
  print(f"Video downloaded to: {local_video_path}")

The file 'AngularTutorial10.mp4' exists.


In [8]:
video_name = os.path.basename(file_name)
video_name_without_extension = os.path.splitext(video_name)[0]

In [9]:
!pip install openai-whisper

import whisper
model = whisper.load_model('large')

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m61.8 MB/s[0m eta [36m0:

100%|█████████████████████████████████████| 2.88G/2.88G [00:43<00:00, 70.8MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [10]:
def convert_mp4_to_wav(input_file, output_file):
  """Converts an MP4 file to a WAV file.

  Args:
    input_file: Path to the input MP4 file.
    output_file: Path to the output WAV file.
  """
  try:
    (
        ffmpeg
        .input(input_file)
        .output(output_file, acodec='pcm_s16le', ac=1, ar='16k')
        .run(overwrite_output=True)
    )
    print(f"Conversion successful: {input_file} -> {output_file}")
  except ffmpeg.Error as e:
    print(f"Error during conversion: {e}")

# Check and insert in the 'text' collection
text_doc_ref = db.collection('text').document(video_name_without_extension)
text_doc = text_doc_ref.get()
audiofile=video_name_without_extension+".wav"

if not text_doc.exists:
  # Using the convert_mp4_to_wav function
  audio_file_exist=os.path.exists(audiofile)
  if not audio_file_exist:
    input_mp4_file = file_name
    convert_mp4_to_wav(input_mp4_file, audiofile)

  data=model.transcribe(audio=audiofile, language='en', verbose=True)
  audio_chunks=data["segments"]

  text_data = {
      "filename": video_name_without_extension,
      "content": audio_chunks
  }

  text_doc_ref.set(text_data)
  print(f"Document added to 'text' collection for filename: {video_name_without_extension}")
else:
  print(f"Document already exists in 'text' collection for filename: {video_name_without_extension}")

Conversion successful: AngularTutorial10.mp4 -> AngularTutorial10.wav
[00:00.000 --> 00:06.920]  Alright guys, in this video we are going to learn about template reference variables.
[00:06.920 --> 00:13.180]  Now when there is a user interaction, we might want some data to flow from the view to the
[00:13.180 --> 00:16.020]  class to perform an operation.
[00:16.020 --> 00:22.880]  For example, we may require the value in an input field to perform some validations.
[00:22.880 --> 00:29.800]  So to easily access DOM elements and their properties, Angular provides us with template
[00:29.800 --> 00:31.820]  reference variables.
[00:31.820 --> 00:33.940]  Let's take a look at an example.
[00:33.940 --> 00:36.080]  Let's go back to Visual Studio Code.
[00:36.080 --> 00:38.720]  Let's say we have an input element.
[00:38.720 --> 00:44.600]  Input type is equal to text and a button next to it that says log.
[00:44.600 --> 00:50.640]  Alright what we want to achieve here is when the user cli

In [11]:
if not os.path.exists(video_name_without_extension):

  def extract_frames_from_video(video_path, output_folder, interval_seconds=15):
    """Extracts frames from a video at a specified interval.

    Args:
      video_path: Path to the input video file.
      output_folder: Path to the folder where extracted frames will be saved.
      interval_seconds: The time interval (in seconds) between extracted frames.
    """

    if not os.path.exists(output_folder):
      os.makedirs(output_folder)

    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(fps * interval_seconds)

    while True:
      ret, frame = cap.read()
      if not ret:
        break

      if frame_count % frame_interval == 0:
        output_path = os.path.join(output_folder, f"frame_{frame_count}.png")
        cv2.imwrite(output_path, frame)
        print(f"Frame saved: {output_path}")

      frame_count += 1

    cap.release()

  # Using extract_frames_from_video function on the video
  extract_frames_from_video(file_name, video_name_without_extension)

Frame saved: AngularTutorial10/frame_0.png
Frame saved: AngularTutorial10/frame_360.png
Frame saved: AngularTutorial10/frame_720.png
Frame saved: AngularTutorial10/frame_1080.png
Frame saved: AngularTutorial10/frame_1440.png
Frame saved: AngularTutorial10/frame_1800.png
Frame saved: AngularTutorial10/frame_2160.png
Frame saved: AngularTutorial10/frame_2520.png
Frame saved: AngularTutorial10/frame_2880.png
Frame saved: AngularTutorial10/frame_3240.png
Frame saved: AngularTutorial10/frame_3600.png
Frame saved: AngularTutorial10/frame_3960.png
Frame saved: AngularTutorial10/frame_4320.png


In [12]:
# Check and insert in the 'caption' collection
caption_doc_ref = db.collection('caption').document(video_name_without_extension)
caption_doc = caption_doc_ref.get()

In [13]:
model_id = 'microsoft/Florence-2-large'
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).eval().cuda()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.44k [00:00<?, ?B/s]

configuration_florence2.py:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-large:
- configuration_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_florence2.py:   0%|          | 0.00/127k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-large:
- modeling_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

processing_florence2.py:   0%|          | 0.00/48.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-large:
- processing_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_config.json:   0%|          | 0.00/34.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models, datasets
import os
from PIL import Image
from sklearn.model_selection import train_test_split
import shutil
import numpy as np
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_transforms = {
   'val': transforms.Compose([
       transforms.Resize(256),
       transforms.CenterCrop(224),
       transforms.ToTensor(),
       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
   ])
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model structure
loaded_model = models.resnet18(pretrained=False)
num_ftrs = loaded_model.fc.in_features
loaded_model.fc = nn.Linear(num_ftrs, 2)

# Load the trained weights
loaded_model.load_state_dict(torch.load('code_detect.pth'))
loaded_model = loaded_model.to(device)
loaded_model.eval()  # Set to evaluation mode

def predict_image(image_path, model):
    """
    Predicts the class of an image using the provided model.

    Args:
    image_path (str): Path to the image file.
    model (torch.nn.Module): The trained model.

    Returns:
    str: The predicted class name.
    """
    image_temp = Image.open(image_path).convert('RGB')
    image_temp = data_transforms['val'](image_temp).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(image_temp)
        _, preds = torch.max(outputs, 1)

    class_names = ['Code', 'Other']  # Define class names for classification
    return class_names[preds[0]]

  loaded_model.load_state_dict(torch.load('code_detect.pth'))


In [15]:
def transcribe_video_OCR(image):
    inputs = processor(text='<OCR>', images=image, return_tensors="pt")
    generated_ids = model.generate(
      input_ids=inputs["input_ids"].cuda(),
      pixel_values=inputs["pixel_values"].cuda(),
      max_new_tokens=256,
      early_stopping=False,
      do_sample=False,
      num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task='<OCR>',
        image_size=(image.width, image.height)
    )

    return parsed_answer['<OCR>']

In [16]:
def transcribe_video_caption(image):
    inputs = processor(text='<DETAILED_CAPTION>', images=image, return_tensors="pt")
    generated_ids = model.generate(
      input_ids=inputs["input_ids"].cuda(),
      pixel_values=inputs["pixel_values"].cuda(),
      max_new_tokens=256,
      early_stopping=False,
      do_sample=False,
      num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task='<DETAILED_CAPTION>',
        image_size=(image.width, image.height)
    )

    return parsed_answer['<DETAILED_CAPTION>']

In [17]:
all_caps=[]
for filename in os.listdir(video_name_without_extension):
  image_path = os.path.join(video_name_without_extension, filename)
  img_type=predict_image(image_path, loaded_model)
  img=Image.open(image_path)
  if img_type=='Code':
    print("Code Detected")
    text=transcribe_video_OCR(img)
    caption_data = {
          "name":filename,
          "text": text,
          "type": 'Code'
    }
    all_caps.append(caption_data)
  else:
    print("Other image Detected")
    text=transcribe_video_caption(img)
    caption_data = {
          "name":filename,
          "text": text,
          "type": 'Other'
    }
    all_caps.append(caption_data)

caption_doc_ref.set({"filename": video_name_without_extension,"data":all_caps})

Other image Detected
Other image Detected
Other image Detected
Code Detected
Code Detected
Code Detected
Code Detected
Code Detected
Other image Detected
Other image Detected
Code Detected
Other image Detected
Code Detected


update_time {
  seconds: 1733310060
  nanos: 182285000
}