# Video Feature Extraction Tool Task

## Packages

In [7]:
!apt-get update
!apt-get install -y tesseract-ocr
!pip install pytesseract --quiet
!pip install pytesseract ultralytics --quiet

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to security.ub                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.                                                                               Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://r2u.st

In [8]:
import cv2
import json
import numpy as np
import pytesseract
from ultralytics import YOLO
from google.colab import files
import urllib.request
from pathlib import Path
import matplotlib.pyplot as plt
import os


## Video Upload

In [41]:
# Open the Colab upload widget; the result is keyed by uploaded filename.
uploaded = files.upload()
if not uploaded:
    # A cancelled/empty upload would otherwise surface as a bare IndexError.
    raise ValueError("No file was uploaded; re-run this cell and choose a video file.")
# Only the first uploaded file is analyzed.
video_path = next(iter(uploaded))

Saving A one minute TEDx Talk for the digital age _ Woody Roseland _ TEDxMileHigh.mp4 to A one minute TEDx Talk for the digital age _ Woody Roseland _ TEDxMileHigh.mp4


## Working Functions

In [46]:
class SimpleVideoAnalyzer:
    """Extract simple visual features from a video file.

    The analyzer opens the video once with OpenCV and offers four independent
    extractors (shot cuts, motion, on-screen text, YOLO detections) plus
    ``extract_all_features`` which runs them all and releases the capture.
    """

    def __init__(self, video_path):
        """Open ``video_path`` and cache fps, frame count, width and height."""
        self.video_path = video_path
        self.cap = cv2.VideoCapture(video_path)

        # Get video properties
        self.fps = self.cap.get(cv2.CAP_PROP_FPS)
        self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
        self.width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Report the file this instance was built for (this used to run at
        # class-definition time against a global variable).
        print(f"Video: {Path(video_path).name}")

    @staticmethod
    def _normalize_sample_rate(sample_rate):
        """Coerce ``sample_rate`` to an int >= 1 so modulo/range steps never hit zero."""
        return max(1, int(sample_rate))

    def detect_cuts(self, threshold=0.4, sample_rate=10):
        """Detect shot cuts by comparing grayscale histograms of sampled frames.

        Args:
            threshold: a sampled frame whose histogram correlation with the
                previous sampled frame is below this value counts as a cut.
            sample_rate: compare every ``sample_rate``-th frame.

        Returns:
            dict with ``total_cuts``, ``cuts_per_minute`` (0 when the duration
            is unknown) and ``sample_cuts`` (first 10 cuts; each has ``frame``,
            ``time`` in seconds or None when fps is unknown, and ``similarity``).
        """
        print("Looking for shot cuts...", end='')
        sample_rate = self._normalize_sample_rate(sample_rate)
        self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

        has_fps = self.fps > 0
        cuts_found = []
        previous_hist = None
        frame_no = 0

        while True:
            ok, frame = self.cap.read()
            if not ok:
                break

            # Only every sample_rate-th frame is compared
            if frame_no % sample_rate == 0:
                gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                hist = cv2.calcHist([gray_frame], [0], None, [64], [0, 256])
                hist = cv2.normalize(hist, hist).flatten()

                if previous_hist is not None:
                    similarity = cv2.compareHist(previous_hist, hist, cv2.HISTCMP_CORREL)

                    # Low correlation between consecutive samples => treat as a cut
                    if similarity < threshold:
                        cuts_found.append({
                            'frame': frame_no,
                            # Timestamp is undefined when the container reports no fps
                            'time': round(frame_no / self.fps, 2) if has_fps else None,
                            'similarity': round(float(similarity), 3)
                        })

                previous_hist = hist

            # Progress
            if frame_no % 100 == 0:
                print('.', end='')

            frame_no += 1

        total_cuts = len(cuts_found)
        # Guard both fps == 0 and total_frames == 0 to avoid dividing by zero
        duration_min = self.total_frames / self.fps / 60 if has_fps else 0
        cuts_per_min = round(total_cuts / duration_min, 2) if duration_min > 0 else 0

        return {
            'total_cuts': total_cuts,
            'cuts_per_minute': cuts_per_min,
            'sample_cuts': cuts_found[:10]
        }

    def analyze_motion(self, sample_rate=30):
        """Estimate motion from Lucas-Kanade optical flow between sampled frames.

        Args:
            sample_rate: compare every ``sample_rate``-th frame (counted from
                the frame after the first one) with the previous sampled frame.

        Returns:
            ``{'error': ...}`` if the first frame cannot be read;
            ``{'average_motion': 0, 'motion_level': 'none'}`` if no flow could
            be measured; otherwise a dict with ``average_motion``,
            ``max_motion``, ``min_motion`` and ``motion_level``
            ('high' if average > 10, 'medium' if > 5, else 'low').
        """
        print("Analyzing motion.", end='')
        sample_rate = self._normalize_sample_rate(sample_rate)
        self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

        ok, first_frame = self.cap.read()
        if not ok:
            return {'error': 'Could not grab first frame'}

        prev_gray = cv2.cvtColor(first_frame, cv2.COLOR_BGR2GRAY)
        motion_values = []
        idx = 0

        while True:
            ret, frame = self.cap.read()
            if not ret:
                break

            if idx % sample_rate == 0:
                gray_now = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                # Up to 100 corners, quality level 0.3, minimum distance 7
                corners = cv2.goodFeaturesToTrack(prev_gray, 100, 0.3, 7)

                if corners is not None:
                    next_pts, status, _ = cv2.calcOpticalFlowPyrLK(prev_gray, gray_now, corners, None)
                    if next_pts is not None and status is not None:
                        # Keep only the points whose flow was found (status == 1)
                        old = corners[status == 1]
                        new = next_pts[status == 1]

                        if len(old) > 0:
                            # Mean displacement of the tracked corners
                            motion_amt = np.mean(np.linalg.norm(new - old, axis=1))
                            motion_values.append(float(motion_amt))

                prev_gray = gray_now

            if idx % 100 == 0:
                print('.', end='')

            idx += 1

        if not motion_values:
            return {'average_motion': 0, 'motion_level': 'none'}

        avg_motion = np.mean(motion_values)
        motion_tag = 'high' if avg_motion > 10 else 'medium' if avg_motion > 5 else 'low'

        return {
            'average_motion': round(float(avg_motion), 2),
            'max_motion': round(float(np.max(motion_values)), 2),
            'min_motion': round(float(np.min(motion_values)), 2),
            'motion_level': motion_tag
        }

    def detect_text(self, sample_rate=60):
        """Run Tesseract OCR on sampled frames and summarize how often text appears.

        Args:
            sample_rate: OCR every ``sample_rate``-th frame.

        Returns:
            dict with ``frames_checked`` (sampled frames that were OCR'd),
            ``frames_with_text`` (those whose OCR output is longer than 5
            characters), ``text_frame_ratio`` (their quotient) and
            ``example_words`` (up to 10 distinct words, in first-seen order).
        """
        print("Checking for on-screen text.", end='')
        sample_rate = self._normalize_sample_rate(sample_rate)
        self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

        frames_with_text = 0
        checked = 0
        found_words = []
        frame_counter = 0

        while True:
            success, frame = self.cap.read()
            if not success:
                break

            if frame_counter % sample_rate == 0:
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

                txt = pytesseract.image_to_string(binary, config='--psm 11').strip()
                # Count every OCR'd frame, not just the ones with text; otherwise
                # the ratio below can only ever be 0 or 1.
                checked += 1

                if len(txt) > 5:
                    frames_with_text += 1
                    # Keep at most 5 alphabetic words longer than 3 chars per frame
                    words = [w.lower() for w in txt.split() if len(w) > 3 and w.isalpha()]
                    found_words.extend(words[:5])

            # Progress is reported per frame index, like the other extractors
            if frame_counter % 300 == 0:
                print('.', end='')

            frame_counter += 1

        ratio = frames_with_text / max(checked, 1)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # sample is reproducible (set ordering varies between runs).
        unique_words = list(dict.fromkeys(found_words))[:10]

        return {
            'text_frame_ratio': round(float(ratio), 3),
            'frames_with_text': frames_with_text,
            'frames_checked': checked,
            'example_words': unique_words
        }

    def detect_objects_and_people(self, sample_rate=30):
        """Count YOLO detections on sampled frames, split into people and other objects.

        A separate capture is opened for this pass and always released.

        Args:
            sample_rate: run the detector on every ``sample_rate``-th frame.

        Returns:
            dict with ``people_count`` (boxes with class id 0),
            ``object_count`` (all other boxes), ``total_detections``,
            ``person_ratio`` and ``frames_analyzed``. Counts are summed over
            frames, so the same person in several frames is counted each time.
        """
        print("Detecting ppl/obj using YOLO model", end='')

        sample_rate = self._normalize_sample_rate(sample_rate)
        yolo_model = YOLO('yolov8n.pt')
        cap = cv2.VideoCapture(self.video_path)

        people = 0
        objects = 0
        frames_seen = 0

        try:
            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            for n in range(0, total, sample_rate):
                # Seek directly to the sampled frame instead of decoding every frame
                cap.set(cv2.CAP_PROP_POS_FRAMES, n)
                ret, frame = cap.read()
                if not ret:
                    break

                preds = yolo_model(frame, verbose=False)
                frames_seen += 1

                for res in preds:
                    for box in res.boxes:
                        cid = int(box.cls[0])
                        # Class id 0 is treated as "person" (presumably the COCO
                        # label map of the stock yolov8n weights - confirm if the
                        # weights file is ever swapped)
                        if cid == 0:
                            people += 1
                        else:
                            objects += 1
        finally:
            # The local capture was previously leaked
            cap.release()

        total_detections = people + objects
        return {
            'people_count': people,
            'object_count': objects,
            'total_detections': total_detections,
            'person_ratio': round(people / max(total_detections, 1), 2),
            'frames_analyzed': frames_seen
        }

    def extract_all_features(self):
        """Run every extractor and return their results in one dict.

        Returns:
            dict with keys ``video_info``, ``shot_cuts``, ``motion_analysis``,
            ``text_detection`` and ``object_person_detection``.

        Note:
            ``self.cap`` is released when this method finishes (also on error),
            so create a new analyzer to process the video again.
        """
        features = {}

        features['video_info'] = {
            'filename': os.path.basename(self.video_path),
            'fps': self.fps,
            'total_frames': self.total_frames,
            'width': self.width,
            'height': self.height
        }

        try:
            features['shot_cuts'] = self.detect_cuts()
            features['motion_analysis'] = self.analyze_motion()
            features['text_detection'] = self.detect_text()
            features['object_person_detection'] = self.detect_objects_and_people()
        finally:
            # Release the capture even if one of the extractors raises
            self.cap.release()

        return features


Video: A one minute TEDx Talk for the digital age _ Woody Roseland _ TEDxMileHigh.mp4


In [47]:
# Build an analyzer for the uploaded clip, then run every extractor in one pass.
analyzer = SimpleVideoAnalyzer(video_path=video_path)
results = analyzer.extract_all_features()


Looking for shot cuts.......................Analyzing motion.....................Checking for on-screen text........Detecting ppl/obj using YOLO model

In [44]:
import json
from google.colab import files

def make_json_safe(obj):
    """Recursively convert ``obj`` into something ``json.dump`` can serialize.

    - dicts are rebuilt with converted values (keys are left untouched);
    - lists, tuples, sets and frozensets become lists of converted items, so a
      set nested inside a tuple is converted too;
    - NumPy arrays become nested lists and NumPy scalars become plain Python
      scalars;
    - anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: make_json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [make_json_safe(v) for v in obj]
    if isinstance(obj, np.ndarray):
        # tolist() yields Python scalars; recurse in case of object arrays
        return make_json_safe(obj.tolist())
    if isinstance(obj, np.generic):
        # e.g. np.float32 / np.int64, which json cannot encode directly
        return obj.item()
    return obj

# Convert results to JSON-safe format
safe_results = make_json_safe(results)

# Serialize before opening the file: if serialization fails, no truncated or
# half-written JSON file is left on disk.
output_filename = 'video_output.json'
serialized_results = json.dumps(safe_results, indent=2)
with open(output_filename, 'w') as f:
    f.write(serialized_results)

print(f"Results saved to {output_filename}")

# Hand the file to the browser for download (Colab only)
files.download(output_filename)
print("File downloaded!")


Results saved to video_output.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

File downloaded!
