In [4]:
import pandas as pd
import os

skill_def = pd.read_csv('resources/skill_definitions.csv')
skill_def
prof_level = pd.read_csv('resources/Proficiency Levels (1).xlsx - Sheet1.csv')
prof_level

Unnamed: 0,Proficiency Code,Proficiency Level,Proficiency Description
0,1,Awareness,Demonstrates foundational understanding of con...
1,2,Basic,Demonstrates ability to apply basic understand...
2,3,Intermediate,Demonstrates ability to apply understanding of...
3,4,Advanced,Demonstrates ability to apply understanding of...
4,5,Expert,Demonstrates ability to apply understanding of...


### First approach using huggingface classifier (bad)

In [2]:
from transformers import pipeline

input_text = """
The image sequence appears to be a presentation slide or series of slides related to the Federal Aviation Administration (FAA), prominently featuring the FAA logo—a globe with a winged emblem and the text "Federal Aviation Administration" encircling it. The slides vary slightly in design, with some including airplane icons and a text bar reading "This is the FAA." The background is a gradient of blue shades. The slides emphasize the FAA\'s role in aviation, displaying statistics such as 5,000 flights in the air at any given time and 44,000 flights each day, with 2.6 million passengers traveling daily and 9.7 million scheduled passenger flights annually. Key organizations highlighted include the Air Traffic Organization (ATO), Office of Airports (ARP), Aviation Safety (AVS), Security and Hazardous Materials (ASH), and Commercial Space Transportation (AST), underscoring the complexity and scale of air travel management and safety.
"""

# Load a pre-trained zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define your classification labels and their desciptions
labels = skill_def['Skill'] + ' (' + skill_def['Skill Type'] + ')'

# Perform zero-shot classification
result = classifier(input_text, candidate_labels=labels)

# Display the results
print("Input Text:", input_text)
print("Predicted Labels with Scores:")
for label, score in zip(result["labels"], result["scores"]):
    print(f"{label}: {score:.4f}")

# Optional: Save results to a DataFrame for further analysis
results_df = pd.DataFrame({
    "Label": result["labels"],
    "Score": result["scores"]
}).sort_values(by='Score', ascending=False)
print("\nResults DataFrame:")
print(results_df)

: 

: 

### Setting up reading metadata

In [25]:
from openai import OpenAI
import json, time

class TextLabeler:
    
    def __init__(self, data_path: str, skill_def: pd.DataFrame, proficiency_def: pd.DataFrame, client: OpenAI, model: str, max_retries: int = 3, delay: float = 1.0):
        self.data_path = data_path
        with open(data_path, 'r') as file:
            self.data = json.load(file)
        print()
        self.skill_def = skill_def
        self.proficiency_def = proficiency_def  # ✅ FIX HERE
        self.client = client
        self.model = model
        self.max_retries = max_retries
        self.delay = delay
        print(type(self.max_retries))


    def _build_user_prompt_content(self, entry: dict):
        return '\n'.join([f"{key}: {entry[key]}" for key in ['title', 'transcription', 'summary']])
    
    def _build_system_prompt_content(self):
            skill_options = '; '.join(
                f"(id: {i}, label: {row['Skill']})" for i, row in self.skill_def.iterrows()
            )
            proficiency_options = '; '.join(
                f"(code: {row['Proficiency Code']}, level: {row['Proficiency Level']})"
                for _, row in self.proficiency_def.iterrows()
            )
            return f"""You are labeling Delta airline training content. 
        Pick the most appropriate SKILL LABELS AND ASSOCIATE EACH with a PROFICIENCY LEVEL. Tend to choose lower profiency levels
        Choose ONLY from the following:

        Skills: {skill_options}
        Proficiency Levels: {proficiency_options}

        You must output 3–5 skill IDs with their proficiency code in DESCENDING ORDER OF RELEVANCE. 

        STRICT OUTPUT FORMAT (NO EXTRA TEXT): 
        Labels: <skill_id>:<proficiency_code>, <skill_id>:<proficiency_code>, ..."""
    def build_labels(self, entry: dict):
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self._build_system_prompt_content()},
                {"role": "user", "content": self._build_user_prompt_content(entry)},
            ],
            stream=False
        )

        raw_content = response.choices[0].message.content.strip()
        print("🔹 Raw model response:", raw_content)

        label_section = raw_content[len("Labels:"):].strip()
        label_pairs = [pair.strip() for pair in label_section.split(',') if ':' in pair]

        skill_proficiency = []
        for pair in label_pairs:
            try:
                skill_id, prof_code = map(int, pair.split(':'))
                skill_row = self.skill_def.iloc[skill_id]
                prof_row = self.proficiency_def[self.proficiency_def['Proficiency Code'] == prof_code].iloc[0]

                skill_proficiency.append({
                    'Skill': skill_row['Skill'],
                    'Proficiency': prof_row['Proficiency Level'],
                    'Description': prof_row['Proficiency Description']
                })
            except Exception as e:
                print(f"⚠️ Skipping malformed label pair '{pair}': {e}")
                continue

        return skill_proficiency

    def revise_metadata_labels(self):
        for entry in self.data['data']:
            print(f"Generating labels for {entry['title']}...")
            for attempt in range(self.max_retries):
                try:
                    new_labels = self.build_labels(entry)
                except:
                    print("An error has occured.")
                    if attempt == self.max_retries - 1:
                        raise
                    time.sleep(self.delay * (attempt + 1))
                else:
                    break
            entry['labeled_skills'] = new_labels
            
            # For saving progress
            with open(self.data_path, 'w') as file:
                json.dump(self.data, file, indent=4)
            print(f"Added labels for {entry['title']}")
        print('Added all entry labels successfully.')

### Second approach using deepseek chat (much better results)

In [26]:
deep_seek_client = OpenAI(api_key="sk-REDACTED", base_url="https://api.deepseek.com")
deep_seek_tl = TextLabeler('resources/metadata.json', skill_def, prof_level, deep_seek_client, 'deepseek-chat')
deep_seek_tl.revise_metadata_labels() 


<class 'int'>
Generating labels for Fire Extinguisher Safety - AFIRE100.mp4...
🔹 Raw model response: Labels: 360:2, 277:2, 117:2, 30:2
Added labels for Fire Extinguisher Safety - AFIRE100.mp4
Generating labels for fingers.mp4...
🔹 Raw model response: Labels: 17:2, 136:2, 261:2, 93:2, 342:2
Added labels for fingers.mp4
Generating labels for Information Security Awareness - Standard (Recurrent).mp4...
🔹 Raw model response: Labels: 90:1, 91:1, 161:1, 207:1, 265:1
Added labels for Information Security Awareness - Standard (Recurrent).mp4
Added all entry labels successfully.


### Third approach using gpt 4o mini (slightly worse results)

In [3]:
gpt_4o_mini_client = OpenAI(api_key="sk-REDACTED")
gpt_4o_mini_tl = TextLabeler('resources/demo1.json', skill_def, gpt_4o_mini_client, 'gpt-4o-mini')
gpt_4o_mini_tl.revise_metadata_labels()

Generating labels for Fingerprint Technician - Fingerprint Capture Training...
Labels: 17, 18, 340, 40, 7
Added labels for Fingerprint Technician - Fingerprint Capture Training
Generating labels for CCT - Privacy and Data Protection...
Labels: 265, 340, 370, 374, 219
An error has occured.
Labels: 65, 268, 272, 216, 280
Added labels for CCT - Privacy and Data Protection
Generating labels for Fire Extinguisher Safety...
Labels: 117, 275, 277, 340
Added labels for Fire Extinguisher Safety
Generating labels for Delta Incident Reporting training course video capture (NO AUDIO)...
Labels: 30, 275, 278, 340, 336
Added labels for Delta Incident Reporting training course video capture (NO AUDIO)
Generating labels for Corporate Safety - Incident Investigation...
Labels: 30, 275, 277, 72, 65
Added labels for Corporate Safety - Incident Investigation
Generating labels for Interview Skills For Hiring Managers...
Labels: 9, 340, 37, 40, 221
Added labels for Interview Skills For Hiring Managers
Gener

### Fourth approach using gpt 4.1 (about the same as deepseek)

In [6]:
gpt_4_client = OpenAI(api_key="sk-REDACTED")
gpt_4_tl = TextLabeler('resources/bedrock_demo.json', skill_def, gpt_4_client, 'gpt-4.1-2025-04-14')
gpt_4_tl.revise_metadata_labels()

Generating labels for Fingerprint Technician - Fingerprint Capture Training...
Labels: 340, 329, 17, 261
Added labels for Fingerprint Technician - Fingerprint Capture Training
Generating labels for CCT - Privacy and Data Protection...
Labels: 265, 97, 340, 346, 277
Added labels for CCT - Privacy and Data Protection
Generating labels for Fire Extinguisher Safety...
Labels: 277, 117, 360, 7
Added labels for Fire Extinguisher Safety
Generating labels for Delta Incident Reporting training course video capture (NO AUDIO)...
Labels: 174, 360, 275, 277, 117
Added labels for Delta Incident Reporting training course video capture (NO AUDIO)
Generating labels for Corporate Safety - Incident Investigation...
Labels: 174, 277, 275, 360, 230
Added labels for Corporate Safety - Incident Investigation
Generating labels for Interview Skills For Hiring Managers...
Labels: 340, 9, 319, 181
Added labels for Interview Skills For Hiring Managers
Generating labels for Information Security Awareness - Standa

In [14]:
import cv2  
import base64
from openai import OpenAI

client = OpenAI(api_key='sk-REDACTED')

def query_model(video_path, prompt):
    video = cv2.VideoCapture(video_path)
    
    base64Frames = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))

    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                prompt,
                *map(lambda x: {"image": x, "resize": 768}, base64Frames[0::100000]),
            ],
            'detail': 'high',
        },
    ]
    params = {
        "model": "gpt-4o-mini",
        "messages": PROMPT_MESSAGES,
    }

    result = client.chat.completions.create(**params)
    return result.choices[0].message.content

In [15]:
def build_system_prompt_content(skill_def, proficiency_def):
        skill_options = '; '.join(
            f"(id: {i}, label: {row['Skill']})" for i, row in skill_def.iterrows()
        )
        proficiency_options = '; '.join(
            f"(code: {row['Proficiency Code']}, level: {row['Proficiency Level']})"
            for _, row in proficiency_def.iterrows()
        )
        return f"""You are labeling Delta airline training content. 
        Pick the most appropriate SKILL LABELS AND ASSOCIATE EACH with a PROFICIENCY LEVEL. Tend to choose lower profiency levels
        Choose ONLY from the following:

        Skills: {skill_options}
        Proficiency Levels: {proficiency_options}

        You must output 3–5 skill IDs with their proficiency code in DESCENDING ORDER OF RELEVANCE. 

        STRICT OUTPUT FORMAT (NO EXTRA TEXT): 
        Labels: <skill_id>:<proficiency_code>, <skill_id>:<proficiency_code>, ..."""

video_path = '/Users/faithqiao/Downloads/saas-delta/resources/fingers.mp4'

prompt = build_system_prompt_content(skill_def, prof_level)
response = query_model(video_path, prompt)
response


OpenCV: Couldn't read video stream from file "/Users/faithqiao/Downloads/saas-delta/resources/fingers.mp4"


BadRequestError: Error code: 400 - {'error': {'message': "Invalid type for 'messages[0].content[0]': expected an object, but got a string instead.", 'type': 'invalid_request_error', 'param': 'messages[0].content[0]', 'code': 'invalid_type'}}

In [16]:
import json
import cv2
import base64
from openai import OpenAI
def get_sampled_video_frames(video_path, max_frames=3):
    video = cv2.VideoCapture(video_path)
    base64_frames = []

    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame, [int(cv2.IMWRITE_JPEG_QUALITY), 40])
        base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
    video.release()

    step = max(1, len(base64_frames) // max_frames)
    return base64_frames[::step]

# --- Prompt builders ---
def build_system_prompt(skill_def: pd.DataFrame, proficiency_def: pd.DataFrame) -> str:
    skill_options = '; '.join(
        f"(id: {i}, label: {row['Skill']})" for i, row in skill_def.iterrows()
    )
    proficiency_options = '; '.join(
        f"(code: {row['Proficiency Code']}, level: {row['Proficiency Level']})"
        for _, row in proficiency_def.iterrows()
    )
    return f"""You are labeling Delta airline training content. 
Pick the most appropriate SKILL LABELS AND ASSOCIATE EACH with a PROFICIENCY LEVEL. Tend to choose lower profiency levels
Choose ONLY from the following:

Skills: {skill_options}
Proficiency Levels: {proficiency_options}

You must output 3–5 skill IDs with their proficiency code in DESCENDING ORDER OF RELEVANCE. 

STRICT OUTPUT FORMAT (NO EXTRA TEXT): 
Labels: <skill_id>:<proficiency_code>, <skill_id>:<proficiency_code>, ..."""

def label_video(video_path, skill_def, proficiency_def, model="gpt-4o-mini"):
    frames = get_sampled_video_frames(video_path)
    system_prompt = build_system_prompt(skill_def, proficiency_def)
    user_prompt = f"This is training content from Delta Airlines. Please analyze the video frames from {os.path.basename(video_path)}."

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                user_prompt,
                *[{"image": img, "resize": 768} for img in frames],
            ],
        },
    ]

    response = client.chat.completions.create(model=model, messages=messages)
    raw_output = response.choices[0].message.content.strip()

    # Parse output
    label_text = raw_output[len("Labels:"):].strip() if raw_output.startswith("Labels:") else raw_output
    pairs = [pair.strip() for pair in label_text.split(',') if ':' in pair]

    readable_labels = []
    for pair in pairs:
        try:
            skill_id, _ = map(int, pair.split(":"))
            skill_name = skill_def.iloc[skill_id]['Skill']
            readable_labels.append(skill_name)
        except Exception as e:
            print(f"⚠️ Could not parse label pair '{pair}': {e}")
            continue

    return {
        "raw": raw_output,
        "labels": readable_labels
    }


# --- Main function to walk folder and save to JSON ---
def label_all_videos_in_folder(folder_path, skill_def, proficiency_def, output_path="labeled_data.json"):
    dataset = {"data": []}
    video_files = [f for f in os.listdir(folder_path) if f.endswith(".mp4")]

    for filename in video_files:
        video_path = os.path.join(folder_path, filename)
        print(f"🔍 Processing {filename}...")
        try:
            labels = label_video(video_path, skill_def, proficiency_def)
        except Exception as e:
            print(f"⚠️ Failed on {filename}: {e}")
            labels = "ERROR"

        dataset["data"].append({
            "title": filename,
            "video_path": video_path,
            "labeled_skills": labels
        })

        # Save after each to prevent loss
        with open(output_path, "w") as f:
            json.dump(dataset, f, indent=2)

    print("✅ All videos processed and saved to", output_path)

In [17]:
label_all_videos_in_folder("resources/videos", skill_def, prof_level)

🔍 Processing Fire Extinguisher Safety - AFIRE100.mp4...
✅ All videos processed and saved to labeled_data.json
