In [1]:
!pip install ffmpeg-python pandas nltk seaborn matplotlib openai-whisper pandas
# ffmpeg-python: Used to extract audio from video files and convert them to mono 16kHz WAV format.
# pandas: Used for data manipulation, creating dataframes, and exporting the final transcription and analysis results as a CSV.
# nltk: Used for natural language processing tasks, specifically sentiment analysis using the VADER model.
# seaborn: Used for visualizing data, such as sentiment distribution.
# matplotlib: Used for plotting histograms and other visual representations of data.
# openai-whisper: A pre-trained model used for automatic speech recognition (ASR) to generate transcriptions from audio.

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (253.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.1/253.1 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: openai-whisper
  Bu

In [2]:
!pip install "pyannote.audio[all]"
!pip install torch torchvision torchaudio
!pip install librosa
# pyannote.audio[all]: Used for speaker diarization to identify and segment speakers within the audio locally.
# torch: Core library required to run Whisper and pyannote.audio models.
# torchvision: Included as part of the PyTorch ecosystem (not directly used here but required in some environments).
# torchaudio: Handles audio processing within the PyTorch ecosystem.
# librosa: Audio analysis library; helpful for processing and analyzing audio signals (optional but useful for extra features).

Collecting pyannote.audio[all]
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
[0mCollecting asteroid-filterbanks>=0.4 (from pyannote.audio[all])
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio[all])
  Downloading lightning-2.5.1-py3-none-any.whl.metadata (39 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio[all])
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio[all])
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio[all])
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio[all])
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio[all])
  Downloading pytorch_metric_

# **Step-by-Step Workflow**

## **Step 1: Data Ingestion**
#### Loads all video files, extracts audio, and converts it to mono 16kHz WAV format for further processing.

## **Step 2: Transcription**
#### Uses Whisper (offline model) to transcribe spoken words in the audio into text along with start and end timestamps.

## **Step 3: Speaker Diarization**
#### Uses a pre-trained local model (like pyannote) to assign speaker labels to different segments of the audio, distinguishing between different voices.

## **Step 4: Time Bucketing**
#### Segments each transcription line into 5-second intervals based on start time for better temporal analysis and aggregation.

## **Step 5: Sentiment Analysis**
#### Analyzes the sentiment of each transcribed text segment using NLTK's VADER model and labels them as positive, negative, or neutral.

## **Step 6: Named Entity Recognition (NER)**
#### Applies a Hugging Face NER model to detect and extract entities (like names, places, orgs) from each text segment.

## **Step 7: Export to CSV**
#### Combines all processed results (transcription, speakers, timestamps, sentiment, entities) into a single structured CSV for analysis or visualization.

In [4]:
import os
import subprocess
import pandas as pd
import nltk
import json
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer
import ffmpeg

In [6]:
nltk.download('vader_lexicon')  # Required for sentiment analysis
nltk.download('punkt')          # Tokenizer for text processing

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# **Step 1: Data Ingestion (Video to Audio Conversion)** 
#### * Load video files from the input directory.
#### * Extract audio from each video using ffmpeg.
#### * Convert audio to mono and 16kHz .wav format.
#### * Store the resulting audio files in an output folder (e.g., ./output_audio).

In [7]:
import os
import subprocess

def extract_audio(video_path, output_audio_path):
    """
    Extracts audio from a single video and saves it as mono 16kHz WAV.

    Parameters:
        video_path (str): Path to the input video file.
        output_audio_path (str): Path where the output audio will be saved.

    Returns:
        str: Path to the saved audio file.
    """
    cmd = [
        "ffmpeg", "-i", video_path, "-q:a", "0", "-map", "a",
        "-ac", "1", "-ar", "16000",  # Mono & 16kHz
        output_audio_path, "-y"
    ]
    subprocess.run(cmd, capture_output=True, text=True)
    return output_audio_path

def process_all_videos(input_video_dir):
    """
    Processes all video files in the input directory,
    extracts audio, and saves them to ./output_audio/.

    Parameters:
        input_video_dir (str): Directory containing video files.
    """
    output_audio_dir = "./output_audio"
    os.makedirs(output_audio_dir, exist_ok=True)

    video_extensions = (".mp4", ".mov", ".avi", ".mkv")

    for filename in os.listdir(input_video_dir):
        if filename.lower().endswith(video_extensions):
            video_path = os.path.join(input_video_dir, filename)
            base_name = os.path.splitext(filename)[0]
            output_audio_path = os.path.join(output_audio_dir, f"{base_name}.wav")

            extract_audio(video_path, output_audio_path)
            print(f"✅ Processed: {filename} → {output_audio_path}")

# 🟡 Call this function with your actual input folder containing videos
input_video_directory = "Your input video directory"
process_all_videos(input_video_directory)  # Change path as needed


✅ Processed: Experimenter_CREW_999_1_All_1731617801.mp4 → ./output_audio/Experimenter_CREW_999_1_All_1731617801.wav


# **Step 2: Transcription (Using Whisper)**
#### * Load a pre-trained Whisper model (base, small, etc.).
#### * Transcribe each .wav file from the audio output directory.
#### * Store transcription segments with timestamps and text.

In [8]:
import os
import whisper

def transcribe_all_audio(audio_dir):
    """
    Transcribes all .wav files in a directory using Whisper.

    Parameters:
        audio_dir (str): Path to the directory containing .wav files.

    Returns:
        dict: Mapping from audio filename to list of transcription segments.
    """
    model = whisper.load_model("base")  # You can try 'small' or 'medium' if GPU is available
    transcripts = {}

    for file in os.listdir(audio_dir):
        if file.endswith(".wav"):
            audio_path = os.path.join(audio_dir, file)
            print(f"🔍 Transcribing: {file} ...")
            result = model.transcribe(audio_path)
            transcripts[file] = result["segments"]  # Each segment: dict with 'start', 'end', 'text'
            print(f"✅ Transcribed: {file} → {len(result['segments'])} segments")

    return transcripts

# 🟢 Run this on your extracted audio directory
audio_dir = "./output_audio"
all_transcriptions = transcribe_all_audio(audio_dir)


100%|███████████████████████████████████████| 139M/139M [00:04<00:00, 32.7MiB/s]
  checkpoint = torch.load(fp, map_location=device)


🔍 Transcribing: Experimenter_CREW_999_1_All_1731617801.wav ...
✅ Transcribed: Experimenter_CREW_999_1_All_1731617801.wav → 78 segments


### **Print the first 5 transcription segments from the first audio file**

In [9]:
first_audio_file = list(all_transcriptions.keys())[0]  # Get the first file name
first_five_segments = all_transcriptions[first_audio_file][:5]  # First 5 segments

for i, segment in enumerate(first_five_segments, 1):
    print(f"Segment {i}:")
    print(f"Start: {segment['start']}s")
    print(f"End: {segment['end']}s")
    print(f"Text: {segment['text']}")
    print("-" * 40)

Segment 1:
Start: 0.0s
End: 13.36s
Text:  Okay, so the drive you're going to complete and use the E-cautomation and the object detection
----------------------------------------
Segment 2:
Start: 13.36s
End: 17.400000000000002s
Text:  system so that it will not be to operate the vehicle, so keep your hands off this steering
----------------------------------------
Segment 3:
Start: 17.400000000000002s
End: 19.88s
Text:  wheel and meet off the pedals and punch me in that drive.
----------------------------------------
Segment 4:
Start: 19.88s
End: 20.88s
Text:  Okay.
----------------------------------------
Segment 5:
Start: 20.88s
End: 23.8s
Text:  So when you see that some driver in the caterer highlight green, make sure you don't get
----------------------------------------


# **Step 3: Speaker Diarization (Using Local Pyannote Pretrained Model)** 
#### * Use a pretrained diarization pipeline from pyannote.audio (e.g., pyannote/speaker-diarization).
#### * Run speaker diarization on each .wav file.
#### * Assign speaker labels (e.g., Speaker 0, Speaker 1) to time-stamped segments.
#### * Merge speaker info with Whisper transcription based on time overlap.

In [10]:
from huggingface_hub import login
login(token="HUGGING FACE TOKEN")

In [11]:
from pyannote.audio import Pipeline
from huggingface_hub import login
from datetime import timedelta

 # Replace with your actual token

# Load pretrained speaker diarization pipeline
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=True)

def diarize_audio_pyannote(audio_path):
    """
    Perform speaker diarization using pyannote and return segments with speaker info.
    
    Parameters:
        audio_path (str): Path to the WAV audio file.

    Returns:
        list of dicts: Speaker segments with start_time, end_time, speaker label.
    """
    diarization = pipeline(audio_path, num_speakers=15)  # 💡 Force 15 speakers

    segments = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        segments.append({
            "start": turn.start,
            "end": turn.end,
            "speaker": speaker
        })
    return segments
def assign_speakers_to_transcripts(whisper_segments, speaker_segments):
    """
    Match Whisper transcription segments to speaker segments by overlap.

    Parameters:
        whisper_segments (list): List of Whisper segments with 'start', 'end', 'text'.
        speaker_segments (list): List of diarization segments with 'start', 'end', 'speaker'.

    Returns:
        list of dicts: Each dict contains transcription + assigned speaker.
    """
    enriched = []
    for ws in whisper_segments:
        speaker = "Unknown"
        for ss in speaker_segments:
            if ss["start"] <= ws["start"] < ss["end"]:
                speaker = ss["speaker"]
                break
        enriched.append({
            "start_time": round(ws["start"], 2),
            "end_time": round(ws["end"], 2),
            "text": ws["text"],
            "speaker": speaker
        })
    return enriched
all_transcripts_with_speakers = {}

for file in os.listdir(audio_dir):
    if file.endswith(".wav"):
        audio_path = os.path.join(audio_dir, file)
        base_name = os.path.splitext(file)[0]

        print(f"🎙️ Running diarization for {file}...")
        speaker_segs = diarize_audio_pyannote(audio_path)

        whisper_segs = all_transcriptions[file]  # From Step 2
        enriched = assign_speakers_to_transcripts(whisper_segs, speaker_segs)

        all_transcripts_with_speakers[base_name] = enriched
        print("-" * 60)

        for i, line in enumerate(enriched[:5], start=1):
            print(f"{i}. [{line['speaker']}] {line['text']} (Start: {line['start_time']}s, End: {line['end_time']}s)")

        print("-" * 60)

config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu121. Bad things might happen unless you revert torch to 1.x.


hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)


🎙️ Running diarization for Experimenter_CREW_999_1_All_1731617801.wav...
Found only 5 clusters. Using a smaller value than 15 for `min_cluster_size` might help.


The detected number of speakers (5) is outside
the given bounds [15, 15]. This can happen if the
given audio file is too short to contain 15 or more speakers.
Try to lower the desired minimal number of speakers.



✅ Speaker assignment complete for Experimenter_CREW_999_1_All_1731617801.wav
{'start_time': 0.0, 'end_time': 13.36, 'text': " Okay, so the drive you're going to complete and use the E-cautomation and the object detection", 'speaker': 'Unknown'}
{'start_time': 13.36, 'end_time': 17.4, 'text': ' system so that it will not be to operate the vehicle, so keep your hands off this steering', 'speaker': 'SPEAKER_02'}
{'start_time': 17.4, 'end_time': 19.88, 'text': ' wheel and meet off the pedals and punch me in that drive.', 'speaker': 'SPEAKER_02'}
{'start_time': 19.88, 'end_time': 20.88, 'text': ' Okay.', 'speaker': 'SPEAKER_02'}
{'start_time': 20.88, 'end_time': 23.8, 'text': " So when you see that some driver in the caterer highlight green, make sure you don't get", 'speaker': 'SPEAKER_02'}


In [13]:
print("-" * 60)
for i, line in enumerate(enriched[:5], start=1):
    print(f"{i}. [{line['speaker']}] {line['text']} (Start: {line['start_time']}s, End: {line['end_time']}s)")

print("-" * 60)

------------------------------------------------------------
1. [Unknown]  Okay, so the drive you're going to complete and use the E-cautomation and the object detection (Start: 0.0s, End: 13.36s)
2. [SPEAKER_02]  system so that it will not be to operate the vehicle, so keep your hands off this steering (Start: 13.36s, End: 17.4s)
3. [SPEAKER_02]  wheel and meet off the pedals and punch me in that drive. (Start: 17.4s, End: 19.88s)
4. [SPEAKER_02]  Okay. (Start: 19.88s, End: 20.88s)
5. [SPEAKER_02]  So when you see that some driver in the caterer highlight green, make sure you don't get (Start: 20.88s, End: 23.8s)
------------------------------------------------------------


# **Step 4: Bucket Transcriptions by Time**
#### * Organize transcribed text into 5-second time buckets.
#### * Each segment is assigned to a bucket_start and bucket_end (e.g., 0–5 sec, 5–10 sec).
#### * Store all segments with their time buckets, speakers, and texts.

In [16]:
import math

def assign_buckets(transcript_segments, bucket_size=5):
    """
    Adds 5-second time bucket information to each transcription segment.

    Parameters:
        transcript_segments (list): List of transcription segments with 'start_time' and 'end_time'.
        bucket_size (int): Size of each time bucket in seconds (default: 5).

    Returns:
        list of dicts: Updated segments with bucket_start and bucket_end.
    """
    for segment in transcript_segments:
        start_bucket = int(math.floor(segment["start_time"] / bucket_size)) * bucket_size
        end_bucket = start_bucket + bucket_size
        segment["bucket_start"] = start_bucket
        segment["bucket_end"] = end_bucket
    return transcript_segments
# Dictionary to store the bucketed transcriptions
bucketed_transcripts = {}
# Assign buckets to all transcription segments
for file_name, segments in all_transcripts_with_speakers.items():
    print(f"Assigning buckets for: {file_name}")
    
    # Assign buckets to the segments
    updated_segments = assign_buckets(segments)
    bucketed_transcripts[file_name] = updated_segments
    
    # Print the first 5 segments 
    print(f"Buckets assigned. First 5 entries:")
    for i, segment in enumerate(updated_segments[:5], start=1):
        print(f"{i}. [Speaker: {segment['speaker']}] {segment['text']} (Start: {segment['start_time']}s, End: {segment['end_time']}s) → Bucket: {segment['bucket_start']}-{segment['bucket_end']}s")

    print("-" * 60)


Assigning buckets for: Experimenter_CREW_999_1_All_1731617801
Buckets assigned. First 5 entries:
1. [Speaker: Unknown]  Okay, so the drive you're going to complete and use the E-cautomation and the object detection (Start: 0.0s, End: 13.36s) → Bucket: 0-5s
2. [Speaker: SPEAKER_02]  system so that it will not be to operate the vehicle, so keep your hands off this steering (Start: 13.36s, End: 17.4s) → Bucket: 10-15s
3. [Speaker: SPEAKER_02]  wheel and meet off the pedals and punch me in that drive. (Start: 17.4s, End: 19.88s) → Bucket: 15-20s
4. [Speaker: SPEAKER_02]  Okay. (Start: 19.88s, End: 20.88s) → Bucket: 15-20s
5. [Speaker: SPEAKER_02]  So when you see that some driver in the caterer highlight green, make sure you don't get (Start: 20.88s, End: 23.8s) → Bucket: 20-25s
------------------------------------------------------------


# **Step 5: Sentiment Analysis (Using NLTK VADER)**
#### * For each transcribed segment, analyze sentiment using the VADER sentiment analyzer.
#### * Assign a label: positive, negative, or neutral based on compound score.
#### * Add sentiment to the segment metadata.

In [17]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download NLTK VADER lexicon (only needed once)
nltk.download("vader_lexicon")

# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """
    Analyzes sentiment of a given text using VADER SentimentIntensityAnalyzer.

    Parameters:
        text (str): The text to analyze.

    Returns:
        str: Sentiment label: "positive", "negative", or "neutral".
    """
    # Analyze sentiment
    sentiment_score = sia.polarity_scores(text)["compound"]

    # Assign sentiment based on the compound score
    if sentiment_score >= 0.05:
        return "positive"
    elif sentiment_score <= -0.05:
        return "negative"
    else:
        return "neutral"

def apply_sentiment_to_segments(transcript_segments):
    """
    Applies sentiment analysis to each transcription segment.

    Parameters:
        transcript_segments (list): List of transcription segments with 'text' and other info.

    Returns:
        list: Updated transcription segments with sentiment.
    """
    for segment in transcript_segments:
        segment["sentiment"] = analyze_sentiment(segment["text"])
    return transcript_segments

# Dictionary to store sentiment-analyzed transcripts
sentimented_transcripts = {}

# Apply sentiment analysis for all files
for file_name, segments in bucketed_transcripts.items():
    print(f" Analyzing sentiment for: {file_name}")
    
    # Apply sentiment analysis to segments
    updated_segments_with_sentiment = apply_sentiment_to_segments(segments)
    sentimented_transcripts[file_name] = updated_segments_with_sentiment
    
    # Print the first 5 segments
    print(f"Sentiment analysis complete. First 5 segments:")
    
    for i, entry in enumerate(updated_segments_with_sentiment[:5], start=1):
        print(f"{i}. [Speaker: {entry['speaker']}] {entry['text']} (Sentiment: {entry['sentiment']})")
        print(f"   Start Time: {entry['start_time']}s, End Time: {entry['end_time']}s")
        print(f"   Bucket: {entry['bucket_start']} - {entry['bucket_end']}s")
    print("-" * 60)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
 Analyzing sentiment for: Experimenter_CREW_999_1_All_1731617801
Sentiment analysis complete. First 5 segments:
1. [Speaker: Unknown]  Okay, so the drive you're going to complete and use the E-cautomation and the object detection (Sentiment: positive)
   Start Time: 0.0s, End Time: 13.36s
   Bucket: 0 - 5s
2. [Speaker: SPEAKER_02]  system so that it will not be to operate the vehicle, so keep your hands off this steering (Sentiment: neutral)
   Start Time: 13.36s, End Time: 17.4s
   Bucket: 10 - 15s
3. [Speaker: SPEAKER_02]  wheel and meet off the pedals and punch me in that drive. (Sentiment: neutral)
   Start Time: 17.4s, End Time: 19.88s
   Bucket: 15 - 20s
4. [Speaker: SPEAKER_02]  Okay. (Sentiment: positive)
   Start Time: 19.88s, End Time: 20.88s
   Bucket: 15 - 20s
5. [Speaker: SPEAKER_02]  So when you see that some driver in the cat

# **Step 6: Named Entity Recognition (NER)**
#### * Use a pre-trained Hugging Face model (e.g., dslim/bert-base-NER).
#### * Detect named entities (people, organizations, places) in each transcription segment.
#### * Store named entities alongside their corresponding segment.

In [18]:
# Import necessary libraries
from transformers import pipeline

# Initialize the Hugging Face NER pipeline
ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Function to extract named entities from text
def extract_named_entities(text):
    """
    Extracts named entities from the given text using a pre-trained NER model.

    Parameters:
        text (str): The input text to analyze.

    Returns:
        list: A list of named entities found in the text.
    """
    ner_results = ner_model(text)
    # Extract words (entities) from the NER output
    named_entities = [result['word'] for result in ner_results]
    return named_entities

# Function to apply NER to each transcription segment
def apply_ner_to_segments(transcript_segments):
    """
    Applies Named Entity Recognition (NER) to each transcription segment.

    Parameters:
        transcript_segments (list): List of transcription segments.

    Returns:
        list: Updated transcription segments with named entities.
    """
    for segment in transcript_segments:
        # Apply NER to each segment's text
        segment["named_entities"] = extract_named_entities(segment["text"])
    return transcript_segments

# Example usage
# Assuming `sentimented_transcripts` is the dictionary containing all transcribed segments along with sentiment
nered_transcripts = {}

for file_name, segments in sentimented_transcripts.items():
    print(f"🔍 Extracting named entities for: {file_name}")
    updated_segments_with_ner = apply_ner_to_segments(segments)
    nered_transcripts[file_name] = updated_segments_with_ner
    print(f"✅ NER complete for {file_name}. First 3 segments with named entities:")
    
    # Display the first 3 segments with named entities
    for entry in updated_segments_with_ner[:3]:
        print(entry)
for file_name, segments in nered_transcripts.items():
    print(f"\n📄 First 5 segments with Named Entities from {file_name}:")
    for i, entry in enumerate(segments[:5], start=1):
        print(f"{i}. [Speaker: {entry['speaker']}] {entry['text']} (Sentiment: {entry['sentiment']})")
        print(f"   Named Entities: {', '.join(entry['named_entities'])}")
        print(f"   Start Time: {entry['start_time']}s, End Time: {entry['end_time']}s")
        print(f"   Bucket: {entry['bucket_start']} - {entry['bucket_end']}s")
    print("-" * 60)


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Device set to use cuda:0


🔍 Extracting named entities for: Experimenter_CREW_999_1_All_1731617801


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✅ NER complete for Experimenter_CREW_999_1_All_1731617801. First 3 segments with named entities:
{'start_time': 0.0, 'end_time': 13.36, 'text': " Okay, so the drive you're going to complete and use the E-cautomation and the object detection", 'speaker': 'Unknown', 'bucket_start': 0, 'bucket_end': 5, 'sentiment': 'positive', 'named_entities': []}
{'start_time': 13.36, 'end_time': 17.4, 'text': ' system so that it will not be to operate the vehicle, so keep your hands off this steering', 'speaker': 'SPEAKER_02', 'bucket_start': 10, 'bucket_end': 15, 'sentiment': 'neutral', 'named_entities': []}
{'start_time': 17.4, 'end_time': 19.88, 'text': ' wheel and meet off the pedals and punch me in that drive.', 'speaker': 'SPEAKER_02', 'bucket_start': 15, 'bucket_end': 20, 'sentiment': 'neutral', 'named_entities': []}

📄 First 5 segments with Named Entities from Experimenter_CREW_999_1_All_1731617801:
1. [Speaker: Unknown]  Okay, so the drive you're going to complete and use the E-cautomation and

# **Step 7: Export Results to CSV**
#### * Combine all enriched segment data into rows (start time, end time, speaker, text, sentiment, named entities, etc.).
#### * Export the final dataset into a structured .csv file.

In [20]:
import pandas as pd

def export_to_csv(nered_transcripts, output_file):
    """
    Exports the transcription data with NER and sentiment information into a CSV file.

    Parameters:
        nered_transcripts (dict): Dictionary of transcriptions with sentiment and NER data.
        output_file (str): Path to the output CSV file where data will be saved.
    """
    rows = []

    # Iterate through all transcriptions and extract relevant data
    for file_name, segments in nered_transcripts.items():
        for segment in segments:
            row = {
                "start_time": segment["start_time"],
                "end_time": segment["end_time"],
                "bucket_start": int(segment["start_time"] // 5) * 5,  # 5-second bucket start
                "bucket_end": (int(segment["start_time"] // 5) + 1) * 5,  # 5-second bucket end
                "text": segment["text"],
                "sentiment": segment["sentiment"],
                "named_entities": ", ".join(segment["named_entities"]),  # Join named entities in a string
                "word_count": len(segment["text"].split()),  # Count words in the segment
                "speaker": segment["speaker"]
            }
            rows.append(row)

    # Convert the rows to a DataFrame
    df = pd.DataFrame(rows)

    # Export the DataFrame to a CSV file
    df.to_csv(output_file, index=False)
    print(f"✅ Data has been successfully exported to {output_file}")

# Example usage
output_csv_path = "transcriptions_with_ner_and_sentiment.csv"
export_to_csv(nered_transcripts, output_csv_path)


✅ Data has been successfully exported to transcriptions_with_ner_and_sentiment.csv
