In [11]:
import nltk
nltk.download('vader_lexicon')  # Required for sentiment analysis
nltk.download('punkt')          # Tokenizer for text processing

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:

def process_videos_pipeline(
    input_video_dir="input video directory path",
    huggingface_token="Your hugging face api",
    output_audio_dir="output audio directiry path if any",
    output_csv_dir="output transcripted result directory path",
    whisper_model_size="base",
    bucket_size=5,#secofbucket
    max_speakers=15 #maxspeakers
):
    import os
    import subprocess
    import math
    import pandas as pd
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer
    import whisper
    from transformers import pipeline
    from pyannote.audio import Pipeline as DiarizationPipeline
    from huggingface_hub import login

    nltk.download('vader_lexicon')
    nltk.download('punkt')

    # --- Helper Functions ---
    def extract_audio(video_path, output_audio_path):
        cmd = [
            "ffmpeg", "-i", video_path, "-q:a", "0", "-map", "a",
            "-ac", "1", "-ar", "16000", output_audio_path, "-y"
        ]
        subprocess.run(cmd, capture_output=True, text=True)
        return output_audio_path

    def process_all_videos():
        os.makedirs(output_audio_dir, exist_ok=True)
        for filename in os.listdir(input_video_dir):
            if filename.lower().endswith((".mp4", ".mov", ".avi", ".mkv")):
                video_path = os.path.join(input_video_dir, filename)
                output_path = os.path.join(output_audio_dir, f"{os.path.splitext(filename)[0]}.wav")
                extract_audio(video_path, output_path)
                print(f"✅ Extracted: {filename} → {output_path}")

    def transcribe_all_audio():
        model = whisper.load_model(whisper_model_size)
        transcripts = {}
        for file in os.listdir(output_audio_dir):
            if file.endswith(".wav"):
                path = os.path.join(output_audio_dir, file)
                print(f"🔍 Transcribing {file}...")
                result = model.transcribe(path)
                transcripts[file] = result["segments"]
        return transcripts

    def diarize_audio(audio_path, diar_pipeline):
        diarization = diar_pipeline(audio_path, num_speakers=max_speakers)
        return [
            {"start": turn.start, "end": turn.end, "speaker": speaker}
            for turn, _, speaker in diarization.itertracks(yield_label=True)
        ]

    def assign_speakers_to_transcripts(whisper_segs, speaker_segs):
        enriched = []
        for ws in whisper_segs:
            speaker = "Unknown"
            for ss in speaker_segs:
                if ss["start"] <= ws["start"] < ss["end"]:
                    speaker = ss["speaker"]
                    break
            enriched.append({
                "start_time": round(ws["start"], 2),
                "end_time": round(ws["end"], 2),
                "text": ws["text"],
                "speaker": speaker
            })
        return enriched

    def assign_buckets(segments):
        for seg in segments:
            start = int(math.floor(seg["start_time"] / bucket_size)) * bucket_size
            seg["bucket_start"] = start
            seg["bucket_end"] = start + bucket_size
        return segments

    def analyze_sentiment(text):
        score = sia.polarity_scores(text)["compound"]
        return "positive" if score >= 0.05 else "negative" if score <= -0.05 else "neutral"

    def apply_sentiment(segments):
        for seg in segments:
            seg["sentiment"] = analyze_sentiment(seg["text"])
        return segments

    def extract_named_entities(text):
        return [entity["word"] for entity in ner_pipeline(text)]

    def apply_ner(segments):
        for seg in segments:
            seg["named_entities"] = extract_named_entities(seg["text"])
        return segments

    def export_per_video_csvs(nered_transcripts, output_dir=output_csv_dir):
        os.makedirs(output_dir, exist_ok=True)
        for file_name, segments in nered_transcripts.items():
            rows = []
            for segment in segments:
                row = {
                    "start_time": segment["start_time"],
                    "end_time": segment["end_time"],
                    "bucket_start": int(segment["start_time"] // bucket_size) * bucket_size,
                    "bucket_end": (int(segment["start_time"] // bucket_size) + 1) * bucket_size,
                    "text": segment["text"],
                    "sentiment": segment["sentiment"],
                    "named_entities": ", ".join(segment["named_entities"]),
                    "word_count": len(segment["text"].split()),
                    "speaker": segment["speaker"]
                }
                rows.append(row)

            df = pd.DataFrame(rows)
            output_path = os.path.join(output_dir, f"{file_name}.csv")
            df.to_csv(output_path, index=False)
            print(f"✅ Exported: {output_path}")

    # --- Start Pipeline ---
    process_all_videos()
    transcriptions = transcribe_all_audio()

    login(token=huggingface_token)
    diar_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=True)

    sia = SentimentIntensityAnalyzer()
    ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

    all_data = {}

    for fname, whisper_segs in transcriptions.items():
        print(f"🎙 Processing file: {fname}")
        audio_path = os.path.join(output_audio_dir, fname)
        speaker_segs = diarize_audio(audio_path, diar_pipeline)
        enriched = assign_speakers_to_transcripts(whisper_segs, speaker_segs)
        enriched = assign_buckets(enriched)
        enriched = apply_sentiment(enriched)
        enriched = apply_ner(enriched)
        all_data[os.path.splitext(fname)[0]] = enriched
        
    export_per_video_csvs(all_data, output_csv_dir)

In [4]:
process_videos_pipeline()



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
✅ Extracted: Experimenter_CREW_999_1_All_1731617801.mp4 → ./output_audio/Experimenter_CREW_999_1_All_1731617801.wav


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 180MiB/s]
  checkpoint = torch.load(fp, map_location=device)


🔍 Transcribing Experimenter_CREW_999_1_All_1731617801.wav...


config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu121. Bad things might happen unless you revert torch to 1.x.


hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Device set to use cuda:0


🎙 Processing file: Experimenter_CREW_999_1_All_1731617801.wav
Found only 5 clusters. Using a smaller value than 15 for `min_cluster_size` might help.


The detected number of speakers (5) is outside
the given bounds [15, 15]. This can happen if the
given audio file is too short to contain 15 or more speakers.
Try to lower the desired minimal number of speakers.

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✅ Exported: ./csv_outputs/Experimenter_CREW_999_1_All_1731617801.csv
