In [None]:
!rm -rf ./offensive_word_nlp_model
!rm -rf ./offensive_word_nlp_model_final
!rm -rf ./logs

In [None]:
# Install required libraries
!pip install transformers datasets evaluate scikit-learn torch nltk pandas tqdm

import pandas as pd
import numpy as np
import torch
import nltk
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, Dataset
import evaluate
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import os

# Fix 1: More robust NLTK download
print("Downloading NLTK resources...")
try:
    # Create nltk_data directory if it doesn't exist
    nltk_data_dir = os.path.expanduser('~/nltk_data')
    os.makedirs(nltk_data_dir, exist_ok=True)

    # Download punkt to the specified directory
    nltk.download('punkt', download_dir=nltk_data_dir)

    # Test the tokenizer
    from nltk.tokenize import word_tokenize
    test = word_tokenize("Test sentence.")
    print("NLTK punkt tokenizer successfully loaded.")
except Exception as e:
    print(f"Error with NLTK download: {e}")
    print("Implementing a simple fallback tokenizer...")
    # Simple fallback tokenizer function
    def word_tokenize(text):
        # Remove punctuation that's connected to words
        for punct in ".,!?;:'\")]}-_":
            text = text.replace(punct, ' ')
        for punct in "[({\"-_":
            text = text.replace(punct, ' ')
        # Split by whitespace
        return text.split()

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

print("Setting up NLP-based offensive word detection system...")

# 1. Load and prepare the dataset
print("Loading dataset...")
dataset = load_dataset("hate_speech_offensive")
print(f"Dataset loaded: {dataset}")

# 2. Prepare word-level training data
def create_word_level_dataset():
    """Create a dataset of individual words with offensive/non-offensive labels"""
    print("Creating word-level dataset...")

    # Extract words and their context from the dataset
    all_words = []
    word_contexts = []
    word_labels = []

    # Process each tweet
    for example in tqdm(dataset["train"]):
        tweet = example["tweet"]
        # 0 = hate speech, 1 = offensive language, 2 = neither
        is_offensive = 1 if example["class"] in [0, 1] else 0

        # Tokenize the tweet
        words = word_tokenize(tweet.lower())

        # For each word, create a context window (the word with surrounding words)
        for i, word in enumerate(words):
            if len(word) <= 2 or not word.isalpha():
                continue

            # Get context (up to 3 words before and after)
            start = max(0, i - 3)
            end = min(len(words), i + 4)
            context = " ".join(words[start:end])

            all_words.append(word)
            word_contexts.append(context)
            word_labels.append(is_offensive)

    # Create a DataFrame
    word_df = pd.DataFrame({
        "word": all_words,
        "context": word_contexts,
        "offensive": word_labels
    })

    # Calculate word frequency and offensive ratio
    word_stats = word_df.groupby("word").agg({
        "offensive": ["count", "mean"]
    }).reset_index()
    word_stats.columns = ["word", "count", "offensive_ratio"]

    # Filter for words that appear at least 5 times
    filtered_words = word_stats[word_stats["count"] >= 5]["word"].tolist()
    filtered_df = word_df[word_df["word"].isin(filtered_words)]

    # Create balanced dataset by sampling
    offensive_samples = filtered_df[filtered_df["offensive"] == 1]
    non_offensive_samples = filtered_df[filtered_df["offensive"] == 0]

    # Balance the dataset if needed
    if len(offensive_samples) < len(non_offensive_samples):
        non_offensive_samples = non_offensive_samples.sample(
            n=len(offensive_samples) * 2,
            random_state=42
        )
    else:
        offensive_samples = offensive_samples.sample(
            n=len(non_offensive_samples) // 2,
            random_state=42
        )

    # Combine and shuffle
    balanced_df = pd.concat([offensive_samples, non_offensive_samples])
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"Created word-level dataset with {len(balanced_df)} samples")
    print(f"Offensive samples: {len(balanced_df[balanced_df['offensive'] == 1])}")
    print(f"Non-offensive samples: {len(balanced_df[balanced_df['offensive'] == 0])}")

    return balanced_df

# Create the word-level dataset
word_dataset_df = create_word_level_dataset()

# Split into train and evaluation sets
train_df, eval_df = train_test_split(
    word_dataset_df, test_size=0.2, random_state=42
)

print(f"Training data: {len(train_df)} samples")
print(f"Evaluation data: {len(eval_df)} samples")

# Convert to Datasets format
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# 3. Load a pre-trained model and tokenizer
print("Setting up pre-trained model...")
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Fix 2: Move model to the correct device
model = model.to(device)

# 4. Prepare the data for fine-tuning
def preprocess_function(examples):
    # Use the context for better performance
    return tokenizer(examples["context"], truncation=True, padding="max_length", max_length=64)

# Apply tokenization
print("Tokenizing dataset...")
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

# 5. Define metrics for evaluation
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# 6. Define training arguments
training_args = TrainingArguments(
    output_dir="./offensive_word_nlp_model",
    report_to="none",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
)

# 7. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train.select_columns(["input_ids", "attention_mask", "offensive"]).rename_column("offensive", "labels"),
    eval_dataset=tokenized_eval.select_columns(["input_ids", "attention_mask", "offensive"]).rename_column("offensive", "labels"),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 8. Fine-tune the model
print("Starting NLP model fine-tuning...")
trainer.train()

# 9. Save the model
model_path = "./offensive_word_nlp_model_final"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"NLP model saved to: {model_path}")

# 10. Create a function to detect offensive words using the NLP model - Fixed with device handling
def detect_offensive_words_nlp(text, model, tokenizer, threshold=0.7):
    """
    Detect offensive words in text using the NLP model

    Args:
        text: Text to analyze
        model: Fine-tuned offensive language detection model
        tokenizer: Tokenizer for the model
        threshold: Confidence threshold to consider a word offensive

    Returns:
        List of offensive words and their scores
    """
    # Ensure model is in evaluation mode
    model.eval()

    # Get device that model is on
    model_device = next(model.parameters()).device

    # Tokenize the text
    words = word_tokenize(text.lower())

    # Filter out non-alphabetic tokens and short words
    words = [word for word in words if word.isalpha() and len(word) > 2]

    # Process each word with context
    results = []
    for i, word in enumerate(words):
        # Get context (up to 3 words before and after)
        start = max(0, i - 3)
        end = min(len(words), i + 4)
        context = " ".join(words[start:end])

        # Use model to predict if the word is offensive in this context
        inputs = tokenizer(context, return_tensors="pt")

        # Fix: Move inputs to the same device as the model
        inputs = {k: v.to(model_device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits

        # Get prediction score (0 = not offensive, 1 = offensive)
        scores = torch.softmax(logits, dim=1)[0].tolist()
        offensive_score = scores[1]  # Probability of being offensive

        if offensive_score >= threshold:
            results.append({
                "word": word,
                "context": context,
                "confidence": offensive_score
            })

    return results

# 11. Test the NLP-based detection
print("\nTesting the NLP model with a sample text...")
sample_text = "This damn stuff is terrible and pisses me off. I can't stand this shit."

detected = detect_offensive_words_nlp(sample_text, model, tokenizer)

print(f"Found {len(detected)} potentially offensive words:")
for item in detected:
    print(f"- '{item['word']}' (confidence: {item['confidence']:.2f})")
    print(f"  Context: \"{item['context']}\"")

Downloading NLTK resources...
Error with NLTK download: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

Implementing a simple fallback tokenizer...
Using device: cuda
Setting up NLP-based offensive word detection system...
Loading dataset...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['count', 'hate_speech_count', 'offensive_language_count', 'neither_count', 'class', 'tweet'],
        num_rows: 24783
    })
})
Creating word-level dataset...


  0%|          | 0/24783 [00:00<?, ?it/s]

Created word-level dataset with 55011 samples
Offensive samples: 18337
Non-offensive samples: 36674
Training data: 44008 samples
Evaluation data: 11003 samples
Setting up pre-trained model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing dataset...


Map:   0%|          | 0/44008 [00:00<?, ? examples/s]

Map:   0%|          | 0/11003 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting NLP model fine-tuning...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.315,0.296868,0.872853
2,0.2153,0.304487,0.887758
3,0.1606,0.383529,0.891393


NLP model saved to: ./offensive_word_nlp_model_final

Testing the NLP model with a sample text...
Found 7 potentially offensive words:
- 'stuff' (confidence: 0.88)
  Context: "this damn stuff terrible and pisses"
- 'terrible' (confidence: 0.84)
  Context: "this damn stuff terrible and pisses off"
- 'and' (confidence: 0.79)
  Context: "damn stuff terrible and pisses off can"
- 'can' (confidence: 0.99)
  Context: "and pisses off can stand this shit"
- 'stand' (confidence: 0.99)
  Context: "pisses off can stand this shit"
- 'this' (confidence: 0.99)
  Context: "off can stand this shit"
- 'shit' (confidence: 0.99)
  Context: "can stand this shit"


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def test_offensive_word_detector():
    """Test if the NLP offensive word detector model works as expected"""
    print("Testing the NLP offensive word detector...")

    # 1. Load the saved model and tokenizer
    try:
        model_path = "./offensive_word_nlp_model_final"
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        print("✓ Model and tokenizer loaded successfully")
    except Exception as e:
        print(f"✗ Failed to load model: {e}")
        return False

    # 2. Detect device and move model
    try:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)
        print(f"✓ Model moved to device: {device}")
    except Exception as e:
        print(f"✗ Failed to move model to device: {e}")
        return False

    # 3. Define test function
    def detect_offensive_words(text, threshold=0.7):
        """Test function to detect offensive words"""
        model.eval()
        model_device = next(model.parameters()).device

        # Simple word tokenization
        words = text.lower().split()

        results = []
        for word in words:
            if len(word) <= 2 or not any(c.isalpha() for c in word):
                continue

            # Context is just the word itself for this simple test
            context = word

            # Use model to predict
            inputs = tokenizer(context, return_tensors="pt")
            inputs = {k: v.to(model_device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = model(**inputs).logits

            scores = torch.softmax(logits, dim=1)[0].tolist()
            offensive_score = scores[1]  # Probability of being offensive

            results.append({
                "word": word,
                "confidence": offensive_score,
                "is_offensive": offensive_score >= threshold
            })

        return results

    # 4. Test with various sentences
    test_sentences = [
        "This is a normal sentence without any offensive words.",
        "This damn thing is really annoying me.",
        "I love this fucking bitch and these wonderful flowers.",
        "That stupid idiot made me so angry I could scream.",
    ]

    # 5. Run tests and report results
    print("\n--- Testing with sample sentences ---")
    all_tests_passed = True

    for sentence in test_sentences:
        print(f"\nTesting: \"{sentence}\"")
        try:
            results = detect_offensive_words(sentence)
            print("Results:")
            for result in results:
                status = "OFFENSIVE" if result["is_offensive"] else "clean"
                print(f"- '{result['word']}': {result['confidence']:.2f} ({status})")
            print("✓ Sentence processed successfully")
        except Exception as e:
            print(f"✗ Failed to process sentence: {e}")
            all_tests_passed = False

    # 6. Final status
    if all_tests_passed:
        print("\n✅ All tests completed successfully! The model is working.")
        return True
    else:
        print("\n❌ Some tests failed. Check the errors above.")
        return False

# Run the test
if __name__ == "__main__":
    test_offensive_word_detector()

Testing the NLP offensive word detector...
✓ Model and tokenizer loaded successfully
✓ Model moved to device: cuda

--- Testing with sample sentences ---

Testing: "This is a normal sentence without any offensive words."
Results:
- 'this': 0.34 (clean)
- 'normal': 0.24 (clean)
- 'sentence': 0.32 (clean)
- 'without': 0.34 (clean)
- 'any': 0.35 (clean)
- 'offensive': 0.04 (clean)
- 'words.': 0.35 (clean)
✓ Sentence processed successfully

Testing: "This damn thing is really annoying me."
Results:
- 'this': 0.34 (clean)
- 'damn': 0.89 (OFFENSIVE)
- 'thing': 0.24 (clean)
- 'really': 0.34 (clean)
- 'annoying': 0.32 (clean)
- 'me.': 0.45 (clean)
✓ Sentence processed successfully

Testing: "I love this fucking bitch and these wonderful flowers."
Results:
- 'love': 0.36 (clean)
- 'this': 0.34 (clean)
- 'fucking': 0.99 (OFFENSIVE)
- 'bitch': 1.00 (OFFENSIVE)
- 'and': 0.31 (clean)
- 'these': 0.34 (clean)
- 'wonderful': 0.07 (clean)
- 'flowers.': 0.11 (clean)
✓ Sentence processed successfully

Te

In [None]:
# Part 1: In your notebook environment - Create a ZIP archive of the model
import os
import shutil

def prepare_model_for_download():
    """
    Prepares the offensive word NLP model for download by creating a ZIP archive
    """
    model_path = "./offensive_word_nlp_model_final"
    zip_path = "./offensive_word_model.zip"

    # Check if model exists
    if not os.path.exists(model_path):
        print(f"Error: Model directory '{model_path}' not found. Make sure the model has been trained.")
        return False

    try:
        # Create a ZIP archive of the model directory
        print(f"Creating ZIP archive of model at {zip_path}...")
        shutil.make_archive("offensive_word_model", 'zip', model_path)

        print(f"✅ Model archive created successfully: {zip_path}")
        print("You can now download this ZIP file to your local machine.")

        # If running in Google Colab, display download link
        try:
            from google.colab import files
            print("Download link ready. Click the link below:")
            files.download(zip_path)
        except ImportError:
            print("Not running in Google Colab. Use your platform's file download option to download the ZIP file.")

        return True
    except Exception as e:
        print(f"❌ Error creating ZIP archive: {e}")
        return False

# Execute the function to prepare the model for download
prepare_model_for_download()

Creating ZIP archive of model at ./offensive_word_model.zip...
✅ Model archive created successfully: ./offensive_word_model.zip
You can now download this ZIP file to your local machine.
Download link ready. Click the link below:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

True

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import word_tokenize
import nltk
import re
import os

nltk.download('punkt_tab')

# Ensure NLTK punkt tokenizer data is downloaded
try:
    nltk.download('punkt', quiet=False)  # Set quiet=False to see detailed output
    # Verify punkt data location
    punkt_path = nltk.data.find('tokenizers/punkt')
    print(f"Punkt data location: {punkt_path}")

    # Check for the english.pickle file
    punkt_tab_path = os.path.join(punkt_path, "english.pickle")
    if os.path.exists(punkt_tab_path):
        print("punkt_tab file found!")
    else:
        print("punkt_tab file not found! Manually downloading and installing...")
        # Manually download punkt data
        nltk.download('punkt', download_dir='/root/nltk_data')
        print("punkt data downloaded and installed successfully!")
except Exception as e:
    print(f"Error downloading or verifying NLTK punkt data: {e}")
    raise

# Manually load the punkt tokenizer
try:
    from nltk.tokenize import PunktSentenceTokenizer
    punkt_tokenizer = PunktSentenceTokenizer()
    print("Punkt tokenizer loaded successfully!")
except Exception as e:
    print(f"Error loading punkt tokenizer: {e}")
    raise

class WhisperTranscriptProcessor:
    def __init__(self, model_path, threshold=0.7):
        """
        Initialize the transcript processor with the offensive word detection model

        Args:
            model_path: Path to the trained offensive word detection model
            threshold: Confidence threshold for offensive word detection
        """
        # Use NLTK's word_tokenize for tokenization
        self.word_tokenize = word_tokenize

        # Load model and tokenizer
        self.threshold = threshold
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)

        # Move model to GPU if available
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.model.eval()  # Set to evaluation mode
        print(f"Model loaded successfully and moved to {self.device}")

    def parse_whisper_transcript(self, transcript_file):
        """
        Parse the Whisper transcript file to extract segments with timestamps

        Args:
            transcript_file: Path to the Whisper transcript file

        Returns:
            List of segments with text and timestamps
        """
        segments = []

        # Check the file format (JSON or text)
        if transcript_file.endswith('.json'):
            # Parse JSON format
            with open(transcript_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle different Whisper JSON formats
            if 'segments' in data:
                # Standard Whisper JSON format
                for segment in data['segments']:
                    segments.append({
                        'text': segment['text'].strip(),
                        'start': segment['start'],
                        'end': segment['end']
                    })
            else:
                # Simple format with just timestamps and text
                for segment in data:
                    if 'text' in segment and 'start' in segment and 'end' in segment:
                        segments.append({
                            'text': segment['text'].strip(),
                            'start': segment['start'],
                            'end': segment['end']
                        })
        else:
            # Parse text format (assuming HH:MM:SS.mmm --> HH:MM:SS.mmm format)
            pattern = r'(\d+:\d+:\d+\.\d+) --> (\d+:\d+:\d+\.\d+)\s*\n(.*?)(?:\n\n|\Z)'

            with open(transcript_file, 'r', encoding='utf-8') as f:
                content = f.read()

            matches = re.findall(pattern, content, re.DOTALL)

            for match in matches:
                start_time, end_time, text = match

                # Convert timestamp to seconds
                def time_to_seconds(time_str):
                    h, m, s = time_str.split(':')
                    return float(h) * 3600 + float(m) * 60 + float(s)

                segments.append({
                    'text': text.strip(),
                    'start': time_to_seconds(start_time),
                    'end': time_to_seconds(end_time)
                })

        return segments

    def detect_offensive_words_in_segment(self, segment):
        """
        Detect offensive words in a segment of text

        Args:
            segment: Dictionary containing text and timestamps

        Returns:
            List of offensive words with their timestamps
        """
        text = segment['text']
        start_time = segment['start']
        end_time = segment['end']
        segment_duration = end_time - start_time

        # Tokenize the text into words
        words = self.word_tokenize(text.lower())

        # Get original words with case preserved
        original_words = self.word_tokenize(text)

        # Process each word
        offensive_words = []

        for i, (word_lower, word_original) in enumerate(zip(words, original_words)):
            if len(word_lower) <= 2 or not any(c.isalpha() for c in word_lower):
                continue

            # Get context (up to 3 words before and after)
            start_idx = max(0, i - 3)
            end_idx = min(len(words), i + 4)
            context = " ".join(words[start_idx:end_idx])

            # Use model to predict if the word is offensive
            inputs = self.tokenizer(context, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = self.model(**inputs).logits

            # Get prediction score
            scores = torch.softmax(logits, dim=1)[0].tolist()
            offensive_score = scores[1]  # Probability of being offensive

            if offensive_score >= self.threshold:
                # Estimate timestamp for this word
                # Simple approach: assume words are evenly distributed in segment
                word_position = i / len(words)
                word_time = start_time + (segment_duration * word_position)

                offensive_words.append({
                    'word': word_original,
                    'confidence': offensive_score,
                    'timestamp': word_time,
                    'segment_start': start_time,
                    'segment_end': end_time,
                    'context': context
                })

        return offensive_words

    def process_transcript(self, transcript_file, output_file=None):
        """
        Process a Whisper transcript file to detect offensive words

        Args:
            transcript_file: Path to the Whisper transcript file
            output_file: Path to save the output JSON file (optional)

        Returns:
            Dictionary with detected offensive words and their timestamps
        """
        # Parse the transcript
        segments = self.parse_whisper_transcript(transcript_file)
        print(f"Parsed {len(segments)} segments from transcript")

        # Process each segment
        all_offensive_words = []

        for i, segment in enumerate(segments):
            print(f"Processing segment {i+1}/{len(segments)}")
            offensive_words = self.detect_offensive_words_in_segment(segment)
            all_offensive_words.extend(offensive_words)

        # Create result dictionary
        result = {
            'total_offensive_words': len(all_offensive_words),
            'offensive_words': all_offensive_words
        }

        # Save to file if output_file is provided
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2)
            print(f"Saved results to {output_file}")

        return result

# Part 2: Install required packages (Run this cell first)
# Only run these if they're not already installed
!pip install transformers torch nltk

# Part 3: Colab-specific implementation for processing Whisper transcripts
from google.colab import files
import os

def run_transcript_processing():
    """
    Main function to run the Whisper transcript processing in Colab
    """
    # Specify the paths
    transcript_file = "/content/transcription.json"  # Path to the Whisper transcript file
    model_path = "/content/offensive_word_nlp_model_final"  # Path to the offensive word detection model
    output_file = "/content/offensive_words.json"  # Path to save the output JSON file

    # Initialize the processor and process the transcript
    try:
        processor = WhisperTranscriptProcessor(model_path, threshold=0.7)

        print(f"\nProcessing transcript: {transcript_file}")
        result = processor.process_transcript(transcript_file, output_file)

        # Print summary
        print(f"\nFound {result['total_offensive_words']} offensive words in the transcript")

        # Print first few examples
        if result['offensive_words']:
            print("\nExamples of detected offensive words:")
            for i, word_info in enumerate(result['offensive_words'][:5]):  # Show first 5 examples
                print(f"{i+1}. '{word_info['word']}' at {word_info['timestamp']:.2f}s (confidence: {word_info['confidence']:.2f})")
                print(f"   Context: \"{word_info['context']}\"")

        # Download the output file
        print("\nStep 3: Download the results")
        files.download(output_file)

    except Exception as e:
        print(f"Error processing transcript: {str(e)}")

# Part 4: Run the processing function (Run this cell after the above cells)
# Execute this function to start the process
if __name__ == "__main__":
    run_transcript_processing()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Punkt data location: /root/nltk_data/tokenizers/punkt
punkt_tab file found!
Punkt tokenizer loaded successfully!
Model loaded successfully and moved to cuda

Processing transcript: /content/transcription.json
Parsed 9 segments from transcript
Processing segment 1/9
Processing segment 2/9
Processing segment 3/9
Processing segment 4/9
Processing segment 5/9
Processing segment 6/9
Processing segment 7/9
Processing segment 8/9
Processing segment 9/9
Saved results to /content/offensive_words.json

Found 70 offensive words in the transcript

Examples of detected offensive words:
1. 'all' at 0.46s (confidence: 0.99)
   Context: "you 're all you raggedy ass"
2. 'you' at 0.69s (confidence: 1.00)
   Context: "you 're all you raggedy ass niggas"
3. 'raggedy' at 0.92s (confidence: 1.00)
   Context: "'re all you raggedy ass niggas and"
4. 'ass' at 1.15s (confidence: 1.00)
   Context: "all you raggedy ass niggas and man-hows"
5. 'niggas' at 1.38s (confidence: 1.00)
   Context: "you raggedy ass nigga

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>