In [1]:
!pip install openai-whisper keybert nltk vaderSentiment soundfile librosa --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.5/170.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
import zipfile
import os

# Paths
archive_path = '/content/drive/MyDrive/UserLibri/archive3.zip'  # your zip file
extract_path = '/content/drive/MyDrive/UserLibri/archive3_unzipped/'  # folder to unzip into

# Unzip if not already unzipped
if not os.path.exists(extract_path):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Archive extracted to: {extract_path}")
else:
    print(f"Archive already extracted at: {extract_path}")


Archive already extracted at: /content/drive/MyDrive/UserLibri/archive3_unzipped/


In [1]:
import os
import pandas as pd
import whisper
import nltk
from keybert import KeyBERT
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nltk.download('stopwords')
nltk.download('punkt')

# Step 3: Set dataset path
dataset_path = '/content/drive/MyDrive/UserLibri/archive3_unzipped/UserLibri/audio_data/'

# Step 4: Efficiently collect all audio files (.flac, .mp3, .wav)
audio_files = []
for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if file.endswith('.flac') or file.endswith('.mp3') or file.endswith('.wav'):
            audio_files.append(os.path.join(root, file))

print(f"Total audio files found: {len(audio_files)}")
audio_files[:5]  # show first 5 files

# Optional: Test with only first 5 files (remove [:5] to process all 4050)
audio_files = audio_files[:5]

# Step 5: Load Whisper model
model = whisper.load_model("base")  # "base" is fast; "small" or "medium" = more accurate

# Step 6: Initialize NLP tools
kw_model = KeyBERT()
analyzer = SentimentIntensityAnalyzer()

# Step 7: Text cleaning function
filler_words = ['uh', 'um', 'ah', 'erm', 'hmm']

def clean_text(text):
    text = text.lower()
    for filler in filler_words:
        text = text.replace(filler, '')
    text = ' '.join(text.split())
    return text

# Step 8: Process each audio file
all_data = []

for audio_file in audio_files:
    print(f"\nProcessing: {audio_file}")

    # 8a: Transcription
    result = model.transcribe(audio_file)
    transcript = result['text']

    # 8b: Clean / normalize text
    cleaned_text = clean_text(transcript)

    # 8c: Keyword extraction
    keywords = kw_model.extract_keywords(cleaned_text, top_n=5)
    keywords_list = [k[0] for k in keywords]

    # 8d: Sentiment analysis
    sentiment_score = analyzer.polarity_scores(cleaned_text)['compound']
    if sentiment_score >= 0.05:
        sentiment = 'Positive'
    elif sentiment_score <= -0.05:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'

    # 8e: Print output in real-time
    print("Clean Text:", cleaned_text)
    print("Keywords:", keywords_list)
    print("Sentiment:", sentiment)

    # 8f: Save results to list
    all_data.append({
        'Audio_File': os.path.basename(audio_file),
        'Clean_Text': cleaned_text,
        'Keywords': keywords_list,
        'Sentiment': sentiment
    })

# Step 9: Save all results to CSV
df = pd.DataFrame(all_data)
output_path = '/content/drive/MyDrive/UserLibri_Processed.csv'
df.to_csv(output_path, index=False)
print(f"\nProcessing complete! Output saved to: {output_path}")

# Optional: Show first 5 rows
df.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total audio files found: 4050


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Processing: /content/drive/MyDrive/UserLibri/archive3_unzipped/UserLibri/audio_data/test-clean/speaker-1089-book-4217/1089-134686-0000.flac




Clean Text: he hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.
Keywords: ['stew', 'potatoes', 'dinner', 'carrots', 'mutton']
Sentiment: Positive

Processing: /content/drive/MyDrive/UserLibri/archive3_unzipped/UserLibri/audio_data/test-clean/speaker-1089-book-4217/1089-134686-0001.flac




Clean Text: stuffed into you, his belly countled him.
Keywords: ['stuffed', 'belly', 'countled']
Sentiment: Neutral

Processing: /content/drive/MyDrive/UserLibri/archive3_unzipped/UserLibri/audio_data/test-clean/speaker-1089-book-4217/1089-134686-0002.flac




Clean Text: after early nightfall, the yellow lamps would light up here and there, the squalid quarter of the brothels.
Keywords: ['brothels', 'lamps', 'nightfall', 'squalid', 'yellow']
Sentiment: Neutral

Processing: /content/drive/MyDrive/UserLibri/archive3_unzipped/UserLibri/audio_data/test-clean/speaker-1089-book-4217/1089-134686-0003.flac




Clean Text: hello, bertie, any good in your mind?
Keywords: ['bertie', 'hello', 'good', 'mind']
Sentiment: Positive

Processing: /content/drive/MyDrive/UserLibri/archive3_unzipped/UserLibri/audio_data/test-clean/speaker-1089-book-4217/1089-134686-0004.flac




Clean Text: nber 10, fresh nelly is waiting on you. good night, husband.
Keywords: ['nelly', 'nber', 'fresh', 'husband', '10']
Sentiment: Positive

Processing complete! Output saved to: /content/drive/MyDrive/UserLibri_Processed.csv


Unnamed: 0,Audio_File,Clean_Text,Keywords,Sentiment
0,1089-134686-0000.flac,"he hoped there would be stew for dinner, turni...","[stew, potatoes, dinner, carrots, mutton]",Positive
1,1089-134686-0001.flac,"stuffed into you, his belly countled him.","[stuffed, belly, countled]",Neutral
2,1089-134686-0002.flac,"after early nightfall, the yellow lamps would ...","[brothels, lamps, nightfall, squalid, yellow]",Neutral
3,1089-134686-0003.flac,"hello, bertie, any good in your mind?","[bertie, hello, good, mind]",Positive
4,1089-134686-0004.flac,"nber 10, fresh nelly is waiting on you. good n...","[nelly, nber, fresh, husband, 10]",Positive
