In [1]:
# Smoke test: confirm the kernel executes code.
print('Hello World')

Hello World


In [2]:
# Show which `python` and `pip` executables the notebook's shell resolves to.
# NOTE(review): this reflects the shell's PATH; presumably it matches the kernel's interpreter — confirm.
!which python
!which pip

/home/crux/anaconda3/envs/audacle/bin/python
/home/crux/anaconda3/envs/audacle/bin/pip


In [3]:
import torch
# Report the installed PyTorch version and whether CUDA is usable from this environment.
print(torch.__version__)
print(torch.cuda.is_available())

1.13.1
True


### Helper functions and constants

In [9]:
# Default sample rate (presumably in Hz) that `load_audio` resamples audio to.
SR = 16000

In [10]:
import numpy as np
import ffmpeg

def load_audio(file: str, sr: int = SR):
    """
    Decode an audio file into a mono float32 waveform at the requested sample rate.

    Parameters
    ----------
    file: str
        Path of the audio file to decode.

    sr: int
        Sample rate handed to ffmpeg as the output rate (``ar``).

    Returns
    -------
    A 1-D NumPy float32 array holding the decoded 16-bit samples divided by 32768.0.

    Raises
    ------
    RuntimeError
        If the ffmpeg run fails; the message carries ffmpeg's decoded stderr.
    """
    # Decoding, down-mixing to one channel (ac=1) and resampling (ar=sr) are all
    # delegated to an ffmpeg subprocess that writes raw signed 16-bit PCM to stdout.
    # Requires the ffmpeg CLI and the `ffmpeg-python` package to be installed.
    stream = ffmpeg.input(file, threads=0)
    stream = stream.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)

    try:
        raw_pcm, _ = stream.run(
            cmd=["ffmpeg", "-nostdin"],
            capture_stdout=True,
            capture_stderr=True,
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    samples = np.frombuffer(raw_pcm, np.int16).flatten()
    # int16 spans [-32768, 32767], so dividing by 32768 scales samples into [-1.0, 1.0).
    return samples.astype(np.float32) / 32768.0

### Splitting a script into chunks small enough to be sent to ChatGPT

In [None]:
from transformers import GPT2Tokenizer
# Load the pretrained "gpt2" tokenizer; later cells use it to count tokens per chunk.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Token ids only, for the plain string.
print(tokenizer("Hello world")['input_ids'])
# Full tokenizer output for the same text with a leading space
# (presumably to compare how the leading space affects tokenization — confirm).
print(tokenizer(" Hello world"))

In [None]:
import os
# Script to be split up, read in full as text.
file = '../data/jotun 02-05-2023/script.txt'

with open(file, 'r') as f:
    text = f.read()

# A blank line (two consecutive newlines) marks a chunk boundary.
chunks = text.split('\n\n')

# Token count of each chunk according to `tokenizer`, in the same order as `chunks`.
tsizes = [len(encoding['input_ids']) for encoding in map(tokenizer, chunks)]

In [None]:
# Greedily pack consecutive chunks into posts whose summed token count stays
# within max_size. Each post is a list of chunks, kept in original order.
posts = []
bag = []           # chunks collected for the post currently being built
total_tokens = 0   # summed token count of the chunks in `bag`
max_size = 3600    # token budget per post
# NOTE(review): sizes are counted per chunk; separators added when chunks are
# joined later are not included — presumably max_size leaves headroom for that.
for chunk, tsize in zip(chunks, tsizes):
    # Flush before adding a chunk that would overflow the budget. The `bag`
    # check prevents emitting an empty post when a single chunk on its own
    # exceeds max_size (that chunk still gets a post to itself, over budget).
    if bag and total_tokens + tsize > max_size:
        posts.append(bag)
        bag = []
        total_tokens = 0
    total_tokens += tsize
    bag.append(chunk)

# Flush the final, partially filled post. This used to be a `for ... else`
# clause, which runs whenever the loop ends without `break` (always, here),
# so a plain statement states the intent directly. Skip it if nothing is left.
if bag:
    posts.append(bag)

In [None]:
# Number of posts produced by the packing step.
len(posts)

In [None]:
# Print how many chunks each post holds and rebuild its text with blank-line separators.
for i, post in enumerate(posts):
    print(len(post))
    post_text = '\n\n'.join(post)

    # Writing each post to disk is currently disabled, so post_text is otherwise unused.
    # with open(f"./posts/post_{i}.txt", 'w') as f:
    #     f.write(post_text)

### Caching models with Hugging Face Transformers

In [1]:
import transformers
import os

# Local directory under which pipelines are saved.
pipelines_dir = './pretrained_pipelines'
# Model identifier handed to transformers.pipeline (presumably a Hugging Face Hub repo id).
model_repo="openai/whisper-medium.en"


# Build the pipeline from the model identifier alone; no task is passed here.
whisper_asr = transformers.pipeline(model=model_repo)


In [2]:

# Save the pipeline under pipelines_dir/<model_repo>; model_repo contains a "/",
# so the joined path has an extra directory level.
savedir = os.path.join(pipelines_dir, model_repo)
whisper_asr.save_pretrained(savedir)
# Drop the in-memory pipeline (presumably so the next cell proves the saved copy loads on its own).
del whisper_asr


In [3]:
# Rebuild the pipeline from the saved directory, this time naming the task explicitly.
task = 'automatic-speech-recognition'
whisper_asr = transformers.pipeline(model=savedir, task=task)

In [4]:
# Transcribe by passing the audio file path straight to the pipeline.
whisper_asr.predict('only_speech.wav')



{'text': ' We all somehow lived Yeah is a pretty pretty amazing thing We got very lucky Apparently and I guess unintentionally very well-prepared'}

In [11]:
# Decode the same file into a waveform with load_audio, using its default sample rate.
wav = load_audio('only_speech.wav')

In [12]:
# Transcribe the in-memory waveform instead of a file path.
whisper_asr.predict(wav)



{'text': ' We all somehow lived Yeah is a pretty pretty amazing thing We got very lucky Apparently and I guess unintentionally very well-prepared'}