<a href="https://colab.research.google.com/github/Nathan-Roll1/PSST/blob/main/Examples/Transcription_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PSST - Prosodic Speech Segmentation with Transformers

Install the required packages

In [None]:
!pip install transformers

Import libraries

In [None]:
import librosa
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

Define initialization function

In [None]:
def init_model_processor(gpu=False, model_name="NathanRoll/psst-medium-en"):
  """Initializes the model and processor with the pre-trained weights.

  Args:
    gpu: Whether to move the model to the first CUDA device ("cuda:0").
      Defaults to False, which leaves the model where `from_pretrained`
      placed it.
    model_name: Checkpoint identifier passed to `from_pretrained` for both
      the processor and the model. Defaults to "NathanRoll/psst-medium-en".

  Returns:
    model (AutoModelForSpeechSeq2Seq): A model with the pre-trained weights.
    processor (AutoProcessor): Processes audio data.
  """
  # Processor and model are loaded from the same checkpoint so they stay in sync
  processor = AutoProcessor.from_pretrained(model_name)
  model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)

  if gpu:
    # Move the already-loaded model to the gpu
    model = model.to("cuda:0")

  return model, processor

Define generation function

In [None]:
def generate_transcription(audio, gpu=False, max_length=250, sampling_rate=16000):
  """Generate a transcription from audio using a pre-trained model

  Relies on the notebook-level globals `model` and `processor` (as returned by
  `init_model_processor`); both must be defined before this function is called.

  Args:
    audio: The audio to be transcribed, sampled at `sampling_rate` Hz.
    gpu: Kept for backward compatibility. The input features are always moved
      to the device the model lives on, so this flag can no longer disagree
      with how the model was initialized. Defaults to False.
    max_length: Value forwarded to `model.generate` as `max_length`.
      Defaults to 250.
    sampling_rate: Sampling rate of `audio` in Hz, forwarded to the processor.
      Defaults to 16000.

  Returns:
    transcription: The decoded text of the first sequence, with every
      '!!!!!' marker replaced by '<|IU_Boundary|>'.
  """
  # Preprocess audio and return PyTorch tensors
  inputs = processor(audio, return_tensors="pt", sampling_rate=sampling_rate)

  # Follow the model's device instead of trusting the flag: a mismatch between
  # where the model lives and where the features live makes generate() fail
  input_features = inputs.input_features.to(model.device)

  # Generate transcribed ids
  generated_ids = model.generate(inputs=input_features, max_length=max_length)

  # Decode generated ids; only the first (and here only) sequence is used
  decoded = processor.batch_decode(
      generated_ids, skip_special_tokens=True, output_word_offsets=True)

  # Replace the placeholder the model emits with a readable boundary token
  transcription = decoded[0].replace('!!!!!', '<|IU_Boundary|>')

  return transcription

Load and resample audio file

In [None]:
# generate_transcription tells the processor the audio is sampled at 16 kHz, so
# load it at that rate directly. Without `sr`, librosa.load first resamples to
# its 22,050 Hz default and a second resample down to 16 kHz would be needed.
audio, sr = librosa.load('gettysburg.wav', sr=16000)

Initialize model and processor

In [None]:
# Use the GPU only when one is actually available so the notebook also runs
# on CPU-only runtimes instead of failing while moving the model to "cuda:0"
USE_GPU = torch.cuda.is_available()
model, processor = init_model_processor(gpu=USE_GPU)

Generate Transcription

In [None]:
# Derive the gpu flag from where the model actually lives so the two can never disagree
transcript = generate_transcription(audio, gpu=model.device.type == "cuda")
transcript

'Four score and seven years ago <|IU_Boundary|> our fathers brought forth on this continent <|IU_Boundary|> a new nation <|IU_Boundary|> conceived in liberty <|IU_Boundary|> and dedicated to the proposition <|IU_Boundary|> that all men are created equal <|IU_Boundary|> Now we are engaged in a great civil war <|IU_Boundary|> testing whether that nation <|IU_Boundary|> or any nation so conceived and so dedicated <|IU_Boundary|> can long endure'