In [2]:
import torch
import librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import soundfile as sf

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "distil-whisper/distil-large-v3"

# Load the model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="sdpa"
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Create the ASR pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)

# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010834_2024.09.14_08.06.27-2024.09.14_08.07.27.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append(chunk)

# Transcribe each chunk
transcriptions = []
for i, chunk in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=False)
    transcriptions.append(result["text"])

# Combine transcriptions
full_transcription = " ".join(transcriptions)
print(full_transcription)


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 on this former industrial site in Whitehaven, Cumbria, planned to produce 60 million tonnes of new coal. But since it was approved, a Supreme Court ruling found projects  consider carbon emissions from burning fossil fuels, not just digging them up. The mine was the first legal test of that. It's been a really important victory.  But what frustrates me most is that the years' decision makers have been putting all their efforts and fighting for a coal mine in Whitehaven. They could have spent that time investing in green jobs.  But it was mining jobs the community expected. This site has sat empty for 20 years. It's devastating for the community. The jobs that were going to be created were  possibly in the construction and supply chain surrounding it were probably around 2000 and the long-term well-paid jobs for running the facility  in excess of 500. The new government has been clear about its net zero ambitions, but judgments like this one could test them. We're still reliance on  Th



# Correct One

In [1]:
import torch
import librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import soundfile as sf

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "distil-whisper/distil-large-v3"

# Load the model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="sdpa"
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Create the ASR pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)

# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010834_2024.09.14_08.06.27-2024.09.14_08.07.27.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time, end_time) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time, end_time))

# Print the transcriptions with timestamps
for text, start_time, end_time in transcriptions:
    print(f"[{start_time:.2f}s - {end_time:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[0.00s - 10.00s]  on this former industrial site in Whitehaven, Cumbria, planned to produce 60 million tonnes of new coal. But since it was approved, a Supreme Court ruling found projects
[10.00s - 20.00s]  consider carbon emissions from burning fossil fuels, not just digging them up. The mine was the first legal test of that. It's been a really important victory.
[20.00s - 30.00s]  But what frustrates me most is that the years local decision makers have been putting all their efforts and fighting for a coal mine in Whitehaven. They could have spent that time investing in green jobs.
[30.00s - 40.00s]  But it was mining jobs the community expected. This site has sat empty for 20 years. It's devastating for the community. The jobs that were going to be created were
[40.00s - 50.00s]  Possibly in the construction and supply chain surrounding it were probably around 2000. And the long-term, well-paid jobs for running the facility work.
[50.00s - 60.00s]  in excess of 500. The new governme



In [3]:
import time

In [4]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/clip1.mp3"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")

  waveform, sample_rate = librosa.load(audio_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[0.00s - 10.00s]  And as I bring you all the latest from the world of sports. Take a look. And all-run performance by India.
[10.00s - 20.00s]  beat Australia by 36 runs in the second semi-final of the ICC Women's World Cup on Thursday at Derby.
[20.00s - 30.00s]  Hermann Preet Kord produced one of the greatest ever ODIR Knox in women's cricket as India stormed into
[30.00s - 40.00s]  the final. Herman Preet's third ODIS century was embellished with 24s and as many as 7-6s.
[40.00s - 50.00s]  It is the second time that India has entered the summit clash of the global event having lost to Australia in the final of the 2020.
[50.00s - 60.00s]  2005 edition. It was also the second highest individual score in ODIs for India.
[60.00s - 70.00s]  behind Dik Tisharmas and beat in 188 runs against South Africa earlier this year. With enough runs on the board, confident Indian seamer
[70.00s - 78.30s]  Goswami and Shikha Pande came up with brilliant opening spells.

Full Transcription:
 And as I



In [5]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/clip2.mp3"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")

  waveform, sample_rate = librosa.load(audio_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.


[0.00s - 10.00s]  Moving now to some tragic news coming in from Jammu and Kashmir. Well, right ahead of the Prime Minister's visit a major encounter in Jambun Kishmir's Kishthwar. In fact, tragically, two soldiers.
[10.00s - 20.00s]  have died in the line of duty. In fact, several other soldiers have been injured as well. There was a brief gunfight is what we can report at this moment.
[20.00s - 30.00s]  in Jammu Kishmir's Kistvar and two soldiers have died. Of course, these are soldiers leading from the front and they've been killed in the line of action,
[30.00s - 40.00s]  tragic there and this comes after there was a major arms hall as well in parts of Jammu in Kashmir and the timing of the encounter is also extremely important given the fact
[40.00s - 50.00s]  that this comes right ahead of the assembly polls. The assembly pole just a week away, less than a week away perhaps. The first phase will be held on the 18th
[50.00s - 60.00s]  of September, so extremely concerning there and

In [6]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/clip3.mp3"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")

  waveform, sample_rate = librosa.load(audio_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.


[0.00s - 10.00s]  When I was shocked to go to the Howe it would be on the without a heart of I won't championship in a medal
[10.00s - 20.00s]  I didn't know that Periarchic
[20.00s - 30.00s]  what was what was. A trial was Pera then I got that the Paira are different and the Able's different.
[30.00s - 40.00s]  I had to know that the perjury also, I thought I'd say I'd say I'd say all right. And I'm not even though
[40.00s - 50.00s]  you can't do you can't you can't do you do you're doing so I'm doing then I'm doing a little up
[50.00s - 60.00s]  I had to do it easy to do it. Strength, 6, 7 hours we practiced. Then I was I was in the goal, then I went to
[60.00s - 70.00s]  I was one-being. When I was in the I was there I would be I would be I could do something that I gave one thing
[70.00s - 74.81s]  Thank you.

Full Transcription:
 When I was shocked to go to the Howe it would be on the without a heart of I won't championship in a medal  I didn't know that Periarchic  what was what 



In [8]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/clip4.mp3"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")

  waveform, sample_rate = librosa.load(audio_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
[src/libmpg123/parse.c:skip_junk():1276] error: Giving up searching valid MPEG header after 65536 bytes of junk.


[0.00s - 10.00s]  We are struggling with some issues. There's a big debate in this country, in our country, about manufacturing. Now the fact is, I mean there are people who would
[10.00s - 20.00s]  why we're importing so much from China? Part of it is because we neglected manufacturing through the 1960s, 70s, 80s and 90s.
[20.00s - 30.00s]  maybe even the first decade of 2000. When you, I mean, think back, I mean, all of you are as connected and have as good a memory about our country as I
[30.00s - 40.00s]  when did we actually have governments who made a major push on manufacturing and let the people who today come and say no we need to find a fix
[40.00s - 50.00s]  as though it's something you can do instead in mystery. In fact, and rather, other people who actually also say that we are incapable of it. We should not even attempt it.
[50.00s - 60.00s]  So, now ask yourself, can you actually be a major power in the world without manufacturing?
[60.00s - 70.00s]  because a major powe



In [9]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010834_2024.09.14_08.06.27-2024.09.14_08.07.27.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")



[0.00s - 10.00s]  on this former industrial site in Whitehaven, Cumbria, planned to produce 60 million tonnes of new coal. But since it was approved, a Supreme Court ruling found projects
[10.00s - 20.00s]  consider carbon emissions from burning fossil fuels, not just digging them up. The mine was the first legal test of that. It's been a really important victory.
[20.00s - 30.00s]  But what frustrates me most is that the years local decision makers have been putting all their efforts and fighting for a coal mine in Whitehaven. They could have spent that time investing in green jobs.
[30.00s - 40.00s]  But it was mining jobs the community expected. This site has sat empty for 20 years. It's devastating for the community. The jobs that were going to be created were
[40.00s - 50.00s]  Possibly in the construction and supply chain surrounding it were probably around 2000. And the long-term, well-paid jobs for running the facility work.
[50.00s - 60.00s]  in excess of 500. The new governme



In [10]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010834_2024.09.14_08.05.27-2024.09.14_08.06.27.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")



[0.00s - 10.00s]  content we flagged was taken down, but it's still possible to find more. A century old ideology of hate, pushed by cutting-edge algorithms,
[10.00s - 20.00s]  to a massive, modern audience. Tom Cheshire, Sky News. The UK's High Court has reversed the decision to approve
[20.00s - 30.00s]  the UK's first new coal mine in 30 years. It follows the confirmation this week that nearly 3,000 jobs will be lost at the Port Talbot Steelworks and potentially
[30.00s - 40.00s]  potentially 400 at Scotland's only oil refinery Grangemouth. Our science and technology editor Tom Clark reports now on the UK's changing
[40.00s - 50.00s]  credentials. Leave it in the ground. Leave it in the ground. Their case argued UK coal has no future on a rapidly warming
[50.00s - 60.00s]  We have won. The High Court pretty much agreed. It's a huge win. And this, the proposed coal mine that lost. The Woodhouse Colliery
[60.00s - 60.01s]  Thank you.

Full Transcription:
 content we flagged was taken 



In [11]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010834_2024.09.14_08.04.27-2024.09.14_08.05.27.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")



[0.00s - 10.00s]  content to people who don't really understand but think it's cool or funny. However, the impact on the victim is the same, which is the kind of experience of hate of minority
[10.00s - 20.00s]  communities. This is one of the musicians whose songs have been bolted on to Nazi content without their knowledge.
[20.00s - 30.00s]  Algois told us I was not previously aware that my music was being used in this way and I find it shocking and deplorable.
[30.00s - 40.00s]  Sky News previously reported about Islamic state supporters using the same sounds loophole to gain more traction on TikTok. We forwarded all the Nazi videos
[40.00s - 50.00s]  we found this time to TikTok and asked the company for comment. A spokesperson told us, this content was immediately removed for breaching our strict policies against hate speech.
[50.00s - 60.00s]  train our safety professionals and update our safeguards to detect hateful behaviour on an ongoing basis, and we remove 91% of this type o



In [12]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010834_2024.09.14_08.04.27-2024.09.14_08.05.27.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")



[0.00s - 10.00s]  content to people who don't really understand but think it's cool or funny. However, the impact on the victim is the same, which is the kind of experience of hate of minority
[10.00s - 20.00s]  communities. This is one of the musicians whose songs have been bolted on to Nazi content without their knowledge.
[20.00s - 30.00s]  Algois told us I was not previously aware that my music was being used in this way and I find it shocking and deplorable.
[30.00s - 40.00s]  Sky News previously reported about Islamic state supporters using the same sounds loophole to gain more traction on TikTok. We forwarded all the Nazi videos
[40.00s - 50.00s]  we found this time to TikTok and asked the company for comment. A spokesperson told us, this content was immediately removed for breaching our strict policies against hate speech.
[50.00s - 60.00s]  train our safety professionals and update our safeguards to detect hateful behaviour on an ongoing basis, and we remove 91% of this type o



In [14]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010834_2024.09.14_08.02.27-2024.09.14_08.03.27.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")



[0.00s - 10.00s]  Nazi speeches and marching music have been used as background sound on tens of thousands of TikTok videos as far-right groups try and spread their
[10.00s - 20.00s]  message. To appeal to a wider audience, most of the speeches are set to a type of music popular on TikTok called DriftFon.
[20.00s - 30.00s]  without the creator's permission or knowledge. And that could be all sorts, cat videos, gym posts, gaming or cars here. There's a few of the most popular categories we have seen
[30.00s - 40.00s]  It's a way to get content shared widely before offering the user more sinister stuff if they hit the sound button in the corner of a post, which shows them other videos using the same sound.
[40.00s - 50.00s]  For example, this is a more innocuous video of a cat that looks like Hitler. We'll put that back into the stack, and this is a huge stack here. Have a look at another type we've seen gaming. This video was made
[50.00s - 60.00s]  using Minecraft, the German dictator,



In [16]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010259_2024.09.14_08.04.28-2024.09.14_08.05.28.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")



[0.00s - 10.00s]  assembly seats and in second and third phase primates would be campaigning in Riyasi district in Katra Vashemata
[10.00s - 20.00s]  the reconnaissance, Qutwa, Jammu. So, an extensive campaigning. Hormrishah has already addressed frilly's here. He is coming here again in next few days.
[20.00s - 30.00s]  So, other senior leaders, BJP President, JAPNEDA, Defense Minister Rajna Singh recently addressed Riley in Ramban and Banha as well.
[30.00s - 40.00s]  So, top leaders, top guns of the BJP are campaigning in Jhman, Kashmir as the first phase of polling is approaching to just four days away.
[40.00s - 50.00s]  Now, Zheer, stay on with us. There's some more news coming in from Jammu and Kashmir. Like you had mentioned, there was an encounter that took place in Jambun Kishmi right ahead of the Prime Minister's visit three.
[50.00s - 60.00s]  separate encounters in fact in Kishwar two soldiers have unfortunately died two terrorists were killed in Katwa in the encounter and



In [17]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010834_2024.09.14_08.03.27-2024.09.14_08.04.27.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")



[0.00s - 10.00s]  popular with children and the owner of Minecraft, Microsoft told us that hate speech and terrorist content is strictly forbidden and they take action to remove such content. But on TikTok, there are posts
[10.00s - 20.00s]  that are just two graphic to show, specifically anti-Semitic. We've blurred this one here, which shows images from gas chambers, set to the same type of audio. And there's much more of this graphic
[20.00s - 30.00s]  type of contact. In fact, Sky News has seen 72,000 posts used in this way. Not only is that number big, but the level of engagement is high too.
[30.00s - 40.00s]  Between them, these posts have racked up 21 million likes, showing people are engaging with the videos. Well, how are they engaging? This is a good example, an image of a Nuremberg rally.
[40.00s - 50.00s]  accompanied by a hitler speech, it's been liked by more than 56,000 users. And in a comment, there's been liked 1,695 times, one user states, modern 20s,
[50.00s - 60.00s



In [18]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010259_2024.09.14_08.04.28-2024.09.14_08.05.28.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")



[0.00s - 10.00s]  assembly seats and in second and third phase primates would be campaigning in Riyasi district in Katra Vashemata
[10.00s - 20.00s]  the reconnaissance, Qutwa, Jammu. So, an extensive campaigning. Hormrishah has already addressed frilly's here. He is coming here again in next few days.
[20.00s - 30.00s]  So, other senior leaders, BJP President, JAPNEDA, Defense Minister Rajna Singh recently addressed Riley in Ramban and Banha as well.
[30.00s - 40.00s]  So, top leaders, top guns of the BJP are campaigning in Jhman, Kashmir as the first phase of polling is approaching to just four days away.
[40.00s - 50.00s]  Now, Zheer, stay on with us. There's some more news coming in from Jammu and Kashmir. Like you had mentioned, there was an encounter that took place in Jambun Kishmi right ahead of the Prime Minister's visit three.
[50.00s - 60.00s]  separate encounters in fact in Kishwar two soldiers have unfortunately died two terrorists were killed in Katwa in the encounter and



In [19]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010259_2024.09.14_08.03.28-2024.09.14_08.04.28.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")



[0.00s - 10.00s]  voters and galvanize them, get win seats for the BJP. Doda-Kistewar area is a very significant
[10.00s - 20.00s]  important region for the BJP when it comes to increasing the number of seats in Jammu province. So besides addressing this rally in Duda,
[20.00s - 30.00s]  and Kistvar and Ramban are part of it because it was essentially one district in until 2008 and then they
[30.00s - 40.00s]  were two more districts where carved out of 12 Doda districts. So it has eight assembly segments and BJP is fighting from all the eight, even as
[40.00s - 50.00s]  these are Muslim majority region, but some of the, during the delimitation, some of the conestrances have been carved out which have become
[50.00s - 60.00s]  the Hindu majority seats of BJP is hoping to win big in from this this election from Doda, Kishdwa-Ram and district which has 8
[60.00s - 60.03s]  Thank you.

Full Transcription:
 voters and galvanize them, get win seats for the BJP. Doda-Kistewar area is a very 



In [20]:
# Load the audio file using librosa
audio_file = "/home/vmadmin/myenv/English/1010259_2024.09.14_08.02.28-2024.09.14_08.03.28.wav"
waveform, sample_rate = librosa.load(audio_file, sr=None)

# Define the chunk duration in seconds
chunk_duration = 10
num_samples_per_chunk = chunk_duration * sample_rate

# Start timing
start_time = time.time()

# Split the audio into chunks
chunks = []
for start in range(0, len(waveform), num_samples_per_chunk):
    end = min(start + num_samples_per_chunk, len(waveform))
    chunk = waveform[start:end]
    chunks.append((chunk, start / sample_rate, end / sample_rate))  # Include timestamps

# Transcribe each chunk
transcriptions = []
for i, (chunk, start_time_chunk, end_time_chunk) in enumerate(chunks):
    # Save the chunk to a temporary file
    chunk_file = f"temp_chunk_{i}.wav"
    sf.write(chunk_file, chunk, sample_rate)

    # Run the ASR pipeline on the chunk
    result = pipe(chunk_file, batch_size=8, return_timestamps=True)  # Set to True for timestamps
    transcriptions.append((result["text"], start_time_chunk, end_time_chunk))

# Stop timing
end_time = time.time()
total_time = end_time - start_time

# Print the transcriptions with timestamps
for text, start_time_chunk, end_time_chunk in transcriptions:
    print(f"[{start_time_chunk:.2f}s - {end_time_chunk:.2f}s] {text}")

# Optional: Combine transcriptions if needed
full_transcription = " ".join(text for text, _, _ in transcriptions)
print("\nFull Transcription:")
print(full_transcription)

# Print total processed time
print(f"\nTotal Processing Time: {total_time:.2f} seconds")



[0.00s - 10.00s]  to be addressed by the Prime Minister in Duda near Khrushhtwar of Jammun Kashmir ahead of the first phase of polling on the 18th of September. Now, Prime Minister Modi had last campaign for the BJP in the
[10.00s - 20.00s]  Chenab region during the 2014 Assembly elections when the party had won four of the six seats. Remember the number of seats after delimitation has increased to eight. Ahead of the
[20.00s - 30.00s]  minister's visit there was an encounter that broke out in Kishhtwar as well. The prime minister will be visiting Duda after 45 years at least. My colleague Nazir joins us with more details. Nazir, if you could take
[30.00s - 40.00s]  take us through the Prime Minister's visit and the big campaign for the BJP in Japan-Kashmir. Well, it is Prime Minister Narinan Modi's first election
[40.00s - 50.00s]  rally and in Jama and Kashmir's assembly elections four days ahead of the first phase of assembly polling so very significant
[50.00s - 60.00s]  significan

