In [45]:
pip install boto3 pandas python-Levenshtein python-dotenv


Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [46]:
import boto3
import pandas as pd
import time
from dotenv import load_dotenv
import os
import Levenshtein

In [51]:
# Read the recordings overview, keep the file name and its reference phrase,
# and order the rows by file name.
recordings_df = (
    pd.read_csv('../data/overview-of-recordings.csv')
    .loc[:, ['file_name', 'phrase']]
    .rename(columns={'phrase': 'actual_transcription'})
    .sort_values('file_name')
    .reset_index(drop=True)
)

In [52]:
# Preview the first rows of the recordings table
recordings_df.head()

Unnamed: 0,file_name,actual_transcription
0,1249120_13842059_104469105.wav,I have a painful cramp in my feet
1,1249120_13842059_105045085.wav,The pain feels like it's right below the skin
2,1249120_13842059_11964685.wav,I feel suicidal.
3,1249120_13842059_12420758.wav,I feel a sharp pain in my ankle joint when I s...
4,1249120_13842059_13041979.wav,My shoulder hurts me so much


In [53]:
# Load variables from a local .env file into the process environment
load_dotenv()

# Either value is None when the corresponding variable is not set
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# Session that the AWS clients below are created from
session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [54]:
bucket_name = 'capstone-project-audio-files'

s3 = session.client("s3")

# A single list call returns at most 1,000 keys, so walk every page of the
# listing to avoid silently truncating larger buckets.
paginator = s3.get_paginator("list_objects_v2")
audio_filenames = [
    obj['Key']
    for page in paginator.paginate(Bucket=bucket_name)
    for obj in page.get('Contents', [])  # 'Contents' is absent on empty pages
]

if not audio_filenames:
    print('The bucket is empty.')

In [55]:
# Number of object keys collected from the bucket listing
len(audio_filenames)

500

In [56]:
# Keep only the recordings whose audio file appears in the bucket listing
has_audio = recordings_df['file_name'].isin(audio_filenames)
recordings_df = (
    recordings_df.loc[has_audio]
    .sort_values('file_name')
    .reset_index(drop=True)
)
recordings_df.head()

Unnamed: 0,file_name,actual_transcription
0,1249120_13842059_104469105.wav,I have a painful cramp in my feet
1,1249120_13842059_105045085.wav,The pain feels like it's right below the skin
2,1249120_13842059_11964685.wav,I feel suicidal.
3,1249120_13842059_12420758.wav,I feel a sharp pain in my ankle joint when I s...
4,1249120_13842059_13041979.wav,My shoulder hurts me so much


In [57]:
# Number of rows currently in recordings_df
len(recordings_df)

500

In [12]:
def amazon_transcribe(audio_filename, s3_bucket, poll_interval=15):
    """
    Transcribe one audio file stored in S3 with Amazon Transcribe.

    Starts a transcription job for the file, polls until the job reaches a
    terminal state, then reads the transcript JSON and returns its text.
    Relies on the module-level ``transcribe`` client.

    Parameters
    ----------
    audio_filename : str
        Object key of the audio file. Its extension (lower-cased) is sent as
        the media format, and the name without the extension and without
        spaces is used as the job name.
    s3_bucket : str
        Prefix placed directly in front of ``audio_filename`` to build the
        media URI, e.g. ``"s3://my-bucket/"`` (trailing slash included).
    poll_interval : int or float, default 15
        Seconds to wait between job status checks.

    Returns
    -------
    str
        Text of the first transcript in the job's result file.

    Raises
    ------
    ValueError
        If ``audio_filename`` has no extension to infer the media format from.
    RuntimeError
        If the job finishes with status ``FAILED``.
    """
    # splitext cuts at the last dot only, so names containing extra dots
    # still yield the real extension.
    stem, extension = os.path.splitext(audio_filename)
    if not extension:
        raise ValueError(
            f"Cannot infer the media format from {audio_filename!r}"
        )

    job_uri = s3_bucket + audio_filename
    job_name = stem.replace(" ", "")
    file_format = extension[1:].lower()

    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': job_uri},
        MediaFormat=file_format,
        LanguageCode='en-US')

    # Poll until the job reaches one of its two terminal states
    while True:
        result = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        job = result['TranscriptionJob']
        status = job['TranscriptionJobStatus']
        if status in ('COMPLETED', 'FAILED'):
            break
        time.sleep(poll_interval)

    # Surface the failure explicitly instead of falling through to an
    # unbound `data` variable.
    if status == 'FAILED':
        reason = job.get('FailureReason', 'no failure reason reported')
        raise RuntimeError(f"Transcription job {job_name!r} failed: {reason}")

    data = pd.read_json(job['Transcript']['TranscriptFileUri'])
    return data.at['transcripts', 'results'][0]['transcript']

In [13]:
# Create the Transcribe client from the shared session, the same way the S3
# client is created, so the credentials are configured in one place only.
transcribe = session.client('transcribe', region_name='us-east-1')

In [14]:
pred_audio_transcripts = []

s3_bucket = f"s3://{bucket_name}/"

# Iterate the DataFrame's own file_name column rather than the raw bucket
# listing, so there is exactly one prediction per row, in row order.
for audio_filename in recordings_df['file_name']:
    pred_audio_transcripts.append(amazon_transcribe(audio_filename, s3_bucket))

In [15]:
# Positional assignment: the list needs one entry per row, in row order
recordings_df['pred_transcription'] = pred_audio_transcripts

In [16]:
# Display the table with the pred_transcription column
recordings_df

Unnamed: 0,file_name,actual_transcription,pred_transcription
0,1249120_13842059_104469105.wav,I have a painful cramp in my feet,I have a painful cramp in my feet.
1,1249120_13842059_105045085.wav,The pain feels like it's right below the skin,The pain feels like it's right below the skin.
2,1249120_13842059_11964685.wav,I feel suicidal.,I feel suicidal.
3,1249120_13842059_12420758.wav,I feel a sharp pain in my ankle joint when I s...,I feel a sharp pain in my ankle joint when I s...
4,1249120_13842059_13041979.wav,My shoulder hurts me so much,My shoulder hurts me so much.
...,...,...,...
495,1249120_6338946_95779210.wav,my ankle is hurting me,My ankle is hurting me.
496,1249120_6338946_97962298.wav,I can't walk because i have a great foot ache,I can't walk because I have a great foot. A.
497,1249120_6338946_98107593.wav,When I tried to be warm and wear more clothes ...,"When I tried to be warm and wear more clothes,..."
498,1249120_6338946_99675257.wav,I've always been very active but now I just do...,I've always been very active but now I just do...


In [17]:
# Save the table to CSV without writing the index column
recordings_df.to_csv('./train_recordings_transcription_AWS.csv', index=False)

In [34]:
# Load the transcription table from its CSV file
recordings_df = pd.read_csv('./train_recordings_transcription_AWS.csv')

In [35]:
# Preview the first rows of the table
recordings_df.head()

Unnamed: 0,file_name,actual_transcription,pred_transcription
0,1249120_13842059_104469105.wav,I have a painful cramp in my feet,I have a painful cramp in my feet.
1,1249120_13842059_105045085.wav,The pain feels like it's right below the skin,The pain feels like it's right below the skin.
2,1249120_13842059_11964685.wav,I feel suicidal.,I feel suicidal.
3,1249120_13842059_12420758.wav,I feel a sharp pain in my ankle joint when I s...,I feel a sharp pain in my ankle joint when I s...
4,1249120_13842059_13041979.wav,My shoulder hurts me so much,My shoulder hurts me so much.


In [37]:
def wer(reference, hypothesis):
    """
    Calculate Word Error Rate (WER) between reference and hypothesis.

    Both texts are split on whitespace and compared word for word with the
    Levenshtein distance. No normalization is applied, so differences in
    case or punctuation count as errors.

    Parameters
    ----------
    reference : str
        Ground-truth text. Non-string values (e.g. None, NaN) are treated as
        an empty string.
    hypothesis : str
        Predicted text. Non-string values are treated as an empty string.

    Returns
    -------
    float
        Word edit distance divided by the number of reference words, rounded
        to 4 decimals. When the reference has no words the ratio is undefined;
        this returns 0.0 if the hypothesis is empty too, otherwise 1.0.
    """
    # Missing values (an empty CSV cell is read back as NaN) have no .split()
    reference = reference if isinstance(reference, str) else ""
    hypothesis = hypothesis if isinstance(hypothesis, str) else ""

    reference_words = reference.split()
    hypothesis_words = hypothesis.split()

    # Guard the division below against an empty reference
    if not reference_words:
        return 1.0 if hypothesis_words else 0.0

    # Word-level edit distance
    distance = Levenshtein.distance(reference_words, hypothesis_words)

    rate = distance / len(reference_words)
    return round(rate, 4)


def cer(reference, hypothesis):
    """
    Calculate Character Error Rate (CER) between reference and hypothesis.

    The two strings are compared character by character with the Levenshtein
    distance. No normalization is applied, so differences in case,
    punctuation, or whitespace count as errors.

    Parameters
    ----------
    reference : str
        Ground-truth text. Non-string values (e.g. None, NaN) are treated as
        an empty string.
    hypothesis : str
        Predicted text. Non-string values are treated as an empty string.

    Returns
    -------
    float
        Character edit distance divided by the reference length, rounded to
        4 decimals. When the reference is empty the ratio is undefined; this
        returns 0.0 if the hypothesis is empty too, otherwise 1.0.
    """
    # Missing values (an empty CSV cell is read back as NaN) have no length
    reference = reference if isinstance(reference, str) else ""
    hypothesis = hypothesis if isinstance(hypothesis, str) else ""

    # Guard the division below against an empty reference
    if not reference:
        return 1.0 if hypothesis else 0.0

    # Character-level edit distance
    distance = Levenshtein.distance(reference, hypothesis)

    rate = distance / len(reference)
    return round(rate, 4)

In [38]:
# Pair every reference phrase with its predicted transcript, row by row
transcript_pairs = list(
    zip(recordings_df['actual_transcription'], recordings_df['pred_transcription'])
)

# One score per recording, in row order
wer_scores = [wer(actual, pred) for actual, pred in transcript_pairs]
cer_scores = [cer(actual, pred) for actual, pred in transcript_pairs]

In [39]:
# Attach the per-recording error rates as new columns
recordings_df['word_error_rate'] = wer_scores
recordings_df['character_error_rate'] = cer_scores

In [40]:
# Display the table including the error-rate columns
recordings_df

Unnamed: 0,file_name,actual_transcription,pred_transcription,word_error_rate,character_error_rate
0,1249120_13842059_104469105.wav,I have a painful cramp in my feet,I have a painful cramp in my feet.,0.1250,0.0303
1,1249120_13842059_105045085.wav,The pain feels like it's right below the skin,The pain feels like it's right below the skin.,0.1111,0.0222
2,1249120_13842059_11964685.wav,I feel suicidal.,I feel suicidal.,0.0000,0.0000
3,1249120_13842059_12420758.wav,I feel a sharp pain in my ankle joint when I s...,I feel a sharp pain in my ankle joint when I s...,0.0000,0.0000
4,1249120_13842059_13041979.wav,My shoulder hurts me so much,My shoulder hurts me so much.,0.1667,0.0357
...,...,...,...,...,...
495,1249120_6338946_95779210.wav,my ankle is hurting me,My ankle is hurting me.,0.4000,0.0909
496,1249120_6338946_97962298.wav,I can't walk because i have a great foot ache,I can't walk because I have a great foot. A.,0.3000,0.1333
497,1249120_6338946_98107593.wav,When I tried to be warm and wear more clothes ...,"When I tried to be warm and wear more clothes,...",0.1176,0.0267
498,1249120_6338946_99675257.wav,I've always been very active but now I just do...,I've always been very active but now I just do...,0.0000,0.0000


In [41]:
# Average word error rate over all recordings
mean_wer = recordings_df['word_error_rate'].mean()
print('Mean WER:', round(mean_wer, 4))

Mean WER: 0.181


In [42]:
# Average character error rate over all recordings
mean_cer = recordings_df['character_error_rate'].mean()
print('Mean CER:', round(mean_cer, 4))

Mean CER: 0.0467


In [43]:
# Write the table, now with the error-rate columns, to the CSV path
# (replaces any existing file at that path)
recordings_df.to_csv('./train_recordings_transcription_AWS.csv', index=False)