Practice loading from GCS to debug

In [3]:
# Import
import pandas as pd
import io
import numpy as np
import sys
import json
import torch
import torchaudio
from google.cloud import storage, bigquery
from utilities.ssast_utils import *
from utilities.speech_utils import *

In [4]:
# Display the torchaudio version installed in this kernel.
torchaudio.__version__


'0.12.1+cu102'

In [12]:
# Display the torch version installed in this kernel.
torch.__version__

'1.12.1+cu102'

In [7]:
# First, load data from google storage bucket

# Project / bucket configuration for the study being inspected.
project_name = 'ml-mps-aif-afdgpet01-p-6827'
study = 'speech_poc_freeze_1'
bucket_name = 'ml-e107-phi-shared-aif-us-p'
gcs_prefix = f'speech_ai/speech_lake/{study}'

# Clients and bucket handle used by the cells below.
storage_client = storage.Client(project=project_name)
bq_client = bigquery.Client(project=project_name)
bucket = storage_client.bucket(bucket_name)

# Names of every object under the study prefix. The prefix is taken from
# gcs_prefix so that changing `study` above also changes the listing.
file_list = [
    listed_blob.name
    for listed_blob in storage_client.list_blobs(bucket_name, prefix=gcs_prefix)
]

# Text after the last '.' of each object name. Built once, after the listing
# is complete, so it is also defined (as an empty list) when nothing is found.
extensions = [name.split('.')[-1] for name in file_list]

# Locations of the train / test split CSVs.
data_split_root = 'gs://ml-e107-phi-shared-aif-us-p/speech_ai/share/data_splits/amr_subject_dedup_594_train_100_test_binarized_v20220620'
gcs_train_path = f'{data_split_root}/train.csv'
gcs_test_path = f'{data_split_root}/test.csv'

In [None]:
# Preview only the first few object names; the full listing can be very long.
file_list[:10]

In [4]:
# (1) load the train and test files to a df (rows are indexed by 'uid')
train_df = pd.read_csv(gcs_train_path, index_col='uid')
test_df = pd.read_csv(gcs_test_path, index_col='uid')

# (2) alter columns as necessary
# 'distortions' is 1 when the sum of the two distortion columns is positive,
# otherwise 0.
train_df["distortions"] = ((train_df["distorted Cs"] + train_df["distorted V"]) > 0).astype(int)
test_df["distortions"] = ((test_df["distorted Cs"] + test_df["distorted V"]) > 0).astype(int)

# (3) define target labels
target_labels = [
    'breathy',
    'loudness decay',
    'slow rate',
    'high pitch',
    'hoarse / harsh',
    'irregular artic breakdowns',
    'rapid rate',
    'reduced OA loudness',
    'abn pitch variability',
    'strained',
    'hypernasal',
    'abn loudness variability',
    'distortions',
]

# (4) select only the target labels from train and test df
train_df = train_df[target_labels]
# Select from test_df itself: slicing train_df here would silently replace the
# held-out split with the training rows.
test_df = test_df[target_labels]

In [6]:
# Show the first rows of the label-only training frame as a sanity check.
train_df.head()

Unnamed: 0_level_0,breathy,loudness decay,slow rate,high pitch,hoarse / harsh,irregular artic breakdowns,rapid rate,reduced OA loudness,abn pitch variability,strained,hypernasal,abn loudness variability,distortions
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4d28f730-5814-48e1-bc29-3c0bf562e2fb,0,0,1,0,1,0,0,0,0,1,0,0,0
1e2dedd0-4f93-42ee-b0fb-c77fb7ba4cf4,0,0,1,0,0,0,0,0,1,0,0,1,0
f31c13e4-9f49-411e-b59f-f692244fb740,1,1,0,0,1,0,0,0,0,1,1,0,1
d917de91-c421-40bf-9d75-a0b5b0736c5b,0,0,0,0,0,0,1,1,0,0,0,0,1
9c4a9e77-3080-4591-8797-d712e42d6ed6,1,0,0,0,1,0,0,0,0,1,0,0,1


In [7]:
# (5) prep the data
# prep_ssast_data is not defined in this notebook (presumably it comes from the
# star-imported utilities modules), so its exact behaviour is not visible here.
# NOTE(review): a later cell opens 'train_ssast.json', so this call presumably
# writes '<name>.json' for the name passed as the third argument - confirm.
prep_ssast_data(train_df,target_labels,'train_ssast',create_label_csv=True)
prep_ssast_data(test_df,target_labels,'test_ssast')

In [None]:
def load_waveform_from_gcs(bucket, gcs_prefix, uid, extension = 'mp3'):
    """Load one recording and its metadata from a GCS bucket.

    The audio is read from ``{gcs_prefix}/{uid}/waveform.{extension}``. If that
    download fails and ``extension`` is not already ``'wav'``, the function
    falls back to ``{gcs_prefix}/{uid}/waveform.wav``. The metadata is read
    from ``{gcs_prefix}/{uid}/metadata.json``.

    Args:
        bucket: Object exposing ``blob(path)``; the returned blob must provide
            ``download_as_string()``.
        gcs_prefix: Path prefix inside the bucket under which the per-uid
            folders live.
        uid: Folder name of the recording to load.
        extension: Audio file extension to try first. It is also passed to
            ``torchaudio.load`` as ``format`` (``'wav'`` after a fallback).

    Returns:
        Tuple ``(waveform, metadata)``: the first element returned by
        ``torchaudio.load`` and the parsed content of ``metadata.json``.

    Raises:
        Exception: Whatever the download raises when the requested file cannot
            be fetched and no fallback is possible, or when the fallback
            download, the decoding or the metadata download/parsing fails.
    """

    def _download_waveform(ext):
        # Fetch waveform.<ext> for this uid and wrap the bytes in a file-like buffer.
        gcs_waveform_path = f'{gcs_prefix}/{uid}/waveform.{ext}'
        blob = bucket.blob(gcs_waveform_path)
        return io.BytesIO(blob.download_as_string())

    try:
        wave_bytes = _download_waveform(extension)
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit propagate instead of being swallowed by a bare except.
        if extension == 'wav':
            # The fallback would request the very same object again.
            raise
        extension = 'wav'
        wave_bytes = _download_waveform(extension)

    gcs_metadata_path = f'{gcs_prefix}/{uid}/metadata.json'

    # The second element of torchaudio.load's return value is not used.
    waveform, _ = torchaudio.load(wave_bytes, format = extension)

    metadata_blob = bucket.blob(gcs_metadata_path)
    metadata = json.loads(metadata_blob.download_as_string())

    return waveform, metadata

In [8]:

# Audio extension to request, and the dataset manifest to read.
extension = 'mp3'
dataset_json_file = 'train_ssast.json'

# Parse the manifest file into a Python object.
with open(dataset_json_file, 'r') as fp:
    data_json = json.loads(fp.read())

# Use the 'wav' field of the second manifest entry as the uid to inspect.
data = data_json['data']
uid = data[1]['wav']

In [9]:
# Object path of the selected recording inside the bucket.
gcs_waveform_path = f'{gcs_prefix}/{uid}/waveform.{extension}'
# Keep the blob handle in `blob` and save a local copy of the audio file.
blob = bucket.blob(gcs_waveform_path)
blob.download_to_filename('train_audio1.mp3')

In [10]:
# Download the same blob into memory and wrap the raw bytes in a file-like
# buffer so audio loaders can read from it.
wave_string = blob.download_as_string()
wave_bytes = io.BytesIO(wave_string)

In [11]:
# Try decoding the locally downloaded MP3 with torchaudio.
# NOTE(review): in the recorded run this call raised
# "RuntimeError: Failed to load audio from train_audio1.mp3" (see the cell
# output); the cause is not visible in this notebook - TODO investigate.
waveform, _ = torchaudio.load('train_audio1.mp3', format = extension)


RuntimeError: Failed to load audio from train_audio1.mp3

In [21]:
import librosa

In [26]:
# Rewind the in-memory buffer first so this cell can be re-run after the
# buffer has already been read once.
wave_bytes.seek(0)
# NOTE(review): no sample rate is passed, so librosa applies its default
# loading settings, and the returned rate is discarded - confirm this matches
# what the downstream pipeline expects.
wav, _ = librosa.load(wave_bytes)

In [27]:
# Display the decoded samples returned by librosa.
wav

array([-0.00394658, -0.00651379, -0.00663843, ...,  0.01701407,
        0.01762257,  0.01988065], dtype=float32)