### Setup Modules & Datasets

In [1]:
# Install praatio; its `textgrid` module is imported further down to read TextGrid annotation files.
!pip install praatio



In [2]:
# Switch to the project folder so the relative CSV / model / audio paths used below resolve,
# then echo the resulting working directory.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — no mount call is visible here.
%cd /content/drive/MyDrive/Audio_SR_22050/
%pwd

/content/drive/MyDrive/Audio_SR_22050


'/content/drive/MyDrive/Audio_SR_22050'

In [3]:
import pandas as pd
import numpy as np
import librosa
import torch

### Importing in Dataset


In [7]:
# Load the table of pre-computed chunks: one row per chunk with its TextGrid file, split (ds_type),
# begin/end time, class label, speaker sex and session.
df_1sec = pd.read_csv("all_spectrify_SR_22050_slen_0_1_clen_1.csv", index_col = 0)
# Invert the binary label stored in the CSV (0 -> 1, any other value -> 0).
# NOTE(review): presumably this aligns the CSV coding with dict_labels ({"S": 0, "I": 1}) — confirm
# against how the CSV was generated.
df_1sec['class'] = df_1sec['class'].apply(lambda x: 1 if x == 0 else 0);
df_1sec

Unnamed: 0,correct_filename,ds_type,begin_time,end_time,class,sex,session
0,Train/0171017001_h_00.TextGrid,Train,1.453696,2.463696,1,M,ses1017
1,Train/0171017001_h_00.TextGrid,Train,2.463696,3.563696,1,M,ses1017
2,Train/0171017001_h_00.TextGrid,Train,3.563696,4.763696,1,M,ses1017
3,Train/0171017002_h_00.TextGrid,Train,0.776871,1.796871,1,M,ses1017
4,Train/0171017002_h_00.TextGrid,Train,1.796871,2.936871,1,M,ses1017
...,...,...,...,...,...,...,...
59019,Test/5824078030_h_00.TextGrid,Test,16.660000,17.800000,0,M,ses4078
59020,Test/5824078030_h_00.TextGrid,Test,18.430000,19.460000,0,M,ses4078
59021,Test/5824078030_h_00.TextGrid,Test,20.820000,21.840000,0,M,ses4078
59022,Test/5824078030_h_00.TextGrid,Test,21.840000,22.880000,0,M,ses4078


In [8]:
# Label encoding used in this notebook: "S" -> 0 and "I" -> 1 (0 = sober, 1 = intoxicated).
dict_labels = {"S" : 0, "I" : 1}

In [9]:
def create_df_ds(df_all_maps, ds_type):
    """Return the rows of one dataset split, indexed by TextGrid filename.

    Args:
        df_all_maps: DataFrame with at least the columns 'ds_type' and 'correct_filename'.
        ds_type: Split to keep; the values used in this notebook are "Train", "D1" and "Test".

    Returns:
        A new DataFrame containing only the rows whose 'ds_type' equals `ds_type`,
        with 'correct_filename' moved from a column to the index.
    """
    in_split = df_all_maps['ds_type'] == ds_type
    split_rows = df_all_maps[in_split]
    return split_rows.set_index('correct_filename')

In [10]:
# Training split: rows whose ds_type is "Train", indexed by TextGrid filename.
df_1sec_train = create_df_ds(df_1sec, "Train")
print(df_1sec_train.shape)
df_1sec_train.head()

(23483, 6)


Unnamed: 0_level_0,ds_type,begin_time,end_time,class,sex,session
correct_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Train/0171017001_h_00.TextGrid,Train,1.453696,2.463696,1,M,ses1017
Train/0171017001_h_00.TextGrid,Train,2.463696,3.563696,1,M,ses1017
Train/0171017001_h_00.TextGrid,Train,3.563696,4.763696,1,M,ses1017
Train/0171017002_h_00.TextGrid,Train,0.776871,1.796871,1,M,ses1017
Train/0171017002_h_00.TextGrid,Train,1.796871,2.936871,1,M,ses1017


In [11]:
# Number of chunks per class label in the training split.
df_1sec_train['class'].value_counts()

0    18116
1     5367
Name: class, dtype: int64

In [12]:
# Validation split: stored under ds_type "D1" in the CSV, indexed by TextGrid filename.
df_1sec_val = create_df_ds(df_1sec, "D1")
print(df_1sec_val.shape)
df_1sec_val.head()

(17389, 6)


Unnamed: 0_level_0,ds_type,begin_time,end_time,class,sex,session
correct_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Val/0261027001_h_01.TextGrid,D1,1.428073,2.508073,1,M,ses1027
Val/0261027001_h_01.TextGrid,D1,2.508073,3.648073,1,M,ses1027
Val/0261027001_h_01.TextGrid,D1,3.648073,4.698073,1,M,ses1027
Val/0261027016_h_00.TextGrid,D1,1.706508,2.776508,1,M,ses1027
Val/0261027016_h_00.TextGrid,D1,2.776508,3.896508,1,M,ses1027


In [13]:
# Unique TextGrid filenames in the validation split that have at least one chunk labelled 1 (intoxicated).
# The index holds TextGrid names; the Spectrify class derives the matching .wav name from them.
df_1sec_val[df_1sec_val['class'] == 1].index.unique()

Index(['Val/0261027001_h_01.TextGrid', 'Val/0261027016_h_00.TextGrid',
       'Val/0261027002_h_00.TextGrid', 'Val/0261027017_h_00.TextGrid',
       'Val/0261027003_h_00.TextGrid', 'Val/0261027018_h_00.TextGrid',
       'Val/0261027019_h_00.TextGrid', 'Val/0261027004_h_00.TextGrid',
       'Val/0261027005_h_00.TextGrid', 'Val/0261027020_h_00.TextGrid',
       ...
       'Val/5963097009_h_00.TextGrid', 'Val/5963097017_h_00.TextGrid',
       'Val/5963097018_h_00.TextGrid', 'Val/5963097010_h_00.TextGrid',
       'Val/5963097019_h_00.TextGrid', 'Val/5963097020_h_00.TextGrid',
       'Val/5963097021_h_00.TextGrid', 'Val/5963097022_h_00.TextGrid',
       'Val/5963097024_h_00.TextGrid', 'Val/5963097025_h_00.TextGrid'],
      dtype='object', name='correct_filename', length=821)

In [14]:
# Number of chunks per class label in the validation split.
df_1sec_val['class'].value_counts()

0    13043
1     4346
Name: class, dtype: int64

In [15]:
# Test split: rows whose ds_type is "Test", indexed by TextGrid filename.
df_1sec_test = create_df_ds(df_1sec, "Test")
print(df_1sec_test.shape)
df_1sec_test.head()

(18152, 6)


Unnamed: 0_level_0,ds_type,begin_time,end_time,class,sex,session
correct_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test/0151015001_h_00.TextGrid,Test,2.237392,3.287392,1,F,ses1015
Test/0151015001_h_00.TextGrid,Test,3.287392,4.337392,1,F,ses1015
Test/0151015001_h_00.TextGrid,Test,4.337392,5.407392,1,F,ses1015
Test/0151015001_h_00.TextGrid,Test,5.757392,6.787392,1,F,ses1015
Test/0151015002_h_00.TextGrid,Test,1.864331,2.924331,1,F,ses1015


#### Spectrify Class & Size Normalization Function

In [16]:
def equal_specs(input_ar, des_shape):
  """Force a 2-D spectrogram to have exactly des_shape[1] columns.

  Only the column (time) axis is adjusted; the row count is left untouched.
  Arrays that are too wide are cut on the right, arrays that are too narrow are
  right-padded with zeros, and arrays that already match are returned as-is.
  """
  target_cols = des_shape[1]
  current_cols = input_ar.shape[1]

  if current_cols == target_cols:
    return input_ar
  if current_cols > target_cols:
    # Too many time frames: keep only the leading ones
    return input_ar[:, :target_cols]

  # Too few time frames: no padding on rows, zero-pad the missing columns on the right
  missing_cols = target_cols - current_cols
  return np.pad(input_ar, ((0, 0), (0, missing_cols)), mode='constant', constant_values=0)

In [17]:
import librosa
from praatio import textgrid

#Define Spectrify class with parameters
class Spectrify:
    """Cut TextGrid-annotated recordings into chunks and render each chunk as a mel spectrogram.

    Parameters
    ----------
    fmin, fmax : frequency bounds forwarded to librosa's mel spectrogram.
    nmels : number of mel bands (rows of the spectrogram).
    hop_length, n_fft : STFT hop and window size forwarded to librosa.
    silence_len : a pause ("<p:>") longer than this discards the phrase being built
        (same time unit as the TextGrid timestamps).
    chunk_len : accumulated duration a phrase must reach before it is emitted.
    des_shape : tuple (rows, cols); the column count is enforced on every spectrogram.
    nml_tech01 : True -> scale each spectrogram to [0, 1]; otherwise scale to [-1, 1].
    """

    def __init__(self, fmin, fmax, nmels, hop_length, n_fft, silence_len, chunk_len, des_shape, nml_tech01):
        self.fmin = fmin
        self.fmax = fmax
        self.nmels = nmels
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.silence_len = silence_len
        self.chunk_len = chunk_len
        self.desired_shape = des_shape # Tuple of required shape
        self.normal_tech_01 = nml_tech01

    def planner(self, filename):
        """Read the first tier of the TextGrid `filename` and return its phrases (see `phraser`)."""
        tg = textgrid.openTextgrid(filename, False)
        # Convert the tier entries to plain (start, end, label) tuples
        entries = [(start, end, label) for start, end, label in tg.tiers[0].entries]
        return self.phraser(entries, filename)

    def phraser(self, entries, filename):
        """Group consecutive intervals into phrases of at least `chunk_len` accumulated duration.

        A pause ("<p:>") longer than `silence_len`, or a "<usb>" noise interval, throws away
        the phrase currently being built. Shorter pauses are kept inside the phrase.

        Args:
            entries: iterable of (start, end, label) tuples.
            filename: unused; kept so existing callers keep working.

        Returns:
            List of phrases, each a list of (start, end, label) tuples.
        """
        phrases = []
        phrase_duration = 0
        current_phrase = []

        for start, end, label in entries:
            duration = end - start
            is_long_pause = label == "<p:>" and duration > self.silence_len
            # From original ALC, these are noise: ["<\"ah>", "<hm>", "<\"ahm>", "<hes>", "[sta]", "[int]", "[spk]", "<P>", "<PP>"]
            # In this copy only "<usb>" is treated as noise; customize to your TextGrids' values
            is_noise = label == "<usb>"

            if is_long_pause or is_noise:
                current_phrase = []
                phrase_duration = 0
                continue

            # Speech (or a tolerated short pause): extend the current phrase
            phrase_duration += duration
            current_phrase.append((start, end, label))

            if phrase_duration >= self.chunk_len:
                phrases.append(current_phrase)
                current_phrase = [] # Start looking for the next phrase in the TextGrid
                phrase_duration = 0

        return phrases

    @staticmethod
    def _rescale(spec_db, unit_range):
        """Min-max scale `spec_db` to [0, 1] (`unit_range` truthy) or to [-1, 1].

        A flat spectrogram (max == min, e.g. digital silence) has no range to stretch;
        it is mapped to the lower bound instead of dividing by zero and producing NaNs.
        """
        floor = np.min(spec_db)
        span = np.max(spec_db) - floor
        if span == 0:
            scaled = np.zeros_like(spec_db)
        else:
            scaled = (spec_db - floor) / span
        return scaled if unit_range else 2 * scaled - 1

    def spectrify(self, filename, beginning, end):
        """Return the normalised, fixed-width mel spectrogram of one chunk.

        Args:
            filename: TextGrid filename; every "TextGrid" in it is replaced by "wav"
                to locate the audio file.
            beginning, end: chunk boundaries, passed to librosa.load as offset and
                (end - beginning) as duration.

        Returns:
            2-D array scaled to [0, 1] or [-1, 1] (per `normal_tech_01`) and cut or
            zero-padded by `equal_specs` to the desired number of columns.
        """
        filename = filename.replace("TextGrid", "wav")
        length = end - beginning
        # Audio is resampled to a fixed 22,050 Hz on load
        y, sr = librosa.load(filename, offset=beginning, duration=length, sr = 22_050)

        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=self.nmels, n_fft=self.n_fft, hop_length=self.hop_length,
                                               fmin=self.fmin, fmax=self.fmax)
        S_db = librosa.power_to_db(S)
        # The `== True` comparison keeps the original selection semantics for the flag
        normalized_spec = self._rescale(S_db, self.normal_tech_01 == True)
        return equal_specs(normalized_spec, self.desired_shape) # normalizing size of retrieved chunks

#### Required functions for Speech File Classification

Creating Gender-Specific Spectrify Object

In [18]:
# Optimized Male & Female Spectrogram Creation Parameters
dict_male_optimhp = pd.read_csv("male_optimhp.csv", index_col = 0).to_dict('records')[0]
dict_female_optimhp = pd.read_csv("female_optimhp.csv", index_col = 0).to_dict('records')[0]

In [20]:
# Creating Spectrify Obj Dependent on Whether User is Male/Female:
def create_spectrify_obj(Gender):
  """Build the Spectrify object configured with the gender-specific hyperparameters.

  "M" selects the male parameter dict; every other value selects the female one.
  The target spectrogram shape is (nmels, int(22_050 / hop_length) + 1) and is also
  stored back into the selected parameter dict under the key 'des_shape'.
  """
  spec_params = dict_male_optimhp if Gender == "M" else dict_female_optimhp

  # Column count derived from 22,050 samples and the hop length
  n_frames = int(22_050 / spec_params['hop_length']) + 1
  spec_params['des_shape'] = (spec_params['nmels'], n_frames)

  return Spectrify(
      fmin=spec_params['fmin'],
      fmax=spec_params['fmax'],
      nmels=spec_params['nmels'],
      hop_length=spec_params['hop_length'],
      n_fft=spec_params['n_fft'],
      silence_len=spec_params['silence_len'],
      chunk_len=spec_params['chunk_len'],
      des_shape=spec_params['des_shape'],
      nml_tech01=spec_params['normal_tech_01'],
  )

Creating DataFrame Mapping Locations of Desired Chunks Within Audio File

In [21]:
def create_df_chunk_mapping(file_str, spect_obj):
  """Map every phrase found in one TextGrid to its (filename, begin_time, end_time) row.

  Args:
      file_str: TextGrid filename, passed to `spect_obj.planner`.
      spect_obj: object exposing `planner(filename)` that returns a list of phrases,
          each phrase being a list of (start, end, label) tuples.

  Returns:
      DataFrame with columns 'filename', 'begin_time', 'end_time'; one row per phrase,
      spanning from the start of its first interval to the end of its last one.
      The frame is empty (but keeps its columns) when no phrase is found.
  """
  chunk_rows = [
      (file_str, phrase[0][0], phrase[-1][1])
      for phrase in spect_obj.planner(file_str)
  ]
  return pd.DataFrame(chunk_rows, columns = ['filename', 'begin_time', 'end_time'])

###### Audio Dataset Class

In [22]:
def cnn_reshape(input_arr):
  """Add a leading channel axis of size 1: (height, width) -> (1, height, width)."""
  height = input_arr.shape[0]
  width = input_arr.shape[1]
  return input_arr.reshape((1, height, width))

In [23]:
from torch.utils.data import Dataset

class Audio_DS(Dataset):
    """Dataset yielding one CNN-ready spectrogram per row of a chunk-mapping DataFrame.

    `data` must provide the columns 'filename', 'begin_time' and 'end_time'.
    `device` is stored on the instance but not used by this class.
    """

    def __init__(self, data, spectrify_obj, device = "cuda"):
        self.spectrify_obj = spectrify_obj # Predefined Spectrify object holding the audio parameters
        self.device = device
        self.df_mapping = data # One row per chunk
        # Cache the three columns as arrays for positional lookup in __getitem__
        self.file_strs, self.begin_pts, self.end_pts = (
            self.df_mapping[column].values
            for column in ('filename', 'begin_time', 'end_time')
        )

    def __len__(self):
      # Number of chunks available in the mapping
      return len(self.file_strs)

    def __getitem__(self, idx):
      # Spectrogram of the idx-th chunk, already normalised and size-standardised by the Spectrify object
      spectrogram = self.spectrify_obj.spectrify(self.file_strs[idx], self.begin_pts[idx], self.end_pts[idx])
      # Shape becomes (num_channels, img_height, img_width)
      return cnn_reshape(spectrogram)

Generating Chunk Class Predictions & Associated Probabilities of being Intoxicated (Class: 1)

In [24]:
from torch.utils.data import DataLoader
# Val Datasets & DataLoaders
def gen_chunk_preds(spect_obj, df_val, device, model):
  """Predict a class and an intoxication probability for every chunk of one file.

  Args:
      spect_obj: Male/Female Spectrify object used to render each chunk.
      df_val: DataFrame of chunk mappings (columns 'filename', 'begin_time', 'end_time').
      device: device on which inference runs; the model and every batch are moved to it.
      model: Male/Female model returning one logit per chunk.

  Returns:
      Dict with key 'preds' (array of chunk class predictions, 0. or 1., obtained by
      thresholding the sigmoid output at 0.5) and key 'probs' (array of the sigmoid
      outputs). Both arrays are empty when `df_val` has no rows.
  """
  val_ds = Audio_DS(df_val, spect_obj, device = device)
  val_data_loader = DataLoader(val_ds, 32, shuffle = False, num_workers = 2, prefetch_factor= 4, drop_last = False)

  # Model and inputs must live on the same device; the DataLoader yields CPU tensors
  model = model.to(device)
  # Setting model to eval to switch off dropout
  model.eval()

  all_preds = []
  all_probs = []
  with torch.no_grad():
    for batch in val_data_loader:
      batch_logits = model(batch.to(device))
      # Flatten to 1-D so a batch holding a single chunk is handled like any other
      batch_probs = torch.sigmoid(batch_logits).reshape(-1)
      batch_preds = (batch_probs >= 0.5).float()
      all_preds.extend(batch_preds.tolist())
      all_probs.extend(batch_probs.tolist())

  return {'preds': np.array(all_preds), 'probs': np.array(all_probs)}


Aggregating Chunk Predictions to get the File Prediction & Mean Probability if Predicted to be Intoxicated (Class: 1)

In [25]:
def tg_class_pred(dict_preds_probs):
  """Aggregate chunk votes into one prediction for the whole TextGrid / speech file.

  Labeling scheme: 0 -> Sober, 1 -> Intox.

  Args:
      dict_preds_probs: dict with 'preds' (chunk class votes, 0 or 1) and 'probs'
          (chunk probabilities of being intoxicated), aligned element by element.

  Returns:
      0 when sober votes strictly outnumber intoxicated votes.
      Otherwise the tuple (1, mean_prob), where mean_prob is the average probability
      of the chunks that voted intoxicated. A tie sides with caution and is reported
      as intoxicated.

  Raises:
      ValueError: if there are no chunk predictions to aggregate.
  """
  preds = np.asarray(dict_preds_probs['preds'])
  probs = np.asarray(dict_preds_probs['probs'])
  if preds.size == 0:
    raise ValueError("Cannot classify a file without chunk predictions")

  intox_votes = preds == 1
  num_ones = int(np.count_nonzero(intox_votes))
  num_zeros = preds.size - num_ones

  if num_zeros > num_ones: # More sober votes than intox votes, so pred = sober
    return 0

  # Tie or intox majority: report intox with the mean probability of the intox chunks
  return 1, np.mean(probs[intox_votes])

### Example Run with Female Model & All Outputs to Illustrate Process

### All Required Functions Placed into Class & Dynamically Accounting for Gender

In [26]:
import random
import torch
class Speech_Classify():
  """Classify one speech file as sober (0) or intoxicated (1) with the gender-specific model.

  `input_entry` is a dict with the keys 'correct_filename' (TextGrid filename),
  'class' (actual label, {0 : Sober, 1 : Intox}) and 'sex' ("M" selects the male
  spectrogram parameters and model; any other value selects the female ones).
  """
  def __init__(self, input_entry):
    self.filename = input_entry['correct_filename'] # TextGrid filename whose class (Intox or Sober) is predicted
    print("Filename:", self.filename)
    self.class_label = input_entry['class'] # Getting actual class label {0 : Sober , 1 : Intox}
    print("Class", self.class_label)
    self.gender = input_entry['sex'] # Getting gender to instantiate correct Spectrify Object
    print("Gender:", self.gender)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Getting device to use

    # create_spectrify_obj treats every value other than "M" as female
    self.spec_obj = create_spectrify_obj(self.gender)
    model_path = "11_1_Male_CNN_model.pt" if self.gender == "M" else "11_1_Fem_CNN_model.pt"
    # Load onto the CPU whatever device the model was saved from: this works on hosts
    # without a GPU and matches the CPU tensors produced by the DataLoader.
    self.model = torch.jit.load(model_path, map_location="cpu")

  def speech_classifier(self):
    """Run chunk extraction, chunk prediction and vote aggregation for the file.

    Returns:
        0 when the file is predicted sober, otherwise the tuple (1, prob) where prob
        is the mean probability of the chunks predicted intoxicated.

    Raises:
        ValueError: if no chunk satisfying the chunk-length / silence constraints
            could be extracted from the file.
    """
    # Chunks meeting the minimum chunk length and adhering to the silence tolerance
    df_chunk_mapping = create_df_chunk_mapping(self.filename, self.spec_obj)
    if df_chunk_mapping.empty:
      raise ValueError(f"No chunk could be extracted from {self.filename}; file cannot be classified")

    # Prediction and associated probability for each extracted chunk
    dict_chunk_predobs = gen_chunk_preds(self.spec_obj, df_chunk_mapping, self.device, self.model)

    # Chunk predictions made by seeing if sigmoid probabilities >= prob threshold of 0.5
    print("All chunk predictions from file:", dict_chunk_predobs['preds'])
    # All probabilities should be interpreted as probability of being intoxicated
    print("All chunk probs of Intoxicated:", dict_chunk_predobs['probs'])

    # Speech File Classification: a bare class when sober, (class, probability) when intox
    file_result = tg_class_pred(dict_chunk_predobs)
    if isinstance(file_result, tuple):
      pred_class, prob = file_result
      print(f"Speech File Predicted Class: {pred_class}")
      print(f"Speech File Classification Probability:  {prob}")
      print("Speech File Actual Class: ", self.class_label)
      return pred_class, prob

    pred_class = file_result
    print(f"Speech File Predicted Class: {pred_class}")
    print("Speech File Actual Class: ", self.class_label)
    return pred_class


In [29]:
# Draw one random chunk row and keep only the fields Speech_Classify reads.
# No random_state is passed, so a different file may be selected on every run.
df_1sec_sample = df_1sec.sample(1).to_dict('records')[0]
desired_keys = ['correct_filename', 'class', 'sex']
sample_inputs = {key: df_1sec_sample[key] for key in desired_keys}
sample_inputs

{'correct_filename': 'Train/0172032034_h_00.TextGrid', 'class': 0, 'sex': 'M'}

#### Workflow Explained Step By Step
1. Extract chunk mappings from the pre-generated TextGrid that is tied to the corresponding speech .wav file, sampled at 22,050 Hz
2. Load the gender-specific model based on the user's gender (Male or Female)
3. Extract chunks from the speech .wav file and predict a class (0 or 1) for each extracted chunk
>  S maps to 0 and I maps to 1
4. Aggregate the chunk predictions to predict the class (0 or 1) of the entire speech file
5. Output the speech file's class and, if the predicted class is 1, a confidence metric
> The confidence metric is the average of the probabilities of the chunks that are predicted to be 1

In [30]:
# Classify the sampled file end to end; the result is 0 for sober or (1, probability) for intoxicated.
speeclfy_obj = Speech_Classify(sample_inputs)
tup_cl_probs = speeclfy_obj.speech_classifier()

Filename: Train/0172032034_h_00.TextGrid
Class 0
Gender: M
All chunk predictions from file: [1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
All chunk probs of Intoxicated: [0.52793592 0.20674776 0.31631657 0.35813221 0.37376326 0.43711048
 0.41836074 0.31834885 0.52602804 0.35708904 0.43483189 0.43239778
 0.38339078 0.33766201 0.43040973 0.3792845  0.46156368 0.4724001
 0.45299643 0.41387591 0.40708807 0.49490446 0.54374146 0.41413563
 0.29497907 0.45925829 0.44147131 0.35552368 0.34145036 0.35940605
 0.44754148 0.26919195 0.381832   0.30393761 0.42584956]
Speech File Predicted Class: 0
Speech File Actual Class:  0
