In [18]:
import pandas as pd
import os
import shutil

from sklearn.preprocessing import LabelEncoder
import joblib

In [3]:
# Folders holding the recordings of each split, relative to the notebook's
# working directory.
train_path = '../data/recordings/train'
test_path = '../data/recordings/test'
validate_path = '../data/recordings/validation'

# File-name suffixes that count as audio recordings.
AUDIO_EXTENSIONS = ('.wav', '.mp3')


def list_audio_files(folder_path):
    """Return the names in ``folder_path`` that end with an audio extension.

    Names are returned in whatever order ``os.listdir`` yields them; the
    suffix comparison is case-sensitive.
    """
    # str.endswith accepts a tuple and matches if any suffix fits.
    return [name for name in os.listdir(folder_path) if name.endswith(AUDIO_EXTENSIONS)]


train_files = list_audio_files(train_path)
test_files = list_audio_files(test_path)
validate_files = list_audio_files(validate_path)

##### Class consolidation

Here we consolidate the existing symptom classes by grouping similar medical conditions together. In the end, we have 6 grouped classes for our use case.

In [4]:
# Medical intent groups: each key is a consolidated class name and each value
# lists the (lower-case) symptom prompts that belong to that class.
medical_groups = {
    'Wound Trauma': [
        'infected wound', 'injury from sports', 'open wound',
    ],
    'Musculoskeletal Pain': [
        'shoulder pain', 'knee pain', 'joint pain', 'muscle pain',
    ],
    'Dermatological Issues': [
        'hair falling out', 'skin issue', 'acne',
    ],
    'Respiratory and Ear Issues': [
        'hard to breath', 'cough', 'ear ache', 'feeling cold',
    ],
    'Head and Neck Discomfort': [
        'head ache', 'blurry vision', 'feeling dizzy', 'neck pain',
    ],
    'Abdominal Pain and Weakness': [
        'stomach ache', 'body feels weak', 'internal pain', 'heart hurts',
    ],
}

In [5]:
# Function to map symptoms to their medical intent groups
def map_medical_intent(symptom):
    """Return the name of the medical intent group that lists ``symptom``.

    The groups in ``medical_groups`` are checked in definition order and the
    first group whose symptom list contains ``symptom`` wins. ``None`` is
    returned when no group lists the symptom.
    """
    return next(
        (group_name
         for group_name, members in medical_groups.items()
         if symptom in members),
        None,
    )

In [6]:
# Load the recordings overview and lower-case the prompt column so it can be
# compared against the lower-case symptom names used in medical_groups.
overview = pd.read_csv('overview-of-recordings.csv').assign(
    prompt=lambda frame: frame['prompt'].str.lower()
)

In [7]:
# Tag every recording with the medical intent group of its prompt
# (None when the prompt is not part of any group).
intent_per_row = overview['prompt'].apply(map_medical_intent)
overview['medical intent'] = intent_per_row

# Keep only the recordings that were assigned to a group.
overview = overview.dropna(subset=['medical intent'])

# Preview the prompt -> group mapping
overview[['prompt', 'medical intent']].head()

Unnamed: 0,prompt,medical intent
1,hair falling out,Dermatological Issues
2,heart hurts,Abdominal Pain and Weakness
3,infected wound,Wound Trauma
4,infected wound,Wound Trauma
6,shoulder pain,Musculoskeletal Pain


In [8]:
# Assign each remaining recording to a split according to the folder its
# audio file was found in.
in_train = overview['file_name'].isin(train_files)
in_test = overview['file_name'].isin(test_files)
in_validation = overview['file_name'].isin(validate_files)

train_df = overview.loc[in_train]
test_df = overview.loc[in_test]
validation_df = overview.loc[in_validation]

In [9]:
# Write one metadata CSV per split, keeping only the columns needed downstream.
metadata_columns = ['file_name', 'medical intent', 'phrase']
for split_df, csv_name in (
    (train_df, 'train_metadata.csv'),
    (validation_df, 'validation_metadata.csv'),
    (test_df, 'test_metadata.csv'),
):
    split_df[metadata_columns].to_csv(csv_name, index=False)

In [10]:
# Number of recordings in the train, test and validation splits (in that order).
print(*map(len, (train_df, test_df, validation_df)))

5260 345 342


##### Dump files that won't be used

In [11]:
def dump_files(df, dump_folder, folder_path):
    """Move files in ``folder_path`` that are not listed in ``df`` to ``dump_folder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``file_name`` column. Entries of ``folder_path`` whose
        name appears in that column are left where they are.
    dump_folder : str
        Destination directory. It is created if it does not exist yet.
    folder_path : str
        Directory to scan for files to move.

    Returns
    -------
    None

    Notes
    -----
    Only regular files are moved. Sub-directories of ``folder_path`` are left
    untouched, so a ``dump_folder`` located inside ``folder_path`` is never
    moved into itself.
    """
    # Ensure the dump folder exists
    os.makedirs(dump_folder, exist_ok=True)

    # A set gives constant-time membership checks for every directory entry.
    files_to_keep = set(df['file_name'])

    # os.listdir returns a fully built list, so moving entries while
    # iterating over it is safe.
    for file in os.listdir(folder_path):
        if file in files_to_keep:
            continue

        file_path = os.path.join(folder_path, file)
        # Skip directories (and anything else that is not a regular file).
        if not os.path.isfile(file_path):
            continue

        shutil.move(file_path, os.path.join(dump_folder, file))

In [12]:
# Move the recordings that are not part of any split's metadata out of the
# recordings folders. Note: this relocates files on disk.
for split_df, split_dump_folder, recordings_folder in (
    (train_df, 'train_dump', train_path),
    (test_df, 'test_dump', test_path),
    (validation_df, 'validation_dump', validate_path),
):
    dump_files(split_df, split_dump_folder, recordings_folder)

##### Label Encoding

In [19]:
# Reload the per-split metadata written earlier in the notebook.
train_metadata, validation_metadata, test_metadata = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_metadata.csv',
        'validation_metadata.csv',
        'test_metadata.csv',
    )
)

In [20]:
# Fit the encoder on the training labels only, then reuse the fitted mapping
# for the test and validation splits so all three share the same integer codes.
intent_column = 'medical intent'
label_encoder = LabelEncoder()
train_metadata['label'] = label_encoder.fit_transform(train_metadata[intent_column])
for held_out_metadata in (test_metadata, validation_metadata):
    held_out_metadata['label'] = label_encoder.transform(held_out_metadata[intent_column])

In [21]:
# Persist the fitted label encoder so the same label <-> integer mapping can
# be reloaded later.
encoder_path = '../metadata/label_encoder.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first.
os.makedirs(os.path.dirname(encoder_path), exist_ok=True)
joblib.dump(label_encoder, encoder_path)

['../metadata/label_encoder.pkl']

In [22]:
# Overwrite the metadata CSVs so they now include the integer `label` column.
for labelled_metadata, csv_name in (
    (train_metadata, 'train_metadata.csv'),
    (validation_metadata, 'validation_metadata.csv'),
    (test_metadata, 'test_metadata.csv'),
):
    labelled_metadata.to_csv(csv_name, index=False)