# Create features from all wave files in PC - GITA. 
This is the create_features_from_PC-GITA.py functionality + some more 
Used for note taking. 

- We get all data by going through the folders. --> Saves all features as pandas DataFrames in csv files with the same folder structure as PC-GITA.
    - Could also have done it by making a list or file that shows the structure and then used that to iterate through. 

In [1]:
%run project_setup.py 
import pandas as pd
import os

from disvoice.articulation.articulation import Articulation
from disvoice.phonation.phonation import Phonation
from disvoice.prosody.prosody import Prosody

from constants import path_PC_GITA_16k,  personal_path_to_code


2023-11-18 19:09:48.550041: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [26]:
# Substrings that mark a folder/file name as belonging to the HC or the PD group.
# Might be needed later when fetching features from the files created here.
HC = ["HC", "hc", "Control"] # A name only needs to contain "HC"; HC_las_que_sobraron counts as well
PD = ["PD", "pd", "Patologica", "Patalogica"] # A name only needs to contain "PD"; PD_las_que_sobraron counts as well

# One instance of each DisVoice feature extractor, reused by the functions below.
phonationf=Phonation()
articulationf=Articulation()
prosodyf=Prosody()

## Functionality to get all of our last folders in the tree (not used, but nice to have)

In [27]:
def get_last_level_folders(path):
    """Collect the relative path of every folder below ``path`` that holds files.

    Folders whose file listing is empty, or consists of nothing but a single
    ``.DS_Store`` entry, are left out.

    Note: this function was originally produced with the help of generative AI
    (ChatGPT) and then modified by hand.

    Args:
        path (str): root folder the search starts from.

    Returns:
        list: paths relative to ``path`` (``"."`` for ``path`` itself), in the
        order ``os.walk`` visits the folders.
    """
    # File listings that do not count as "this folder has content".
    ignored_listings = ([], ['.DS_Store'])
    return [
        os.path.relpath(folder, path)
        for folder, _subfolders, files in os.walk(path)
        if files not in ignored_listings
    ]


# Sanity check: list every folder under path_PC_GITA_16k that contains files
# and print how many there are.
last_level_relative_paths = get_last_level_folders(path_PC_GITA_16k)
print(len(last_level_relative_paths))

121


## Copy folder structure from PC-GITA into empty folders: 

In [27]:
def create_new_folder(path, name):
    """
    Make a folder called ``name`` inside ``path`` unless that path is already taken.

    When the target path already exists, nothing is created and a message is
    printed instead.

    Args:
        path (str): location in which the new folder should be created
        name (str): name of the new folder

    Returns:
        str: path of the newly created folder, or False when the path already exists.
    """
    target = os.path.join(path, name)

    if not os.path.exists(target):
        os.makedirs(target)
        return target

    print(f"Already have a {name} folder in this location. Remove it if you want a new created. ")
    return False

def copy_folder_structure(source_folder, new_folder):
    """
    Mirror the directory tree found below source_folder inside new_folder.

    Only directories are recreated; no files are copied. Directories that
    already exist in new_folder are left untouched.

    Args:
        source_folder (str): path to the folder whose structure is copied.
        new_folder (str): path to the folder that receives the structure.
    """
    for parent, subfolders, _files in os.walk(source_folder):
        # Position of every subfolder relative to the root of the source tree.
        relative_targets = (
            os.path.relpath(os.path.join(parent, subfolder), source_folder)
            for subfolder in subfolders
        )
        for relative_target in relative_targets:
            os.makedirs(os.path.join(new_folder, relative_target), exist_ok=True)

# Create the output folder and mirror the PC-GITA directory tree inside it.
# create_new_folder returns False when the folder already exists, in which
# case the structure is not copied again.
PC_GITA_features = create_new_folder(personal_path_to_code, "PC-GITA-features")
if PC_GITA_features:     
    copy_folder_structure(path_PC_GITA_16k, PC_GITA_features)

### Code to create prosody features from each waveform file separately with extract_features_file (not actually used)

This code can be used if we want the features of every wav file to be in a separate csv file in the folder. That could be an advantage 
when investigating which wav files were wrongly classified by listening to them. 

In [11]:
# Create path for path 

def add_feature_content(waveform_folder, feature_folder):
    """
    Uses DisVoice to create Prosody features from all waveforms in
    waveform_folder, one file at a time, and saves them with the same
    structure in feature_folder.

    For every folder that contains wav files, each file is passed to
    prosodyf.extract_features_file and the resulting rows are combined into a
    single "Prosody.csv" in the corresponding folder below feature_folder.
    This function is not used by the rest of the notebook; it is kept as a
    reference for collecting per-file features into one table.

    Args:
        waveform_folder (str): path to folder where we have the waveforms we want to extract features from
        feature_folder (str): path to folder where we want the features located. Expected to have the
            same structure as waveform_folder; missing sub-folders are created.
    """
    for dirpath, _dirnames, filenames in os.walk(waveform_folder):
        # Only audio files are handed to DisVoice; this also skips entries such
        # as '.DS_Store'. Sorting keeps the row order reproducible.
        wav_files = sorted(name for name in filenames if name.lower().endswith(".wav"))
        if not wav_files:
            continue

        rel_path = os.path.relpath(dirpath, waveform_folder)
        feature_path = os.path.join(feature_folder, rel_path)
        os.makedirs(feature_path, exist_ok=True)

        # os.walk already visits every sub-folder, so only the files that sit
        # directly in dirpath are processed here.
        # NOTE(review): with static=True each call is expected to return a
        # one-row DataFrame - confirm against the DisVoice documentation.
        per_file_features = [
            prosodyf.extract_features_file(
                os.path.join(dirpath, filename), static=True, plots=False, fmt="csv"
            )
            for filename in wav_files
        ]
        prosody_features = pd.concat(per_file_features, ignore_index=True)

        # The table holds prosody features, so it is stored as Prosody.csv.
        prosody_file = os.path.join(feature_path, "Prosody.csv")
        prosody_features.to_csv(prosody_file, index=False)

### Code to create all features from whole folders of waveforms with extract_features_path

In [18]:

# Create folder for folder

def add_feature_content(waveform_folder, feature_folder):
    """
    Extract Phonation, Articulation and Prosody features with DisVoice, folder by folder.

    Every folder below waveform_folder that contains files (other than a lone
    '.DS_Store') is handed to the three extractors in turn, and each result is
    written as "Phonation.csv", "Articulation.csv" and "Prosody.csv" to the
    folder with the same relative path below feature_folder.

    Args:
        waveform_folder (str): path to folder where we have the waveforms we want to extract features from
        feature_folder (str): path to folder where we want the features located. Needs to have same structure as waveform_folder.
    """
    def _extract_and_save(extractor, source_dir, csv_path):
        # Run one extractor over a whole folder and store its table as csv.
        table = extractor.extract_features_path(source_dir, static=True, plots=False, fmt="csv")
        table.to_csv(csv_path, index=False)

    for current_dir, _subdirs, files in os.walk(waveform_folder):
        # Nothing to extract from empty folders or folders holding only '.DS_Store'.
        if files == [] or files == ['.DS_Store']:
            continue

        relative_dir = os.path.relpath(current_dir, waveform_folder)
        # The folder is passed with a trailing slash.
        # NOTE(review): presumably DisVoice builds file paths by plain string
        # concatenation - confirm before removing the separator.
        source_dir = os.path.join(waveform_folder, relative_dir) + "/"
        print("Working on files in ", source_dir, " ...")
        target_dir = os.path.join(feature_folder, relative_dir)

        _extract_and_save(phonationf, source_dir, os.path.join(target_dir, "Phonation.csv"))
        _extract_and_save(articulationf, source_dir, os.path.join(target_dir, "Articulation.csv"))
        _extract_and_save(prosodyf, source_dir, os.path.join(target_dir, "Prosody.csv"))
            
# PC_GITA_features is False when the "PC-GITA-features" folder already existed;
# passing False on would make os.path.join raise a TypeError inside
# add_feature_content, so the extraction only runs for a freshly created folder.
if PC_GITA_features:
    add_feature_content(path_PC_GITA_16k, PC_GITA_features)
else:
    print("Skipping feature extraction: no new PC-GITA-features folder was created.")

/localhome/studenter/malinre/project-thesis/PC-GITA-v2/PC-GITA_per_task_16000Hz/Vowels/Control/A/


Processing AVPEPUDEAC0057_a3.wav: 100%|██████████| 150/150 [00:02<00:00, 70.52it/s]


/localhome/studenter/malinre/project-thesis/PC-GITA-v2/PC-GITA_per_task_16000Hz/Vowels/Control/E/


Processing AVPEPUDEAC0057_e3.wav: 100%|██████████| 150/150 [00:02<00:00, 54.52it/s]


/localhome/studenter/malinre/project-thesis/PC-GITA-v2/PC-GITA_per_task_16000Hz/Vowels/Control/I/


Processing AVPEPUDEAC0057_i3.wav: 100%|██████████| 150/150 [00:02<00:00, 52.20it/s]


/localhome/studenter/malinre/project-thesis/PC-GITA-v2/PC-GITA_per_task_16000Hz/Vowels/Control/O/


Processing AVPEPUDEAC0057_o3.wav: 100%|██████████| 150/150 [00:02<00:00, 59.00it/s]


/localhome/studenter/malinre/project-thesis/PC-GITA-v2/PC-GITA_per_task_16000Hz/Vowels/Control/U/


Processing AVPEPUDEAC0057_u3.wav: 100%|██████████| 150/150 [00:02<00:00, 59.18it/s]


/localhome/studenter/malinre/project-thesis/PC-GITA-v2/PC-GITA_per_task_16000Hz/Vowels/Patologicas/A/


Processing AVPEPUDEA0059_a3.wav: 100%|██████████| 150/150 [00:03<00:00, 48.98it/s]


/localhome/studenter/malinre/project-thesis/PC-GITA-v2/PC-GITA_per_task_16000Hz/Vowels/Patologicas/E/


Processing AVPEPUDEA0059_e3.wav: 100%|██████████| 150/150 [00:02<00:00, 50.08it/s]


/localhome/studenter/malinre/project-thesis/PC-GITA-v2/PC-GITA_per_task_16000Hz/Vowels/Patologicas/I/


Processing AVPEPUDEA0059_i3.wav: 100%|██████████| 150/150 [00:03<00:00, 45.55it/s]


/localhome/studenter/malinre/project-thesis/PC-GITA-v2/PC-GITA_per_task_16000Hz/Vowels/Patologicas/O/


Processing AVPEPUDEA0059_o3.wav: 100%|██████████| 150/150 [00:03<00:00, 47.90it/s]


/localhome/studenter/malinre/project-thesis/PC-GITA-v2/PC-GITA_per_task_16000Hz/Vowels/Patologicas/U/


Processing AVPEPUDEA0059_u3.wav: 100%|██████████| 150/150 [00:02<00:00, 54.47it/s]
