In [1]:
import pandas as pd
from overlap_functions import overlap, overlap_features

In [2]:
# Directory holding the reference-interval (RI) CSV files
ri_data_path = "../data/reference_data/"


def _read_ris(file_name):
    """Read one RI CSV from ``ri_data_path`` and keep its first three columns.

    The first CSV column is used as the index.
    """
    return pd.read_csv(ri_data_path + file_name, index_col=0).iloc[:, :3]


# One frame per dataset and sex - add further datasets here as needed
clac_female_ris = _read_ris("clac_female_reference_intervals.csv")
clac_male_ris = _read_ris("clac_male_reference_intervals.csv")

common_female_ris = _read_ris("common_female_reference_intervals.csv")
common_male_ris = _read_ris("common_male_reference_intervals.csv")

In [3]:
# Lower-case the feature names in every RI table so casing is consistent
# across the datasets.
for ris in (clac_female_ris, clac_male_ris, common_female_ris, common_male_ris):
    ris['feature'] = ris['feature'].str.lower()

In [4]:
# Sanity check: the table is expected to hold three columns -
# feature, RI_lower_limit and RI_upper_limit.
clac_female_ris.head(n=5)

Unnamed: 0,feature,RI_lower_limit,RI_upper_limit
0,speaking_rate,-2.180159,1.75688
1,articulation_rate,-2.061043,2.125897
2,phonation_ratio,-2.372067,1.778188
3,pause_rate,-2.130281,2.17644
4,mean_pause_duration,-3.362959,1.750304


In [5]:
# Features whose overlap will be computed; names must be all lowercase
speech_features = [
    'speaking_rate',
    'mean_f0',
    'cepstral_peak_prominence',
    'mean_f1_loc',
    'spectral_gravity',
]

In [6]:
# Each comparison dict maps the name of a comparison column to a list of the
# TWO dataframes that the comparison is generated for.
# Add another key/value pair in the same pattern to compare more datasets.

# female
female_comparison_dict = {
    "clac-cv": [clac_female_ris, common_female_ris],
}
female_comparison_list = list(female_comparison_dict)

# male
male_comparison_dict = {
    "clac-cv": [clac_male_ris, common_male_ris],
}
male_comparison_list = list(male_comparison_dict)


In [7]:
# Result frames: one row per feature, with the overlap columns still to be added
female_overlap = pd.DataFrame(speech_features, columns=["feature"])
male_overlap = pd.DataFrame(speech_features, columns=["feature"])

In [8]:
# Run overlap_features once per comparison pair; the running result frame is
# passed in and replaced by the returned frame each time.
for comparison, ri_pair in female_comparison_dict.items():
    first_ris, second_ris = ri_pair[0], ri_pair[1]
    female_overlap = overlap_features(first_ris, second_ris, speech_features, female_overlap, comparison)

for comparison, ri_pair in male_comparison_dict.items():
    first_ris, second_ris = ri_pair[0], ri_pair[1]
    male_overlap = overlap_features(first_ris, second_ris, speech_features, male_overlap, comparison)

In [9]:
# Build a one-row summary frame per sex: the row is labelled "mean" and holds
# the column mean of every comparison column.
male_summary_row = pd.DataFrame({"feature": ["mean"]}).assign(
    **{name: male_overlap[name].mean(axis=0) for name in male_comparison_list}
)
female_summary_row = pd.DataFrame({"feature": ["mean"]}).assign(
    **{name: female_overlap[name].mean(axis=0) for name in female_comparison_list}
)

In [10]:
# Display the one-row summary holding the mean overlap per male comparison
male_summary_row

Unnamed: 0,feature,clac-cv
0,mean,93.2


In [11]:
# Append the summary row to each overlap table, then add a "feature_mean"
# column: the row-wise mean across all comparison columns, rounded to 1 dp.
#
# - Any existing "mean" row is dropped first, so re-running this cell does not
#   stack duplicate summary rows (a no-op on the first run).
# - ignore_index=True gives the appended row its own index label; without it
#   the summary row keeps label 0 and collides with the first feature row.
male_overlap = pd.concat(
    [male_overlap[male_overlap["feature"] != "mean"], male_summary_row],
    ignore_index=True,
)
male_overlap["feature_mean"] = male_overlap[male_comparison_list].mean(axis=1).round(1)

female_overlap = pd.concat(
    [female_overlap[female_overlap["feature"] != "mean"], female_summary_row],
    ignore_index=True,
)
female_overlap["feature_mean"] = female_overlap[female_comparison_list].mean(axis=1).round(1)

# Results

In [12]:
# Female results: per-feature overlap, the appended "mean" row and feature_mean
female_overlap

Unnamed: 0,feature,clac-cv,feature_mean
0,speaking_rate,95.7,95.7
1,mean_f0,91.8,91.8
2,cepstral_peak_prominence,94.2,94.2
3,mean_f1_loc,84.5,84.5
4,spectral_gravity,82.4,82.4
0,mean,89.72,89.7


In [13]:
# Male results: per-feature overlap, the appended "mean" row and feature_mean
male_overlap

Unnamed: 0,feature,clac-cv,feature_mean
0,speaking_rate,94.2,94.2
1,mean_f0,93.9,93.9
2,cepstral_peak_prominence,98.6,98.6
3,mean_f1_loc,90.6,90.6
4,spectral_gravity,88.7,88.7
0,mean,93.2,93.2
