# Imports and Functions

### Data source:

*common-voice2 - Kaggle*
<br>
*https://www.kaggle.com/datasets/danielgraham1997/commonvoice2*

## Libraries

In [1]:
# Warning policy for the notebook: UserWarning is escalated to an exception,
# while FutureWarning and DeprecationWarning are silenced.
import warnings # THIS WILL KEEP PYTHON FROM PRINTING 2000 WARNINGS (IN SOME CASES)!
warnings.simplefilter('error', UserWarning)  # NOTE: 'error' makes a UserWarning raise instead of being hidden
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# AUDIO PROJECT
import pydub
from pydub import AudioSegment
from pydub.silence import split_on_silence
import librosa as lr
from librosa.display import specshow,waveshow
import IPython

import scipy
from scipy.fft import rfft, rfftfreq
from scipy.io import wavfile

# GENERAL
import pandas as pd
import statistics as st
from scipy import stats as scst
import random
import numpy as np
from itertools import combinations
import os

import matplotlib.pyplot as plt
import matplotlib.colors
from matplotlib.pyplot import figure
import matplotlib
from mpl_toolkits.mplot3d import Axes3D

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import datasets
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

import seaborn as sns
sns.set_style("darkgrid")

## Functions

In [2]:
# Module-level defaults consumed as default arguments by the helpers below.
# All paths are relative to the working directory and use backslash separators.
temp_dir_clips = 'commonvoice\\train\\clips\\'  # directory prefix used by TO_aud_path
temp_df = pd.read_csv('commonvoice\\train\\train.tsv', sep='\t')  # default frame for new_col / smush_* / plot helpers
temp_df1 = pd.read_csv('commonvoice\\test\\test.tsv', sep='\t')
temp_df2 = pd.read_csv('commonvoice\\validation\\validation.tsv', sep='\t')
temp_list_df = [temp_df, temp_df1, temp_df2]  # default frame list for new_cols (train, test, validation)
temp_aud_ext = '.wav'  # extension appended by TO_aud_path
#####


# AUDIO PROJECT
def apply_pydub(in_aud_path): # In case we forget "AudioSegment.from_wav"
    """Load the WAV file at ``in_aud_path`` with ``AudioSegment.from_wav`` and return it."""
    segment = AudioSegment.from_wav(in_aud_path)
    return segment

def apply_lr(in_aud_path): # In case we forget "lr.load"
    """Load the audio file at ``in_aud_path`` with ``lr.load`` (default settings).

    Returns whatever ``lr.load`` returns; other helpers in this file index the
    result as ``[0]`` (audio samples) and ``[1]`` (sample rate).
    """
    loaded = lr.load(in_aud_path)
    return loaded

def CountWordsText(in_str):
    """Return the number of whitespace-separated words in ``in_str``.

    Splitting on arbitrary whitespace (``str.split()`` with no argument) means
    repeated, leading or trailing spaces do not produce phantom empty "words",
    and an empty or all-blank string counts as 0 words.
    """
    return len(in_str.split())

def match_target_amplitude(in_aud_pydub, in_target_dBFS):
    """Apply the gain that moves the segment's ``dBFS`` to ``in_target_dBFS``.

    The gain is ``in_target_dBFS - in_aud_pydub.dBFS``; the result of
    ``in_aud_pydub.apply_gain`` with that gain is returned.
    """
    gain_needed = in_target_dBFS - in_aud_pydub.dBFS
    return in_aud_pydub.apply_gain(gain_needed)

def CountWordsAudio(in_aud_pydub, in_msl=15, in_st=-2, in_target_dBFS=15):
    """Count the chunks ``split_on_silence`` finds in a gain-normalized segment.

    Parameters:
        in_aud_pydub: segment to analyse (passed to ``match_target_amplitude``).
        in_msl: forwarded as ``min_silence_len``.
        in_st: forwarded as ``silence_thresh``.
        in_target_dBFS: target level used for the normalization step.

    Returns:
        The number of chunks returned by ``split_on_silence``.
    """
    normalized = match_target_amplitude(in_aud_pydub, in_target_dBFS)
    chunks = split_on_silence(normalized, min_silence_len=in_msl, silence_thresh=in_st)
    return len(chunks)

def CountWordsAudio_applyy(in_aud_path, in_msl=15, in_st=-2, in_target_dBFS=15):
    """Path-based variant of ``CountWordsAudio`` with configurable settings.

    Loads the WAV file at ``in_aud_path`` via ``apply_pydub`` and counts the
    chunks found after normalization, forwarding ``in_msl``, ``in_st`` and
    ``in_target_dBFS`` unchanged.
    """
    segment = apply_pydub(in_aud_path)
    return CountWordsAudio(segment, in_msl, in_st, in_target_dBFS)

def CountWordsAudio_apply(in_aud_path):
    """Load the WAV file at ``in_aud_path`` and run ``CountWordsAudio`` with its defaults."""
    segment = apply_pydub(in_aud_path)
    return CountWordsAudio(segment)

def TO_aud_path(in_aud_name, in_dir=temp_dir_clips, in_aud_ext=temp_aud_ext):
    """Build a clip path by concatenating directory prefix, clip name and extension.

    No separator is inserted: ``in_dir`` is expected to already end with one.
    """
    path_without_ext = in_dir + in_aud_name
    return path_without_ext + in_aud_ext


# Age-bucket label -> representative numeric age (bucket midpoint).
# 'fourties' is kept exactly as spelled in the original mapping.
_AGE_WORD_TO_NUM = {
    'teens': 15,
    'twenties': 25,
    'thirties': 35,
    'fourties': 45,
    'fifties': 55,
    'sixties': 65,
    'seventies': 75,
    'eighties': 85,
    'nineties': 95,
}


def age_TO_num(in_word):
    """Convert an age-bucket label to a representative number.

    Parameters:
        in_word: bucket label such as ``'twenties'``; any other value
            (including NaN / None for missing ages) is treated as unknown.

    Returns:
        The bucket midpoint (15, 25, ..., 95), or the sentinel 115 for
        unknown or missing labels.

    The ``'teens'`` bucket previously fell through to the 115 sentinel; it now
    maps to 15 like the other decades.
    """
    # Non-string values (NaN, None, ...) can never match a label.
    if not isinstance(in_word, str):
        return 115
    return _AGE_WORD_TO_NUM.get(in_word, 115)


def new_col(in_new, in_old, in_func, in_df=temp_df):
    """Add column ``in_new`` to ``in_df`` by applying ``in_func`` to each value of ``in_old``.

    The frame is modified in place and also returned.
    """
    source_values = in_df[in_old].to_list()
    in_df[in_new] = [in_func(value) for value in source_values]
    return in_df

def new_cols(in_new, in_old, in_list_func, in_list_df=temp_list_df): # DEPENDS on new_col
    """Run ``new_col`` on every frame of ``in_list_df``.

    Frame number ``k`` is processed with ``in_list_func[k]``, so
    ``in_list_func`` needs one function per frame. The (mutated) list of
    frames is returned.
    """
    for position, frame in enumerate(in_list_df):
        new_col(in_new, in_old, in_list_func[position], frame)
    return in_list_df


def smush_TO_mean(in_col1, in_col2, in_df=temp_df): # E.g. to plot 'gender' vs WordCountAudio
    """Compute the mean of ``in_col2`` for each distinct non-null value of ``in_col1``.

    Returns:
        ``[categories, means]`` where ``categories`` is the sorted list of
        distinct non-null ``in_col1`` values and ``means[k]`` is
        ``st.mean`` of ``in_col2`` over the rows whose ``in_col1`` equals
        ``categories[k]``.
    """
    non_null_rows = in_df.loc[pd.notnull(in_df[in_col1])]
    categories = sorted(non_null_rows[in_col1].unique())
    means = [
        st.mean(in_df.loc[in_df[in_col1] == category][in_col2])
        for category in categories
    ]
    return [categories, means]

def smush_TO_median(in_col1, in_col2, in_df=temp_df): # E.g. to plot 'gender' vs WordCountAudio
    """Compute the median of ``in_col2`` for each distinct non-null value of ``in_col1``.

    Returns:
        ``[categories, medians]`` where ``categories`` is the sorted list of
        distinct non-null ``in_col1`` values and ``medians[k]`` is
        ``st.median`` of ``in_col2`` over the rows whose ``in_col1`` equals
        ``categories[k]``.
    """
    non_null_rows = in_df.loc[pd.notnull(in_df[in_col1])]
    categories = sorted(non_null_rows[in_col1].unique())
    medians = [
        st.median(in_df.loc[in_df[in_col1] == category][in_col2])
        for category in categories
    ]
    return [categories, medians]


def Scatter(in_colx, in_coly, in_df=temp_df):
    """Scatter-plot ``in_coly`` against ``in_colx`` for rows where ``in_colx`` is not null.

    Rows are sorted by ``in_colx`` before plotting. Returns the result of
    ``plt.show()``.
    """
    plot_df = in_df.loc[pd.notnull(in_df[in_colx])].sort_values(in_colx)
    plt.scatter(plot_df[in_colx], plot_df[in_coly])
    plt.title(in_colx + ' vs ' + in_coly)
    plt.xlabel(in_colx)
    plt.ylabel(in_coly)
    return plt.show()

def PlotMean(in_colx, in_coly, in_df=temp_df):
    """Line-plot the per-category mean of ``in_coly`` (from ``smush_TO_mean``) against ``in_colx``.

    Returns the result of ``plt.show()``.
    """
    categories, means = smush_TO_mean(in_colx, in_coly, in_df)
    plt.plot(categories, means)
    plt.title(in_colx + ' vs MEAN ' + in_coly)
    plt.xlabel(in_colx)
    plt.ylabel(in_coly)
    return plt.show()

def PlotMedian(in_colx, in_coly, in_df=temp_df):
    """Line-plot the per-category median of ``in_coly`` (from ``smush_TO_median``) against ``in_colx``.

    Returns the result of ``plt.show()``.
    """
    categories, medians = smush_TO_median(in_colx, in_coly, in_df)
    plt.plot(categories, medians)
    plt.title(in_colx + ' vs MEDIAN ' + in_coly)
    plt.xlabel(in_colx)
    plt.ylabel(in_coly)
    return plt.show()

def PlotAll(in_colx, in_coly, in_df=temp_df):
    """Show the scatter, mean and median plots for one column pair, in that order.

    A blank-line separator is printed between plots and a ruler line after
    the last one. Returns the result of the final ``print`` call (``None``).
    """
    for draw in (Scatter, PlotMean):
        draw(in_colx, in_coly, in_df)
        print('\n')
    PlotMedian(in_colx, in_coly, in_df)
    return print('\n'*2, '___'*10, '\n'*2)


def func_TO_num(in_func):
    """Return the position of column ``in_func`` in ``TRim_df``, offset by the feature-column count.

    The result is the column's index in ``TRim_df.columns`` minus the number
    of columns in ``TR_features_df``. Raises ``ValueError`` (from
    ``list.index``) when the column is not present.
    """
    all_columns = TRim_df.columns.to_list()
    feature_count = len(TR_features_df.columns.to_list())
    return all_columns.index(in_func) - feature_count

def num_TO_func(in_num):
    """Return the ``TRim_df`` column name located ``in_num`` places after the feature columns.

    Inverse of ``func_TO_num``: the index used is ``in_num`` plus the number
    of columns in ``TR_features_df``.
    """
    all_columns = TRim_df.columns.to_list()
    feature_count = len(TR_features_df.columns.to_list())
    return all_columns[in_num + feature_count]


# https://musicinformationretrieval.com/tempo_estimation.html
# https://librosa.org/doc/0.9.1/beat.html
def TO_tempo(in_aud_path):
    """Estimate the tempo of the audio file at ``in_aud_path``.

    Returns the first element of the array produced by ``lr.beat.tempo``.
    """
    samples, sample_rate = lr.load(in_aud_path)
    # Audio is passed as ``y=``: positional audio arguments are deprecated in
    # librosa 0.9 and rejected with a TypeError from 0.10 on.
    return lr.beat.tempo(y=samples, sr=sample_rate)[0]


def TO_onset_detect_len(in_aud_path):
    """Return how many onsets ``lr.onset.onset_detect`` finds in the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    # ``sr`` is passed explicitly so it always matches the loaded signal.
    return len(list(lr.onset.onset_detect(y=samples, sr=sample_rate)))

def mean_diff(in_list):
    """Return the mean of the differences between consecutive elements of ``in_list``.

    Raises ``statistics.StatisticsError`` (from ``st.mean``) when ``in_list``
    has fewer than two elements, because there are no differences to average.
    """
    steps = [in_list[k + 1] - in_list[k] for k in range(len(in_list) - 1)]
    return st.mean(steps)

def TO_onset_detect_mean_diff(in_aud_path):
    """Return the mean gap between consecutive onsets detected in the audio file.

    The onsets come from ``lr.onset.onset_detect`` and the gaps are averaged
    by ``mean_diff`` (which raises if fewer than two onsets are found).
    """
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    onsets = lr.onset.onset_detect(y=samples, sr=sample_rate)
    return mean_diff(list(onsets))

def TO_onset_stren_mean(in_aud_path):
    """Return the mean of the onset-strength envelope of the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    envelope = lr.onset.onset_strength(y=samples, sr=sample_rate)
    return st.mean(list(envelope))

def TO_onset_stren_median(in_aud_path):
    """Return the median of the onset-strength envelope of the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    envelope = lr.onset.onset_strength(y=samples, sr=sample_rate)
    return st.median(list(envelope))


def TO_sp_centroid_mean(in_aud_path):
    """Return the mean of the first row of ``lr.feature.spectral_centroid`` for the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    centroid = lr.feature.spectral_centroid(y=samples, sr=sample_rate)[0]
    return st.mean(list(centroid))

def TO_sp_centroid_median(in_aud_path):
    """Return the median of the first row of ``lr.feature.spectral_centroid`` for the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    centroid = lr.feature.spectral_centroid(y=samples, sr=sample_rate)[0]
    return st.median(list(centroid))

def TO_sp_rolloff_mean(in_aud_path):
    """Return the mean of the first row of ``lr.feature.spectral_rolloff`` for the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    rolloff = lr.feature.spectral_rolloff(y=samples, sr=sample_rate)[0]
    return st.mean(list(rolloff))

def TO_sp_rolloff_median(in_aud_path):
    """Return the median of the first row of ``lr.feature.spectral_rolloff`` for the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    rolloff = lr.feature.spectral_rolloff(y=samples, sr=sample_rate)[0]
    return st.median(list(rolloff))

def TO_sp_flattness_mean(in_aud_path):
    """Return the mean of the first row of ``lr.feature.spectral_flatness`` for the audio file."""
    samples, _sample_rate = lr.load(in_aud_path)
    # Keyword argument: librosa >= 0.10 rejects positional audio input.
    flatness = lr.feature.spectral_flatness(y=samples)[0]
    return st.mean(list(flatness))

def TO_sp_flattness_median(in_aud_path):
    """Return the median of the first row of ``lr.feature.spectral_flatness`` for the audio file."""
    samples, _sample_rate = lr.load(in_aud_path)
    # Keyword argument: librosa >= 0.10 rejects positional audio input.
    flatness = lr.feature.spectral_flatness(y=samples)[0]
    return st.median(list(flatness))


def TO_polyfeats_0_mean(in_aud_path):
    """Return the mean of row 0 of ``lr.feature.poly_features`` for the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    coefficients = lr.feature.poly_features(y=samples, sr=sample_rate)[0]
    return st.mean(list(coefficients))

def TO_polyfeats_1_mean(in_aud_path):
    """Return the mean of row 1 of ``lr.feature.poly_features`` for the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    coefficients = lr.feature.poly_features(y=samples, sr=sample_rate)[1]
    return st.mean(list(coefficients))

def TO_polyfeats_0_median(in_aud_path):
    """Return the median of row 0 of ``lr.feature.poly_features`` for the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    coefficients = lr.feature.poly_features(y=samples, sr=sample_rate)[0]
    return st.median(list(coefficients))

def TO_polyfeats_1_median(in_aud_path):
    """Return the median of row 1 of ``lr.feature.poly_features`` for the audio file."""
    samples, sample_rate = lr.load(in_aud_path)
    # Keyword arguments: librosa >= 0.10 rejects positional audio input.
    coefficients = lr.feature.poly_features(y=samples, sr=sample_rate)[1]
    return st.median(list(coefficients))


# GENERAL
def Display(in_list):
    """Print every element of ``in_list`` on its own line and return ``in_list`` unchanged."""
    for element in in_list:
        print(element)
    return in_list

def div(a=0,b=1):
    """Return the true quotient ``a / b``; with the defaults this is ``0 / 1``."""
    quotient = a / b
    return quotient

def Map(in_func, in_list):
    """Map ``in_func`` over the iterables in ``in_list`` and return only the first result.

    ``in_list`` is unpacked into ``map`` as separate iterables, so ``in_func``
    receives one argument per iterable. The whole mapping is evaluated before
    the first element is returned; an empty result raises ``IndexError``.
    """
    mapped = list(map(in_func, *in_list))
    return mapped[0]


#####
# Multiplier applied to the constant per-function columns created in the loop below.
my_size=1

# Precomputed table read from a tab-separated file in the working directory.
TR_df = pd.read_csv('TR_df_DongJoanne.csv', sep='\t')

# Working frame: TR_df without the listed bookkeeping columns.
TRim_df = TR_df.drop(columns=['client_id', 'aud_path', 'sentence', 'up_votes', 'down_votes', 'age', 'num_words', 'count15n-215_aud_0', 'acc_ratio_count15n-215_0'])
TRim_df = TRim_df.copy()

# Positional split: the first 21 columns are treated as features, the remainder as "function" columns.
TR_features_df = TRim_df.iloc[:,:21]
TR_funcs_df = TRim_df.iloc[:,21:]

# Feature groups (3 + 7 + 7 + 4 = 21 column names).
TR_df0 = TR_features_df[['age_num', 'gender', 'accent']]
TR_df1 = TR_features_df[['tempo_onsets', 'num_onsets', 'tempo', 'onset_detect_len', 'onset_detect_mean_diff', 'onset_stren_mean', 'onset_stren_median']]
TR_df2 = TR_features_df[['dom_freq', 'sp_centroid_mean', 'sp_centroid_median', 'sp_rolloff_mean', 'sp_rolloff_median', 'sp_flattness_mean', 'sp_flattness_median']]
TR_df3 = TR_features_df[['polyfeats_0_mean', 'polyfeats_1_mean', 'polyfeats_0_median', 'polyfeats_1_median']]

# Each feature group side by side with all function columns.
TR_df00 = pd.concat([TR_df0,TR_funcs_df], axis=1)
TR_df11 = pd.concat([TR_df1,TR_funcs_df], axis=1)
TR_df22 = pd.concat([TR_df2,TR_funcs_df], axis=1)
TR_df33 = pd.concat([TR_df3,TR_funcs_df], axis=1)

# Append one constant column per function column, named '0', '1', ...;
# column str(i) holds my_size*i in every row (ScatColor reads these as y-coordinates).
l = len(TR_funcs_df.columns.to_list())
h = len(TR_funcs_df.iloc[:,0])
for i in range(l):
    TRim_df[str(i)] = [my_size*i]*h

# The same 21 names as the TR_df0..TR_df3 groups above, in that order.
# presumably identical to TR_features_df.columns — verify against the CSV.
features = ['age_num', 'gender', 'accent', 
            'tempo_onsets', 'num_onsets', 'tempo', 'onset_detect_len', 'onset_detect_mean_diff', 'onset_stren_mean', 'onset_stren_median',
            'dom_freq', 'sp_centroid_mean', 'sp_centroid_median', 'sp_rolloff_mean', 'sp_rolloff_median', 'sp_flattness_mean', 'sp_flattness_median',
            'polyfeats_0_mean', 'polyfeats_1_mean', 'polyfeats_0_median', 'polyfeats_1_median']


# Plot defaults, bound as default arguments of ScatColor when it is defined.
my_norm = plt.Normalize(1,1)  # NOTE(review): vmin == vmax == 1 — confirm this degenerate range is intended
my_cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ["white","black","white"])  # white -> black -> white
my_mark_size = .1
my_alpha = 1
my_marker = 'o'
my_figsize = (16,9)
my_df = TRim_df.copy()  # snapshot taken after the constant columns were added
# my_figsize = plt.rcParamsDefault["figure.figsize"]  # THIS WILL RESET THE INLINE FIGURE PLOT!  NICE!
#####


def feat_TO_categ(in_feat):
    """Return True when ``in_feat`` is one of the categorical features, else False.

    The categorical features are 'age_num', 'gender' and 'accent'.
    """
    categorical_features = ('age_num', 'gender', 'accent')
    return in_feat in categorical_features

def ScatColor(in_colx, in_df=my_df, in_norm=my_norm, in_cmap=my_cmap, in_size=my_mark_size, in_alpha=my_alpha, in_marker=my_marker, in_figsize=my_figsize):
    """Plot the function columns as colored horizontal rows, ordered by ``in_colx``, and save the figure.

    Rows of ``in_df`` with a null ``in_colx`` are dropped and the rest are
    sorted by ``in_colx``. For each of the 64 function columns one scatter row
    is drawn: x is the row rank, y is the constant column ``str(i)``, and the
    color is the value of column ``num_TO_func(i)``. Vertical guide lines are
    added: solid lines where a new category starts for categorical features
    (see ``feat_TO_categ``), dashed lines at the quartiles of the rescaled
    values otherwise. The figure is saved as
    ``'FunctionsBWPrecise_' + in_colx + '.png'`` and then shown.

    Parameters:
        in_colx: name of the column used for ordering and guide lines.
        in_df: frame to plot; must contain ``in_colx``, the constant columns
            '0'..'63' and the function columns named by ``num_TO_func``.
        in_norm, in_cmap, in_size, in_alpha, in_marker: forwarded to
            ``plt.scatter`` as ``norm``, ``cmap``, ``s``, ``alpha``, ``marker``.
        in_figsize: value stored in ``plt.rcParams["figure.figsize"]``.

    Returns:
        The result of ``plt.show()``.

    The parameters ``in_df``, ``in_norm``, ``in_cmap``, ``in_alpha`` and
    ``in_figsize`` were previously ignored in favour of module globals; they
    are now honored. With the default arguments the output is unchanged.
    """
    t_df0 = in_df.loc[pd.isnull(in_df[in_colx]) == False].copy()
    t_df0 = t_df0.sort_values(in_colx)
    t_df0['row'] = list(range(len(t_df0.iloc[:,0])))  # rank of each row after sorting
    for i in range(64):
        plt.scatter(t_df0['row'], t_df0[str(i)], c=t_df0[num_TO_func(i)], cmap=in_cmap, norm=in_norm, s=in_size, marker=in_marker, alpha=in_alpha)

    t_l = t_df0.loc[:,in_colx].to_list()
    if feat_TO_categ(in_colx):
        # One solid line at the first row of every category except the first.
        for t_categ in list(t_df0.loc[:,in_colx].unique())[1:]:
            plt.axvline(x=t_l.index(t_categ), color='k')
    else:
        # Rescale the values (shift by the minimum, scale by 2000/maximum) and
        # mark the three quartile cut points; computed once, not per line.
        t_M = max(t_l)
        t_m = min(t_l)
        t_scale = 2000/t_M
        t_v = [(t_val-t_m)*t_scale for t_val in t_l]
        for t_q in st.quantiles(t_v):
            plt.axvline(x=t_q, color='k', linestyle='--')

    plt.xlabel(in_colx.capitalize())
    plt.ylabel('Range of functions')
    plt.axis([0, 2000, -1, int(my_size*63)])
    plt.colorbar()
    # NOTE: set after drawing, so this default size applies to figures created afterwards.
    plt.rcParams["figure.figsize"] = in_figsize
    t_str = 'FunctionsBWPrecise_' + in_colx + '.png'
    plt.savefig(t_str)  # Can we figure out a way to save all of these images (effortlessly)?
    return plt.show()


#####
# Label-based (inclusive) slice of TRim_df from the column named by num_TO_func(0)
# through the column named by num_TO_func(63).
MedMaker_df = TRim_df.loc[:,num_TO_func(0):num_TO_func(63)].copy()

# Defaults bound as default arguments of get_meds when it is defined.
my_thresh = .85
my_df = MedMaker_df.copy()  # NOTE: rebinds my_df; defaults of functions defined earlier keep the previous object
#####


def get_meds(in_thresh=my_thresh, in_df=my_df):
    """For every row, return the median column position whose value lies in ``[in_thresh, 1/in_thresh]``.

    Parameters:
        in_thresh: lower bound of the accepted band; the upper bound is its
            reciprocal.
        in_df: frame whose rows are scanned.

    Returns:
        A list with one entry per row: ``int(st.median(positions))`` of the
        qualifying column positions (truncated toward zero), or ``None`` when
        no value in the row falls inside the band.
    """
    def _median_position(row_values):
        # Positions (0-based column offsets) of the values inside the band.
        hits = [
            position
            for position, value in enumerate(row_values)
            if in_thresh <= value <= 1/in_thresh
        ]
        if not hits:
            return None
        return int(st.median(hits))

    row_count = len(in_df.iloc[:,0])
    return [_median_position(in_df.iloc[row, :].to_list()) for row in range(row_count)]

#####
# Per-row median positions for two band widths: [0.30, 1/0.30] and [0.85, 1/0.85].
gm30 = get_meds(.30, my_df)
gm85 = get_meds(.85, my_df)