# <font color= #900c3f >MUIT - TSA 2018</font>
## UNSUPERVISED Acoustic Analysis of Speech from Sleep Apnea Patients



===================================================================
## Extracting data: $Formants$ from sustained vowel /i/
<br>

- ### UPLOAD: OSA_Excel.zip file from [TSA GitHub](https://github.com/MUIT-TSA/Python) 

In [None]:
# ! unzip /resources/data/audio/OSA/OSA_Excel.zip -d /resources/data/audio/OSA/

### <font color=  #dc7633  >Now we will use Pandas DataFrames to read and analyze data</font>

In [None]:
import pandas as pd 

In [None]:
! pip install xlrd

### ... read only one file...

In [None]:
# Load one of the OSA spreadsheets to inspect its layout.
file = '/resources/data/audio/OSA/OSA_188.xls'
# The `sheetname` keyword was renamed to `sheet_name` in pandas and the old
# spelling was later removed, so use the current name.
df1 = pd.read_excel(file, sheet_name='Sheet1')

In [None]:
df1

### ... then read all files and concatenate DataFrames

In [None]:
## Get a list with all directory entries in /resources/data/audio/OSA/
## (filtering to .xls files happens when the files are read)

import os

exPath='/resources/data/audio/OSA/'
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the concatenated data (and therefore the sample indices shown in later
# plots) reproducible across runs and machines.
fileList=sorted(os.listdir(exPath))


### ... read all the files in the list
### and concatenate all dataframes into a df_OSA dataframe

In [None]:
# Read every .xls workbook in the listing, then concatenate them in ONE call.
# Growing a DataFrame with pd.concat inside the loop re-copies all previously
# accumulated rows on every iteration.
# `sheet_name` is the current spelling of the removed `sheetname` keyword.
frames = [
    pd.read_excel(exPath+exFile, sheet_name='Sheet1')
    for exFile in fileList
    if exFile.endswith('.xls')
]

# pd.concat raises on an empty list, so keep the original behaviour of ending
# up with an empty DataFrame when no .xls file is found.
df_OSA = pd.concat(frames, axis=0) if frames else pd.DataFrame()

# <font color=  #e53714 >1.- PCA Analysis</font>

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Scaling the data before PCA is generally a good idea:
# fit the scaler on df_OSA and transform it in a single step.
scaler = StandardScaler()
m_OSA = scaler.fit_transform(df_OSA)

In [None]:
m_OSA.std(axis=0)

In [None]:
pca = PCA()
OSA_new = pca.fit_transform(m_OSA)

In [None]:
OSA_new.shape

In [None]:
## project: PC Rotations

# 0,1 denote PC1 and PC2; change values for other PCs
xvector = pca.components_[0]
yvector = pca.components_[1]

In [None]:
xvector.shape

In [None]:
labels=list(df_OSA.columns.values)
labels

In [None]:
xvector

In [None]:
yvector

In [None]:
# Project the scaled data once and unpack the first two score columns
# (previously the full projection was computed twice, once per column).
xs, ys = pca.transform(m_OSA)[:, :2].T

In [None]:
## visualize variance explained

import matplotlib.pyplot as plt
%matplotlib inline

# Cumulative fraction of variance explained by the first k components.
cum_var = pca.explained_variance_ratio_.cumsum()
n_pcs = len(cum_var)

# Plot against 1-based component numbers so the x axis agrees with its
# "PC number" label (a bare plt.plot(y) starts the x axis at 0).
plt.plot(range(1, n_pcs + 1), cum_var)

# Derive the x range from the number of components instead of hard-coding it.
plt.xlim(0, n_pcs)
plt.ylim(0,1)
plt.xlabel("PC number")
plt.ylabel("Variance explained")

plt.grid()

In [None]:
xs.shape

## ...obtaining projections (i.e. scores)

In [None]:
import numpy as np

# Manual projection of the scaled data onto the first principal axis;
# the next cells compare it with the scores obtained from the PCA object.
xs1 = m_OSA @ xvector

In [None]:
xs1[0:5]

In [None]:
xs[0:5]

In [None]:
OSA_new[0:5,0]

## Approaching a Biplot

In [None]:
labels=list(df_OSA.columns.values)
labels

In [None]:
## visualize projections

import matplotlib.pyplot as plt
%matplotlib inline



def myplot(score,coeff,labels=None):
    """Draw a biplot-style figure of the first two principal components.

    The scores are scattered after dividing each axis by its own range, and
    one red arrow per row of ``coeff`` is drawn from the origin, annotated in
    green with the matching label.

    Parameters
    ----------
    score : 2-D array; columns 0 and 1 are used as the x and y scores.
    coeff : 2-D array; row ``i`` gives the (x, y) tip of the arrow for
        variable ``i`` (columns 0 and 1 are used).
    labels : optional sequence indexed by variable number; when ``None`` the
        arrows are annotated "Var1", "Var2", ...

    Returns
    -------
    None. The figure is created through the module-level ``plt``.
    """
    pc1 = score[:,0]
    pc2 = score[:,1]
    n_vars = coeff.shape[0]

    # Rescale each axis by its range so the points fit the fixed [-1, 1] window.
    x_factor = 1.0/(pc1.max() - pc1.min())
    y_factor = 1.0/(pc2.max() - pc2.min())

    plt.figure(figsize=(18,12))
    plt.scatter(pc1 * x_factor, pc2 * y_factor)

    for k in range(n_vars):
        dx, dy = coeff[k,0], coeff[k,1]
        plt.arrow(0, 0, dx, dy, color = 'r', alpha = 0.5)
        # Fall back to a generic 1-based name when no labels are supplied.
        caption = "Var"+str(k+1) if labels is None else labels[k]
        # Place the text slightly beyond the arrow tip.
        plt.text(dx * 1.15, dy * 1.15, caption, color = 'g', ha = 'center', va = 'center')

    plt.xlim(-1,1)
    plt.ylim(-1,1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))

    plt.grid()


# Draw the biplot from the first two score columns and the first two
# components, transposed so that each row corresponds to one variable.

myplot(OSA_new[:, :2], pca.components_[:2].T, labels)
    
    

# <font color=  #e53714 >2.- PCA Clustering</font>

https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/


- ## Linkage, see:

https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Build the hierarchical-clustering linkage matrix using the 'ward' method.
# NOTE(review): this clusters the raw df_OSA values, not the standardized
# matrix used for the PCA above — confirm that this is intended.
Z = linkage(df_OSA, method='ward')


In [None]:

# Plot the full dendrogram of the linkage matrix Z (no truncation).
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()



- #  **Truncated dendrogram**: shows only the last $p$ of all the merges.

In [None]:
# Plot a truncated dendrogram of the same linkage matrix Z.
plt.figure(figsize=(15, 10))
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index or (cluster size)')
plt.ylabel('distance')
dendrogram(
    Z,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=6,  # the value of p used by the 'lastp' truncation
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # to get a distribution impression in truncated branches
)
plt.show()

- # Limit the maximum number of clusters (a maximum distance could be used instead)

In [None]:
from scipy.cluster.hierarchy import fcluster
# NOTE: despite its name, max_d is not a distance here: it is passed to
# fcluster together with criterion='maxclust', i.e. it limits the number of
# flat clusters cut from the linkage matrix Z.
max_d = 2
clusters = fcluster(Z, max_d, criterion='maxclust')



In [None]:
clusters

In [None]:
# See the two clusters in the space of the first two PCs

plt.figure(figsize=(10, 8))
plt.scatter(OSA_new[:,0], OSA_new[:,1], c=clusters, cmap='prism')  # colour each point by its cluster id
# Title and axis labels so the figure can be read on its own.
plt.title('Hierarchical clusters in two-PC space')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()



### ...see scatter by Gender

In [None]:
# See Gender labels in two-PC space

plt.figure(figsize=(10, 8))
plt.scatter(OSA_new[:,0], OSA_new[:,1], c=df_OSA['Gender'], cmap='prism')  # colour each point by its Gender value
# Title and axis labels so the figure can be read on its own.
plt.title('Gender in two-PC space')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()



# Activity: Analyze TwoFormants_male dataset

In [None]:
file = '/resources/data/audio/TwoFormants_male.xlsx'
# The assignment had no right-hand side (a SyntaxError on Run All); load the
# workbook with pandas, which reads the first sheet by default.
# NOTE(review): reading .xlsx may require the openpyxl engine on recent pandas
# versions — confirm in the course environment.
df_Formants = pd.read_excel(file)

In [None]:
df_Formants.head(5)

## Try clusters using F1 and F2


In [None]:

# calculate full dendrogram



In [None]:
# Plot a truncated Dendrogram



In [None]:
# Scatter plot with 5 clusters


## ... analyze results

In [None]:
# Vowels ??