In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import seaborn as sns

Concatenate all DataFrames into a single frame

In [None]:
# Row count of every condition, listed in the same order in which the frames
# are concatenated into `df_all`.
seg = [
    len(frame)
    for frame in (
        df_M2_non_vis, df_M2_0p2, df_M2_0p4, df_M2_0p6,
        df_HTF_non_vis, df_HTF_0p2, df_HTF_0p4, df_HTF_0p6,
        df_SW_invivo_vis,
    )
]

In [None]:
# Cumulative end offsets: seg[i] is the exclusive end row of condition i once
# all frames are stacked, so condition i occupies rows seg[i-1]:seg[i].
# The counts are re-read from the source frames instead of accumulating `seg`
# in place: the in-place loop added the running total again on every re-run of
# this cell, silently shifting every slice boundary used further down.
seg = np.cumsum([
    len(df_M2_non_vis), len(df_M2_0p2), len(df_M2_0p4), len(df_M2_0p6),
    len(df_HTF_non_vis), len(df_HTF_0p2), len(df_HTF_0p4), len(df_HTF_0p6),
    len(df_SW_invivo_vis),
]).tolist()  # plain Python ints, usable as slice bounds and row labels

In [None]:
# Stack every condition into one frame. The order has to match the order used
# to build `seg`, because later cells slice the result by those offsets.
df_all = pd.concat(
    [
        df_M2_non_vis, df_M2_0p2, df_M2_0p4, df_M2_0p6,
        df_HTF_non_vis, df_HTF_0p2, df_HTF_0p4, df_HTF_0p6,
        df_SW_invivo_vis,
    ],
    axis=0,
)

Parameters used for PCA and k-means

In [None]:
# Candidate feature sets for PCA / k-means; `mode` is the index of the set in use.
parameters = [
    ['SDofDV', 'VCL', 'VSL', 'VAP', 'LIN', 'STR', 'WOB'],  # mode 0
    ['SDofDV', 'LIN', 'STR', 'WOB'],                       # mode 1
]
mode = 1

Normalization

In [None]:
# Min-max scaler; (0, 1) is the library default range, spelled out for the reader.
mm = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Extract the selected feature columns as a plain array (copied, so the frame
# is never aliased), then rescale them with the min-max scaler.
Allmtrx = df_all[parameters[mode]].to_numpy(copy=True)
mm_All = mm.fit_transform(Allmtrx)

PCA

In [None]:
# PCA with one component per input feature; the cell displays the fraction of
# variance explained by each component.
n_features = len(parameters[mode])
pca = PCA(n_components=n_features)
principalComponents = pca.fit_transform(mm_All)
pca.explained_variance_ratio_

In [None]:
# Scree plot: percentage of variance explained by each principal component.
importance = 100 * pca.explained_variance_ratio_
# Take the x positions from `importance` itself rather than from
# len(parameters[mode]): `pca` is rebound to a 2-component model in a later
# cell, and re-running this cell afterwards used to fail on the length mismatch.
component_numbers = range(1, len(importance) + 1)

plt.figure(figsize=(15, 8))
plt.scatter(component_numbers, importance)
plt.plot(component_numbers, importance)
for i, pct in zip(component_numbers, importance):
    # Print the rounded percentage slightly above each marker.
    plt.annotate(round(pct, 2), (i, pct), textcoords="offset points", xytext=(-5, 8), size=13)
plt.title('Scree Plot')  # was misspelled 'Screen Plot'
plt.xlabel('Principal Component number')
plt.ylabel('Eigenvalue(%)')
plt.grid()
plt.savefig('OCT_dimension', dpi=200)
plt.show()

In [None]:
# Refit PCA keeping two components and wrap the projected coordinates in a
# labelled DataFrame; the cell displays the component loadings.
pca = PCA(n_components=2)
PrincipalComponents_all = pca.fit_transform(mm_All)
principalDF_all = pd.DataFrame(
    PrincipalComponents_all,
    columns=['principal component {}'.format(rank) for rank in (1, 2)],
)
pca.components_

In [None]:
# Heat map of the absolute loading of each input feature on each retained component.
df_cm = pd.DataFrame(np.abs(pca.components_), columns=parameters[mode])
fig, ax = plt.subplots(figsize=(20, 6))
sns.heatmap(df_cm, annot=True, cmap="BuPu", ax=ax)
ax.tick_params(axis='both', labelsize=15)
ax.set_title('PCA', fontsize='xx-large')
# plt.savefig('OCT_factorAnalysis.png', dpi=200)
plt.show()

In [None]:
# Split the projected points back into one array per condition, using the
# cumulative offsets in `seg` (condition i occupies rows seg[i-1]:seg[i]).
# Only the two PC columns are taken: a later cell appends the original feature
# columns to `principalDF_all`, and re-running this cell after that used to
# produce arrays containing every column instead of just the PC coordinates.
pc_xy = principalDF_all[['principal component 1', 'principal component 2']].to_numpy()
mm_pca_M2_0 = pc_xy[:seg[0]].copy()         #0
mm_pca_M2_2 = pc_xy[seg[0]:seg[1]].copy()   #1
mm_pca_M2_4 = pc_xy[seg[1]:seg[2]].copy()   #2
mm_pca_M2_6 = pc_xy[seg[2]:seg[3]].copy()   #3
mm_pca_HTF_0 = pc_xy[seg[3]:seg[4]].copy()  #4
mm_pca_HTF_2 = pc_xy[seg[4]:seg[5]].copy()  #5
mm_pca_HTF_4 = pc_xy[seg[5]:seg[6]].copy()  #6
mm_pca_HTF_6 = pc_xy[seg[6]:seg[7]].copy()  #7
mm_pca_SW = pc_xy[seg[7]:].copy()           #8

In [None]:
# One entry per condition: [PC coordinates, first row, end row, legend label].
# Entries 0 and 4 (the M2 and HTF reference groups) carry no label.
plotList = [
    [mm_pca_M2_0, 0, seg[0]],
    [mm_pca_M2_2, seg[0], seg[1], '3.5 mPa·s M2'],
    [mm_pca_M2_4, seg[1], seg[2], '10.5 mPa·s M2'],
    [mm_pca_M2_6, seg[2], seg[3], '29.4 mPa·s M2'],
    [mm_pca_HTF_0, seg[3], seg[4]],
    [mm_pca_HTF_2, seg[4], seg[5], '3.5 mPa·s HTF'],
    [mm_pca_HTF_4, seg[5], seg[6], '10.5 mPa·s HTF'],
    [mm_pca_HTF_6, seg[6], seg[7], '29.4 mPa·s HTF'],
    [mm_pca_SW, seg[7], seg[8], 'Vivo'],
]

In [None]:
# Put the PC coordinates and the original measurements side by side.
# The index is reset first because concat(axis=1) aligns rows on index labels,
# and the stacked `df_all` still carries each source frame's own labels.
# Selecting only the two PC columns keeps this cell idempotent: re-running it
# used to append another copy of every `df_all` column.
df_all = df_all.reset_index(drop=True)
principalDF_all = pd.concat(
    [principalDF_all[['principal component 1', 'principal component 2']], df_all],
    axis=1,
)

In [None]:
def _scatter_by_cluster(ax, points, labels, color_cluster0, color_cluster1):
    """Scatter 2-D ``points`` in two colours, split on the k-means label.

    Args:
        ax: Matplotlib axes to draw on.
        points: Array whose columns 0 and 1 are the PC1 / PC2 coordinates.
        labels: Cluster label for each row of ``points``.
        color_cluster0: Colour for rows whose label is not 1.
        color_cluster1: Colour for rows whose label is 1.

    Returns:
        ``(handle_cluster0, handle_cluster1)``. Both handles are always
        created, even when a group is empty, so they are safe legend entries.
    """
    in_cluster1 = np.asarray(labels) == 1
    handle0 = ax.scatter(points[~in_cluster1, 0], points[~in_cluster1, 1], c=color_cluster0)
    handle1 = ax.scatter(points[in_cluster1, 0], points[in_cluster1, 1], c=color_cluster1)
    return handle0, handle1


# The reference clustering is fitted on the HTF baseline only and does not
# depend on the condition being plotted, so it is fitted once, outside the loop.
# NOTE(review): cluster 1 is reported as "hyperactivated" below, but k-means
# label numbering is arbitrary - confirm the mapping against the data.
kmeans = KMeans(n_clusters=2, random_state=0).fit(mm_pca_HTF_0)

# One figure per labelled condition (entries 0 and 4 of plotList are the
# unlabelled reference groups and are drawn as background on every figure).
for k in [1, 2, 3, 5, 6, 7, 8]:
    points = plotList[k][0]
    condition = plotList[k][3]

    fig = plt.figure(figsize=(9, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 Components PCA', fontsize=20)

    # M2 baseline. The previous `.loc[:seg[0]]` slice was label-inclusive and
    # therefore also drew the first row of the next condition as baseline.
    kdot = ax.scatter(mm_pca_M2_0[:, 0], mm_pca_M2_0[:, 1], c='k')

    # One vectorised scatter per group replaces one scatter call per point,
    # and guarantees every legend handle belongs to the current figure.
    gdot, bdot = _scatter_by_cluster(ax, mm_pca_HTF_0, kmeans.labels_, 'g', 'b')
    pdot, ydot = _scatter_by_cluster(ax, points, kmeans.predict(points), 'pink', 'yellow')

    ax.legend(
        [kdot, gdot, bdot, pdot, ydot],
        [
            'Non-hyperactivated sperm in M2',
            'Non-hyperactivated sperm in HTF',
            'Hyperactivated sperm in HTF',
            'Non-hyperactivated sperm in {}'.format(condition),
            'Hyperactivated sperm in {}'.format(condition),
        ],
    )
    ax.grid()
    plt.autoscale()
    # NOTE(review): every iteration writes to the same path, so only the last
    # figure survives on disk - consider a per-condition file name.
    plt.savefig("outputpath", bbox_inches='tight', dpi=100)
    plt.show()
    plt.close(fig)  # release the figure; several are created in this loop