In [None]:
import pandas, numpy, scipy, seaborn, sklearn
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.cluster import KMeans
from sklearn import metrics

## functions & options


In [None]:
# Base directories for input data and generated output.
# NOTE(review): these are absolute, machine-specific paths; anyone else running
# the notebook has to edit them. The trailing '/' is kept on purpose so that
# existing string concatenations elsewhere in the notebook keep working.
# Fixed: the input path contained a stray doubled slash ("PhD ATG7//0 in_silico"),
# which was inconsistent with the output path.
input_file_directory = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7/0 in_silico/Python/1)data_input/'
output_file_directory = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7/0 in_silico/Python/3)output/'

# I] Dataframe settings

In [None]:
%%time
# Load the 10% subsample of the transcript-level expression table
# (tab-separated file; the 'sample' column holds the row identifiers).
path = input_file_directory + "/xenabrowser_brut_data/all_transcripts/gtex_Kallisto_tpm_10perc.tsv"
df = pandas.read_csv(path, sep="\t")

# Shape is reported before 'sample' is moved into the index,
# so the column count still includes it here.
print(df.shape)

# Use 'sample' as the row index and drop the index name so it does not
# show up as an extra header row when the frame is displayed.
df = df.set_index('sample').rename_axis(None)
df.head()

In [None]:
# Re-express the values from log2(tpm + 0.001) to log2(tpm + 1).
# The minimum of the whole table is printed at each step as a sanity check.
print('log2(tpm+0.001) ', df.min().min())

# Undo the log2.
# NOTE(review): given the input is log2(tpm + 0.001), 2**df is tpm + 0.001,
# not tpm -- the 0.001 pseudocount is not subtracted here, so the final
# values are log2(tpm + 1.001). Confirm whether this is intended.
linear_values = 2**df
print('tpm ', linear_values.min().min())

# Re-apply log2 with a pseudocount of 1.
df = numpy.log2(linear_values + 1)
print('log2(tpm+1) ', df.min().min())
df.head()

In [None]:
# Standardise the table: scipy's zscore is applied to each column
# independently (axis=0 is DataFrame.apply's default, spelled out here).
df = df.apply(stats.zscore, axis=0)
df.head()

In [None]:
# Print the row sum of each of the two ATG7 isoform transcripts.
# At this point df holds z-scored values, so these are sums of z-scores
# across all columns, not sums of raw expression.
genes = ("ENST00000354449.7", "ENST00000354956.9")
g_names = ("ATG7_1", "ATG7_2")
for label, transcript_id in zip(g_names, genes):
    print(f'{label}: {df.loc[transcript_id, :].sum()}')

In [None]:
# Filter the table down to transcripts whose summed value across all columns
# is at least as high as that of the ATG7(2) isoform.
#
# The threshold used to be a hand-copied, rounded constant (115.21). It is now
# computed from the data so it cannot go stale when the input file or the
# preprocessing changes.
ATG7_2_ID = "ENST00000354956.9"
ATG7_2_sum = df.loc[ATG7_2_ID, :].sum()
print('ATG7(2) row sum used as threshold:', ATG7_2_sum)

print('shape before filtering', df.shape)
# Boolean mask over rows. `>=` (rather than `>`) guarantees the reference
# transcript itself is kept, which later cells rely on when they look up its
# cluster. With the exact threshold a strict `>` would always drop it.
bool_higher_iso2 = df.sum(axis=1) >= ATG7_2_sum
df = df[bool_higher_iso2]

print('shape after filtering', df.shape)

In [None]:
# Quick visual overview of the filtered table before clustering.
seaborn.heatmap(data=df)

In [None]:
%%time
# Sweep the number of clusters and record three clustering-quality scores for
# each KMeans fit, so the best cluster count can be chosen from the curves.
cluster_numbers = list(range(2, 100))

# One (k, CHS, DBS, silhouette) tuple per fitted model.
sweep_rows = []
for nb in cluster_numbers:
    kmeans_model = KMeans(n_clusters=nb, random_state=1).fit(df)
    labels = kmeans_model.labels_
    sweep_rows.append((
        nb,
        metrics.calinski_harabasz_score(df, labels),
        metrics.davies_bouldin_score(df, labels),
        # The silhouette is computed with the cosine metric, as before.
        metrics.silhouette_score(df, labels, metric='cosine'),
    ))

# Unpack into the per-score lists.
number_clust_L, goodness_chs_L, goodness_dbs_L, goodness_ss_L = (
    list(column) for column in zip(*sweep_rows)
)

# Assemble the scores into a tidy DataFrame, one row per cluster count.
d = {
    'Number': number_clust_L,
    'Goodness_CHS': goodness_chs_L,
    'Goodness_DBS': goodness_dbs_L,
    'Goodness_SS': goodness_ss_L,
}
df_goodness = pandas.DataFrame(d)

df_goodness.head()

In [None]:
# Calinski-Harabasz score as a function of the number of clusters,
# used to pick the cluster count by eye.
ax = seaborn.lineplot(x='Number', y='Goodness_CHS', data=df_goodness, marker="o")
ax.grid(ls=':', alpha=0.5)
ax.set_title('calinski_harabasz')
# Reading of the curve recorded by the author.
print('the elbow curve is at between 8 and 11')


In [None]:
# Davies-Bouldin score as a function of the number of clusters.
ax = seaborn.lineplot(x='Number', y='Goodness_DBS', data=df_goodness, marker="o")
ax.grid(ls=':', alpha=0.5)
ax.set_title('davies_bouldin')
# Reading of the curve recorded by the author.
print('the plateau start at 9')

In [None]:
# Silhouette score as a function of the number of clusters.
ax = seaborn.lineplot(x='Number', y='Goodness_SS', data=df_goodness, marker="o")
ax.grid(ls=':', alpha=0.5)
ax.set_title('silhouette')
# Reading of the curve recorded by the author.
print('the plateau start at 6')

In [None]:
# Final clustering. The code uses 9 clusters (the earlier comment said 10,
# which did not match the value actually passed to KMeans).
n_clusters_final = 9

# Fit on the expression columns only. If this cell is re-run, df already
# carries the 'cluster' column added below; dropping it here keeps the previous
# labels from leaking into the features and makes the cell safe to re-run.
features = df.drop(columns='cluster', errors='ignore')

kmeans_model = KMeans(n_clusters=n_clusters_final, random_state=1, verbose=True).fit(features)
labels = kmeans_model.labels_

# Cluster assigned to each row (transcript) of the table.
pred_cluster = kmeans_model.predict(features)
print(pred_cluster)

# Store the assignment as a 'cluster' column. assign() returns a new frame,
# which avoids writing into a frame that was produced by boolean filtering.
df = df.assign(cluster=pred_cluster)
df.head()

In [None]:
# Value range of the expression data, used to choose the colour-map scale.
# The integer 'cluster' label column is excluded: it is not an expression
# value and would otherwise distort the reported min/max.
expression_values = df.drop(columns='cluster', errors='ignore')
print('max is', expression_values.max().max())
print('min is', expression_values.min().min())

In [None]:
# Look up which cluster each of the two ATG7 isoform transcripts was assigned to.
# NOTE(review): the previous comment claimed ATG7 "is not in the list", but this
# lookup requires both transcript IDs to be present in df's index.
ATG7_1 = 'ENST00000354449.7'
ATG7_2 = 'ENST00000354956.9'

print(df.loc[[ATG7_1, ATG7_2], 'cluster'])

In [None]:
# Interpretation of the cluster lookup, kept as the cell's displayed value.
'The two genes are not in the same cluster. They are different'

In [None]:
# Preview a 9-colour HLS palette (one colour per cluster).
seaborn.hls_palette(n_colors=9)

In [None]:
# Order the rows by cluster label so members of a cluster sit next to each other.
df = df.sort_values(by='cluster')
# Move the labels out of the table: keep them as a separate Series (indexed like
# df) and leave only the expression columns in df.
# Note: this cell expects a 'cluster' column and will fail if run a second time.
cluster_uniq = df['cluster']
df = df.drop(columns='cluster')

In [None]:
# Heatmap of the table with rows already ordered by KMeans cluster; each row is
# annotated with the colour of its cluster.

# One colour per distinct cluster label. The palette size follows the number of
# labels actually present, rather than a hard-coded 9, so zip() can never
# silently leave a cluster without a colour.
cluster_ids = cluster_uniq.unique()
lut = dict(zip(cluster_ids, seaborn.hls_palette(len(cluster_ids))))
row_colors_clust = cluster_uniq.map(lut)

# Rows/columns are NOT re-clustered here (row_cluster/col_cluster are False),
# so the existing row order is what gets drawn.
# NOTE(review): with both clusterings disabled, `method`/`metric` should have no
# effect; they are kept unchanged -- confirm and remove if unneeded.
# The colour-bar label now states that the plotted values are z-scores of
# log2(tpm+1): the table was standardised earlier in the notebook.
seaborn.clustermap(df, cmap='bwr', col_cluster=False, row_cluster=False,
                   row_colors=row_colors_clust,
                   method="centroid", metric='cosine',
                   yticklabels=False, xticklabels=False,
                   cbar_kws={'label': 'expression [z-score of log2(tpm+1)]'})