In [None]:
import pandas, numpy, scipy, seaborn, sklearn
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy import stats

## functions & options


In [None]:
# Root of the analysis folder on the local OneDrive mount.
# NOTE: machine-specific absolute path -- adjust it when running elsewhere.
project_directory = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7/0 in_silico/Python/'

# Both directories derive from the same root so they cannot drift apart
# (the input path previously contained a stray '//').
# The trailing slash is kept because later cells build file paths by plain
# string concatenation.
input_file_directory = project_directory + '1)data_input/'
output_file_directory = project_directory + '3)output/'

# I] Dataframe settings

In [None]:
%%time
#Download the big expression data with all the transcripts
path = input_file_directory + "/xenabrowser_brut_data/all_transcripts/gtex_Kallisto_tpm.tsv"
df = pandas.read_csv(path, sep = "\t", index_col='sample')

print(df.shape)
df.head()

In [None]:
# # Take a subset of the table to practice on, and save it
# # (first 98522 rows and first 3931 columns; intended as 50% of the full table)
# df = df.iloc[:98522,:3931]

# #save to csv
# path = "/xenabrowser_brut_data/all_transcripts/"
# df.to_csv(input_file_directory+path+'gtex_Kallisto_tpm_50perc_part.tsv',sep = "\t")

In [None]:
%%time
#Download 50% of the big expression data with all the transcripts
path = input_file_directory + "/xenabrowser_brut_data/all_transcripts/gtex_Kallisto_tpm_50perc_part.tsv"
df = pandas.read_csv(path, sep = "\t")

print(df.shape)

df.set_index('Unnamed: 0', inplace = True)
df.index.name = None
df.head()

In [None]:
%%time
#Calcul to have value from log2;  2**(x)-0.001
df = df.apply(lambda x: pow(2,x)-0.001)

#Calcul to have log2+1 from value;
df = df.apply(lambda x: numpy.log2(x+1))
df.head()

In [None]:
# Print the row total (sum over every column) for the two ATG7 transcripts.
genes = ("ENST00000354449.7", "ENST00000354956.9")
g_names = ("ATG7_1", "ATG7_2")
for label, transcript_id in zip(g_names, genes):
    row_total = df.loc[transcript_id, :].sum()
    print(f'{label}: {row_total}')

In [None]:
# Threshold: total expression of the ATG7_2 transcript (ENST00000354956.9).
# It is computed from the loaded data instead of being pasted from the output
# of an earlier run, so it stays correct if the input table or the transform
# changes.
# NOTE: the filter below is strict (>), so the reference transcript itself is
# dropped; reload the data before re-running this cell.
ATG7_2_sum = df.loc["ENST00000354956.9", :].sum()

print('shape before filtering', df.shape)
# Keep only the rows whose total is strictly higher than the ATG7_2 total.
bool_higher_iso2 = df.sum(axis=1) > ATG7_2_sum
df = df[bool_higher_iso2]

print('shape after filtering', df.shape)

In [None]:
# #1%
# small_smallpart = df.iloc[:1970,:78]

In [None]:
%%time
#evaluate the goodness score for each cluster to select the best possibility.
cluster_numbers = list(range(2, 40))
number_clust_L = [] 
goodness_chs_L = []
goodness_dbi_L = []
for nb in cluster_numbers:
    kmeans_model = KMeans(n_clusters = nb, random_state=1).fit(df)
    labels = kmeans_model.labels_
    goodness_chs = metrics.calinski_harabasz_score(df, labels)
    goodness_dbi = metrics.davies_bouldin_score(df, labels)
    
    #save score in list
    number_clust_L.append(nb)
    goodness_chs_L.append(goodness_chs)
    goodness_dbi_L.append(goodness_dbi)

#convert to dict then to DF
d = dict(Number = number_clust_L, Goodness_CHS = goodness_chs_L, Goodness_DBI = goodness_dbi_L)
df = pandas.DataFrame.from_dict(d, orient='columns')
df.head()

In [None]:
#plot the Goodness on the number to determine the best one.
##calinski_harabasz_score
seaborn.lineplot(data = df, x = 'Number', y = 'Goodness_CHS', marker="o")
plt.grid(ls=':', alpha = 0.5)
print('the elbow curve is at between 6 and 12')

In [None]:
#davies_bouldin_score
seaborn.lineplot(data = df, x = 'Number', y = 'Goodness_DBI', marker="o")
plt.grid(ls=':', alpha = 0.5)
print('the plateau start at 7')

In [None]:
# 10 is the best number of cluster
kmeans_model = KMeans(n_clusters = 7, random_state=1).fit(df)
# Find what cluster for each gene
pred_cluster = kmeans_model.predict(df)
print(pred_cluster)
#add the prediction in a column
df['cluster_7'] = pred_cluster
df.head()

In [None]:
# #ATG7 is not in the list
# ATG7_1 = 'ENST00000354449.7'
# ATG7_2 = 'ENST00000354956.9'

# print(df['cluster_10'][[ATG7_1, ATG7_2]])

In [None]:
#let's take random ones
# Look up the assigned cluster for two arbitrarily chosen transcripts.
geneX = 'ENST00000552583.1'
geneY = 'ENST00000486061.1'

# Select both rows and the label column in a single .loc call.
print(df.loc[[geneX, geneY], 'cluster_7'])

In [None]:
# Written conclusion for the two transcripts compared above. As a bare string
# expression it is displayed as this cell's output.
'The two genes are not in the same cluster. They are different'