In [1]:
import os
import pandas as pd
import numpy as np
import glob
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Directory holding the per-organ expression CSV files (relative to this notebook).
data_folder = '../dataCSV/'

# Earlier organ selection, kept for reference.
# NOTE(review): the Organ labels in the outputs saved with this notebook
# (artery_coronary ... colon_sigmoid) appear to come from this list rather than
# the active one below - re-run all cells to refresh them.
# organs = ["artery_coronary", "muscle_skeletal", "whole_blood", "skin_sun_exposed_lower_leg", "lung", "liver", "heart_left_ventricle", "nerve_tibial", "artery_aorta", "colon_transverse", "colon_sigmoid"]

# Organs that are loaded, clustered and plotted by the cells below.
organs = [
    "pituitary",
    "kidney_cortex",
    "pancreas",
    "thyroid",
    "adipose_subcutaneous",
    "nerve_tibial",
    "brain_caudate_basal_ganglia",
]

In [3]:
def load_data_for_organ(organ_name, folder=None):
    """Load one organ's expression CSV and return it with one row per sample.

    The file is looked up as ``gene_tpm_2017-06-05_v8_<organ_name>.csv`` inside
    ``folder``. The first three CSV columns are dropped (presumably gene
    identifier/annotation columns - confirm against the files) and the
    remaining columns are transposed, so each original column becomes a row.

    Parameters
    ----------
    organ_name : str
        Organ/tissue suffix used in the CSV file name.
    folder : str, optional
        Directory to search. Defaults to the notebook-level ``data_folder``.

    Returns
    -------
    pandas.DataFrame
        Transposed table: index holds the original column labels (from the
        fourth column on), columns hold the original row positions.

    Raises
    ------
    FileNotFoundError
        If no CSV for ``organ_name`` exists in ``folder``.
    """
    if folder is None:
        # Fall back to the directory configured at the top of the notebook.
        folder = data_folder
    pattern = os.path.join(folder, f'gene_tpm_2017-06-05_v8_{organ_name}.csv')
    matches = glob.glob(pattern)
    if not matches:
        # Previously surfaced as an opaque "list index out of range".
        raise FileNotFoundError(
            f'No expression CSV found for organ {organ_name!r} (looked for {pattern})'
        )
    df = pd.read_csv(matches[0])
    expression_data = df.iloc[:, 3:].transpose()
    return expression_data

In [4]:
# Load every selected organ once; keys keep the order of `organs`.
organ_data = {name: load_data_for_organ(name) for name in organs}

In [5]:
# Stack the per-organ frames row-wise, in the insertion order of organ_data.
combined_data = pd.concat(list(organ_data.values()))
# Replace missing values with the mean of the column they sit in.
combined_data = combined_data.fillna(combined_data.mean())
combined_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56190,56191,56192,56193,56194,56195,56196,56197,56198,56199
GTEX-1117F-0626-SM-5N9CS,0.0000,11.070,0.0,0.0676,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,11990.0,0.0000,0.0000,0.000,1237.0,817.3,0.000,8799.0,0.000,0.000
GTEX-1122O-0426-SM-5H12G,0.0311,2.469,0.0,0.0000,0.0000,0.0640,0.0572,0.0000,0.0000,0.0000,...,8671.0,0.7789,0.0000,0.000,6198.0,11830.0,59.980,15090.0,1.629,2.371
GTEX-117YX-1726-SM-5GZZS,0.0000,1.926,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0602,0.0238,...,23350.0,1.1950,0.0000,0.000,6379.0,12580.0,44.230,13700.0,0.000,2.426
GTEX-11DXX-0826-SM-5GZZP,0.0000,3.866,0.0,0.0529,0.0000,0.0000,0.0488,0.0000,0.0000,0.0397,...,11050.0,0.6643,0.7769,0.000,5009.0,8159.0,30.560,14400.0,1.389,8.089
GTEX-11DXY-1126-SM-5987W,0.0552,4.237,0.0,0.0000,0.0391,0.0000,0.0507,0.0000,0.0000,0.0000,...,11610.0,1.3820,0.8082,1.343,2083.0,1604.0,3.455,11330.0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZUA1-0011-R5b-SM-51MTG,0.0000,1.607,0.0,0.1174,0.0000,0.0000,0.0721,0.0738,0.0000,0.0196,...,43640.0,56.4600,3.4450,32.440,8397.0,9740.0,24.060,36010.0,3.080,33.380
GTEX-ZV68-0011-R5a-SM-4YCDW,0.0000,1.303,0.0,0.0333,0.0473,0.1373,0.0920,0.0628,0.0280,0.0333,...,66420.0,11.2800,2.9320,18.270,14870.0,17050.0,66.020,45940.0,7.427,12.720
GTEX-ZVT3-0011-R5a-SM-51MSI,0.0000,1.003,0.0,0.0000,0.0000,0.0528,0.0708,0.0242,0.0000,0.0000,...,63650.0,10.9300,4.1370,6.563,14830.0,15320.0,617.800,50480.0,3.362,17.620
GTEX-ZVZQ-0011-R5b-SM-57WDC,0.0326,1.864,0.0,0.0325,0.0231,0.0335,0.1197,0.0613,0.0274,0.0163,...,59800.0,43.6300,8.1070,6.341,15350.0,17190.0,115.000,49720.0,389.200,13.650


In [6]:
# One organ name per row of each organ's frame, in the same order the frames
# were stored in organ_data.
organ_labels = [
    organ_name
    for organ_name, frame in organ_data.items()
    for _ in range(frame.shape[0])
]

In [7]:
# K-means with one cluster per selected organ.
n_clusters = len(organs)

# 'Cluster' (added below) and 'Organ' (added in the next cell) are stored on
# combined_data itself. Exclude them from the fit so that re-running this cell
# does not feed earlier labels back in as features (or fail on the string
# 'Organ' column). On a fresh run neither column exists and the frame is used
# as-is, without an extra copy.
label_columns = [col for col in ('Cluster', 'Organ') if col in combined_data.columns]
feature_matrix = combined_data.drop(columns=label_columns) if label_columns else combined_data

kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(feature_matrix)
combined_data['Cluster'] = kmeans.labels_
combined_data['Cluster']

In [None]:
# Attach each sample's organ name as an 'Organ' column. organ_labels was built by
# iterating organ_data in the same order that pd.concat stacked its frames, so
# the labels line up with the rows positionally.
combined_data['Organ'] = organ_labels
# Show the frame, now carrying both the 'Cluster' and 'Organ' columns.
combined_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56192,56193,56194,56195,56196,56197,56198,56199,Cluster,Organ
GTEX-1117F-0626-SM-5N9CS,0.0000,11.070,0.0,0.0676,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,1237.0,817.3,0.0000,8799.0,0.0000,0.0000,4,artery_coronary
GTEX-1122O-0426-SM-5H12G,0.0311,2.469,0.0,0.0000,0.0000,0.0640,0.0572,0.0000,0.0000,0.0000,...,0.0000,0.0000,6198.0,11830.0,59.9800,15090.0,1.6290,2.3710,4,artery_coronary
GTEX-117YX-1726-SM-5GZZS,0.0000,1.926,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0602,0.0238,...,0.0000,0.0000,6379.0,12580.0,44.2300,13700.0,0.0000,2.4260,4,artery_coronary
GTEX-11DXX-0826-SM-5GZZP,0.0000,3.866,0.0,0.0529,0.0000,0.0000,0.0488,0.0000,0.0000,0.0397,...,0.7769,0.0000,5009.0,8159.0,30.5600,14400.0,1.3890,8.0890,4,artery_coronary
GTEX-11DXY-1126-SM-5987W,0.0552,4.237,0.0,0.0000,0.0391,0.0000,0.0507,0.0000,0.0000,0.0000,...,0.8082,1.3430,2083.0,1604.0,3.4550,11330.0,0.0000,0.0000,4,artery_coronary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZYFD-2226-SM-5E43P,0.0443,5.167,0.0,0.0000,0.0314,0.0000,0.0000,0.1253,0.0186,0.0554,...,1.2990,0.5399,2497.0,2519.0,3.8890,19690.0,0.5808,0.0000,0,colon_sigmoid
GTEX-ZYFG-1826-SM-5GZWX,0.0000,3.710,0.0,0.0000,0.0000,0.1479,0.0881,0.1353,0.0403,0.0239,...,0.7018,0.5832,6815.0,7968.0,23.4000,28270.0,0.0000,1.8270,0,colon_sigmoid
GTEX-ZYT6-2826-SM-5GICX,0.0000,6.504,0.0,0.0669,0.0000,0.0000,0.0000,0.1263,0.0000,0.0000,...,0.0000,0.0000,2099.0,1545.0,0.8401,18620.0,0.0000,0.8525,0,colon_sigmoid
GTEX-ZYY3-2226-SM-5E45A,0.0328,7.734,0.0,0.0655,0.0466,0.0675,0.2415,0.0000,0.0276,0.0656,...,1.9230,192.6000,2830.0,1981.0,4.1120,24910.0,0.8597,0.8345,0,colon_sigmoid


In [None]:
# scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and 1.7 removes the old
# name; use whichever keyword the installed version accepts so the cell runs on
# both older and newer releases.
iter_kwarg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
tsne = TSNE(n_components=2, perplexity=30, random_state=42, **{iter_kwarg: 300})

# Embed the expression columns only; the label columns are not features.
reduced_data = tsne.fit_transform(combined_data.drop(columns=['Cluster', 'Organ']))
reduced_data



array([[11.618501  , -2.5020509 ],
       [ 8.81861   , -6.0637884 ],
       [ 5.640093  , -7.5444503 ],
       ...,
       [ 1.4606508 ,  0.10442891],
       [ 1.6902777 , -0.04555428],
       [-2.4823334 , -1.0615352 ]], dtype=float32)

In [None]:
# Put the 2-D embedding next to the cluster and organ label of each sample.
# Labels are copied positionally (as arrays), so the new frame keeps a plain
# 0..n-1 index instead of the sample IDs.
reduced_df = pd.DataFrame(reduced_data, columns=['Dim1', 'Dim2']).assign(
    Cluster=combined_data['Cluster'].values,
    Organ=combined_data['Organ'].values,
)
reduced_df

Unnamed: 0,Dim1,Dim2,Cluster,Organ
0,11.618501,-2.502051,4,artery_coronary
1,8.818610,-6.063788,4,artery_coronary
2,5.640093,-7.544450,4,artery_coronary
3,8.730314,-6.038520,4,artery_coronary
4,11.017185,-2.657732,4,artery_coronary
...,...,...,...,...
5560,0.864246,0.617755,0,colon_sigmoid
5561,-3.829877,0.127707,0,colon_sigmoid
5562,1.460651,0.104429,0,colon_sigmoid
5563,1.690278,-0.045554,0,colon_sigmoid


In [None]:
# Two side-by-side views of the same t-SNE embedding: coloured by K-means
# cluster (left) and by organ (right). The figure is written to disk and then
# closed, so nothing is rendered inline.
fig, (ax_cluster, ax_organ) = plt.subplots(1, 2, figsize=(14, 10))

# First plot by Cluster
sns.scatterplot(x='Dim1', y='Dim2', hue='Cluster', palette='tab20', data=reduced_df,
                s=100, alpha=0.7, ax=ax_cluster)
ax_cluster.set_title('t-SNE of Gene Expression Data with K-means Clustering')
ax_cluster.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Second plot by Organ
sns.scatterplot(x='Dim1', y='Dim2', hue='Organ', palette='tab20', data=reduced_df,
                s=100, alpha=0.7, ax=ax_organ)
ax_organ.set_title('t-SNE of Gene Expression Data by Organ')
ax_organ.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()

# savefig does not create missing directories; make sure the target exists.
output_dir = './OrganClusteringImages'
os.makedirs(output_dir, exist_ok=True)
fig.savefig(f'{output_dir}/k{n_clusters}_2.png')
plt.close(fig)

In [None]:
# for n_clusters in range(11, 22):
#     kmean_data = combined_data
#     kmeans = KMeans(n_clusters=n_clusters, random_state=42)
#     kmeans.fit(kmean_data)
#     kmean_data['Cluster'] = kmeans.labels_
#     kmean_data['Organ'] = organ_labels
#     tsne = TSNE(n_components=2, perplexity=30, n_iter=300, random_state=42)
#     reduced_data = tsne.fit_transform(kmean_data.drop(columns=['Cluster', 'Organ']))
#     reduced_df = pd.DataFrame(data=reduced_data, columns=['Dim1', 'Dim2'])
#     reduced_df['Cluster'] = kmean_data['Cluster'].values
#     reduced_df['Organ'] = kmean_data['Organ'].values
#     plt.figure(figsize=(14, 10))

#     # First plot by Cluster
#     plt.subplot(1, 2, 1)
#     sns.scatterplot(x='Dim1', y='Dim2', hue='Cluster', palette='tab20', data=reduced_df, s=100, alpha=0.7)
#     plt.title('t-SNE of Gene Expression Data with K-means Clustering')

#     # Second plot by Organ
#     plt.subplot(1, 2, 2)
#     sns.scatterplot(x='Dim1', y='Dim2', hue='Organ', palette='tab20', data=reduced_df, s=100, alpha=0.7)
#     plt.title('t-SNE of Gene Expression Data by Organ')
#     plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

#     plt.tight_layout()
#     plt.savefig(f'./Images/k{n_clusters}.png')
#     plt.close()