In [1]:
import os
import pandas as pd
import numpy as np
import glob
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Directory that holds the per-organ expression CSV files.
data_folder = '../dataCSV/'

# An alternative organ list, left commented out:
# organs = ["colon_transverse","colon_sigmoid","liver","artery_aorta","lung","skin_sun_exposed_lower_leg","artery_coronary","brain_cortex","brain_cerebellum","adrenal_gland","stomach","heart_atrial_appendage","heart_left_ventricle"]

# Organs loaded by this notebook; later cells pick one of them by position.
organs = [
    "pituitary",
    "kidney_cortex",
    "pancreas",
    "thyroid",
    "adipose_subcutaneous",
    "nerve_tibial",
    "brain_caudate_basal_ganglia",
]

In [3]:
# Function to load and prepare data for clustering
def load_data_for_organ(organ_name, data_dir=None):
    """Load one organ's expression CSV and return it transposed.

    Parameters
    ----------
    organ_name : str
        Organ identifier embedded in the file name
        ``gene_tpm_2017-06-05_v8_<organ_name>.csv``.
    data_dir : str or path-like, optional
        Directory to search. When omitted, the notebook-level
        ``data_folder`` is used.

    Returns
    -------
    pandas.DataFrame
        The CSV's columns from position 3 onward, transposed so that each
        of those columns becomes one row.

    Raises
    ------
    FileNotFoundError
        If no file with the expected name exists in the directory.
    """
    search_dir = data_folder if data_dir is None else data_dir
    pattern = os.path.join(search_dir, f'gene_tpm_2017-06-05_v8_{organ_name}.csv')
    matches = glob.glob(pattern)
    if not matches:
        # Fail with the offending path instead of an opaque IndexError.
        raise FileNotFoundError(f'No expression CSV found for organ {organ_name!r}: {pattern}')
    df = pd.read_csv(matches[0])
    # NOTE(review): presumably the first three columns are gene metadata
    # rather than sample values — confirm against the CSV layout.
    expression_data = df.iloc[:, 3:].transpose()
    return expression_data

In [4]:
# Map every configured organ name to its loaded expression frame.
organ_data = {name: load_data_for_organ(name) for name in organs}

In [5]:
# Derive a new frame instead of filling in place: `fillna` without
# `inplace` returns a separate object, so the frame cached in `organ_data`
# stays untouched and re-running this cell always starts from the same
# loaded data.
raw_expression = organ_data[organs[6]]
# Missing values are replaced by the mean of their column.
dataframe = raw_expression.fillna(raw_expression.mean())
dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56190,56191,56192,56193,56194,56195,56196,56197,56198,56199
GTEX-1192X-0011-R5a-SM-DNZZA,0.0,2.137,0.0,0.0575,0.0,0.0889,0.053,0.1085,0.0484,0.0,...,43960.0,64.58,0.4219,3.155,7684.0,6763.0,112.9,32200.0,3.394,3.295
GTEX-11DXW-0011-R5b-SM-DO11D,0.0,0.8156,0.0,0.0431,0.0,0.0667,0.1788,0.2034,0.0091,0.0,...,69510.0,4.601,6.33,4.734,16000.0,15950.0,270.4,50100.0,4.81,5.493
GTEX-11DXY-0011-R5b-SM-DNZZL,0.0,2.67,0.0,0.0,0.0,0.063,0.0,0.0576,0.0129,0.0153,...,41490.0,5.752,2.242,4.844,8638.0,7295.0,19.17,34300.0,3.207,1.167
GTEX-11DYG-0011-R5a-SM-DO928,0.0,1.239,0.0,0.0,0.0,0.0,0.0243,0.0249,0.0222,0.0,...,46490.0,2.318,1.549,9.978,9778.0,7789.0,16.56,40330.0,1.385,2.689
GTEX-11DZ1-0011-R5a-SM-DO11J,0.0,1.363,0.0,0.0275,0.0,0.0284,0.076,0.1297,0.0116,0.0275,...,46080.0,4.487,0.8072,5.702,10670.0,10840.0,41.42,33960.0,4.33,20.66


In [6]:
# Tab-separated subject phenotype table, read from the working directory.
phenotype_path = 'GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt'
phenotype = pd.read_csv(phenotype_path, sep='\t')
phenotype.head(5)

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


In [7]:
# Number of distinct values in each categorical phenotype column.
for column in ('AGE', 'DTHHRDY'):
    print(phenotype[column].nunique())

6
5


In [8]:
n_clusters = 6

# Fit only on the expression features. Non-feature columns such as
# 'Cluster' (written below) or 'SUBJID' may already exist on `dataframe`
# when cells are re-run; they must not leak into the clustering input.
features = dataframe.drop(columns=['Cluster', 'SUBJID'], errors='ignore')

# NOTE(review): n_init is left at the library default, which differs
# between scikit-learn releases — consider pinning it for reproducibility.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(features)

# Store the cluster label of every sample next to its expression values.
dataframe['Cluster'] = kmeans.labels_
dataframe['Cluster']

GTEX-1192X-0011-R5a-SM-DNZZA    5
GTEX-11DXW-0011-R5b-SM-DO11D    1
GTEX-11DXY-0011-R5b-SM-DNZZL    5
GTEX-11DYG-0011-R5a-SM-DO928    2
GTEX-11DZ1-0011-R5a-SM-DO11J    3
                               ..
GTEX-ZUA1-0011-R5b-SM-51MTG     2
GTEX-ZV68-0011-R5a-SM-4YCDW     1
GTEX-ZVT3-0011-R5a-SM-51MSI     1
GTEX-ZVZQ-0011-R5b-SM-57WDC     4
GTEX-ZXG5-0011-R5b-SM-57WBN     4
Name: Cluster, Length: 246, dtype: int32

In [9]:
# The subject ID is the first two dash-separated fields of the sample ID.
id_parts = dataframe.index.str.split('-')
subject_ids = id_parts.str[0] + '-' + id_parts.str[1]

# Attach the key on a temporary copy so `dataframe` itself does not gain a
# string column. `validate` makes the merge fail loudly if a subject occurs
# more than once in `phenotype`; that would duplicate sample rows and break
# the index assignment below.
merged_df = dataframe.assign(SUBJID=subject_ids).merge(
    phenotype, how='left', on='SUBJID', validate='many_to_one'
)

# A left merge keeps the left row order, so the sample IDs can be restored
# as the index (the merge itself resets it to a RangeIndex).
merged_df.index = dataframe.index
merged_df = merged_df.drop(columns=['SUBJID'])
merged_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56194,56195,56196,56197,56198,56199,Cluster,SEX,AGE,DTHHRDY
GTEX-1192X-0011-R5a-SM-DNZZA,0.0000,2.1370,0.0,0.0575,0.0000,0.0889,0.0530,0.1085,0.0484,0.0000,...,7684.0,6763.0,112.90,32200.0,3.394,3.295,5,1,50-59,4.0
GTEX-11DXW-0011-R5b-SM-DO11D,0.0000,0.8156,0.0,0.0431,0.0000,0.0667,0.1788,0.2034,0.0091,0.0000,...,16000.0,15950.0,270.40,50100.0,4.810,5.493,1,1,40-49,2.0
GTEX-11DXY-0011-R5b-SM-DNZZL,0.0000,2.6700,0.0,0.0000,0.0000,0.0630,0.0000,0.0576,0.0129,0.0153,...,8638.0,7295.0,19.17,34300.0,3.207,1.167,5,1,60-69,2.0
GTEX-11DYG-0011-R5a-SM-DO928,0.0000,1.2390,0.0,0.0000,0.0000,0.0000,0.0243,0.0249,0.0222,0.0000,...,9778.0,7789.0,16.56,40330.0,1.385,2.689,2,1,60-69,2.0
GTEX-11DZ1-0011-R5a-SM-DO11J,0.0000,1.3630,0.0,0.0275,0.0000,0.0284,0.0760,0.1297,0.0116,0.0275,...,10670.0,10840.0,41.42,33960.0,4.330,20.660,3,1,50-59,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZUA1-0011-R5b-SM-51MTG,0.0000,1.6070,0.0,0.1174,0.0000,0.0000,0.0721,0.0738,0.0000,0.0196,...,8397.0,9740.0,24.06,36010.0,3.080,33.380,2,1,40-49,2.0
GTEX-ZV68-0011-R5a-SM-4YCDW,0.0000,1.3030,0.0,0.0333,0.0473,0.1373,0.0920,0.0628,0.0280,0.0333,...,14870.0,17050.0,66.02,45940.0,7.427,12.720,1,2,50-59,2.0
GTEX-ZVT3-0011-R5a-SM-51MSI,0.0000,1.0030,0.0,0.0000,0.0000,0.0528,0.0708,0.0242,0.0000,0.0000,...,14830.0,15320.0,617.80,50480.0,3.362,17.620,1,2,60-69,2.0
GTEX-ZVZQ-0011-R5b-SM-57WDC,0.0326,1.8640,0.0,0.0325,0.0231,0.0335,0.1197,0.0613,0.0274,0.0163,...,15350.0,17190.0,115.00,49720.0,389.200,13.650,4,2,60-69,1.0


In [10]:
# Newer scikit-learn releases renamed TSNE's `n_iter` argument to
# `max_iter` and later dropped the old name. Pick whichever keyword the
# installed version exposes so the cell runs on both.
iter_kwarg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
tsne = TSNE(n_components=2, perplexity=30, random_state=42, **{iter_kwarg: 300})

# Embed the expression columns only; labels and phenotype fields are
# removed from the input and re-attached to the 2-D result afterwards.
expression_only = merged_df.drop(columns=['Cluster', 'SEX', 'AGE', 'DTHHRDY'])
reduced_data = tsne.fit_transform(expression_only)

# Keep the sample IDs as the index so embedded points stay traceable.
reduced_df = pd.DataFrame(data=reduced_data, columns=['Dim1', 'Dim2'], index=merged_df.index)
reduced_df['Cluster'] = merged_df['Cluster'].values
reduced_df['AGE'] = merged_df['AGE'].values
reduced_df['DTHHRDY'] = merged_df['DTHHRDY'].values



In [11]:
organ_name = organs[6]
output_dir = './Images'
# savefig does not create missing directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

# One panel per colouring variable: (hue column, panel title).
panels = [
    ('Cluster', 't-SNE of Gene Expression Data with K-means Clustering'),
    ('AGE', 't-SNE of Gene Expression Data by AGE'),
    ('DTHHRDY', 't-SNE of Gene Expression Data by DTHHRDY'),
]

fig, axes = plt.subplots(3, 1, figsize=(14, 10))
for ax, (hue_column, title) in zip(axes, panels):
    sns.scatterplot(x='Dim1', y='Dim2', hue=hue_column, palette='tab10',
                    data=reduced_df, s=100, alpha=0.7, ax=ax)
    ax.set_title(title)
    # Place the legend outside the axes, to the right of the panel.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

fig.suptitle(f'{organ_name}', fontsize=20, fontweight='bold')
# Reserve the top 5% of the figure for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.95])
fig.savefig(f'{output_dir}/{organ_name}.png')
# The figure is written to disk only; close it to free its memory.
plt.close(fig)