# Flair Transcript Quantification Results Analysis Part 2

Continuing from Part 1, this notebook performs further data cleaning and manipulation to prepare the final dataframes used for the visualizations in Part 3.

## Part 1: Import Data and Configure Python Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec
%matplotlib inline
import seaborn as sns
import re
from IPython.display import display
from matplotlib.pyplot import gcf
from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler
from PIL import ImageColor
from matplotlib.patches import Patch #for custom legend making
import scipy.spatial as sp, scipy.cluster.hierarchy as hc #for faster computing of hierarchial clusters

In [2]:
#pd.options.display.max_columns = None #display all columns in dataframe

In [3]:
#pd.options.display.max_colwidth = 100 #show the full content of long strings

In [4]:
import warnings
# NOTE: with no category or module filter, this silences *every* warning for the rest
# of the session, including any pandas warnings raised by later cells.
warnings.filterwarnings('ignore')

### Import and Clean Data

In [5]:
os.getcwd()

'C:\\Users\\15082\\OneDrive\\Desktop\\thesis_research\\gtex_v9_data_analysis\\FLAIR'

In [6]:
# Folder (relative to the working directory) that holds the input tables.
# Built with os.path.join so the separator matches the host OS; on Windows this
# yields the same backslash-separated string as before.
data_dir = os.path.join('gtex_v9_data', 'data_for_analysis', 'gtex_database_data')

In [7]:
# Paths of the three input tables, all located inside data_dir
sample_info_path = os.path.join(data_dir, 'sample_info_complete.csv')
novel_transcript_quant_transposed_path = os.path.join(
    data_dir, 'novel_transcript_quant_nofilter_transposed.csv')
annotated_transcript_quant_transposed_path = os.path.join(
    data_dir, 'annotated_transcript_quant_nofilter_transposed.csv')

In [8]:
# change working directory: the relative data paths built above are resolved against
# this folder when the CSV files are read below
# NOTE: absolute, machine-specific path -- it must be edited to run on another machine
os.chdir('C:\\Users\\15082\\OneDrive\\Desktop\\thesis_research')

In [9]:
# Load the sample metadata and the two transposed quantification tables
(sample_info,
 novel_transcript_quant_transposed,
 annotated_transcript_quant_transposed) = map(pd.read_csv, (
    sample_info_path,
    novel_transcript_quant_transposed_path,
    annotated_transcript_quant_transposed_path,
))

#### Dataframe of Novel Transcripts

In [10]:
novel_transcript_quant_transposed.head(3)

Unnamed: 0.1,Unnamed: 0,sample_id,000187c4-a488-40f0-a69c-0a89582f3241_ENSG00000173848.18,00026598-3078-4e2f-8ac9-dd8f523396b9_ENSG00000102893.15_ENSG00000102893.15,0002a5e2-f01a-4690-a7db-7af726712a5e_ENSG00000071575.11_ENSG00000071575.11,000339f1-1769-4608-b369-59aa222cd7b7_ENSG00000166012.16_ENSG00000166012.16,0003706a-94a7-4419-a61d-6310d7a9c10c_ENSG00000101247.17_ENSG00000101247.17,0004a186-852c-448e-a74c-afdad4fdf319_ENSG00000204356.13_ENSG00000204356.13,0004af18-8dff-40d9-814d-32a45ded762e_ENSG00000007237.18_ENSG00000007237.18,0004bfb5-133e-4202-9562-4744a79252db_ENSG00000066697.14_ENSG00000066697.14,...,fff834dd-554d-46c7-b30a-b92e4be6dd61_ENSG00000131069.19,fff8c888-5eac-4f53-af8e-30a9aa66be58_ENSG00000140750.16,fff8d294-c55c-4453-9aa7-7d8db1e114e2_ENSG00000188917.14_ENSG00000188917.14,fff941ee-d99e-4f68-94b0-4b5ab30a789c_ENSG00000232973.11_ENSG00000232973.11,fffa0051-00fd-4289-86c4-114fa1fd36a1_ENSG00000197444.9_ENSG00000197444.9,fffa90eb-bfc8-49d3-9936-cd1d4ea47708_ENSG00000141030.12_ENSG00000141030.12,fffabc59-08f8-4f01-9835-099e5dfe5bd6_ENSG00000113407.13_ENSG00000113407.13,fffc8c18-b616-42ce-8c53-71b184c0cab8_ENSG00000156261.12_ENSG00000156261.12,fffd4120-f790-4c5d-b903-52dc4a97c8e7_ENSG00000137171.14_ENSG00000137171.14,fffe94ff-bd9f-4c45-b693-96945bf0f3dc_ENSG00000106049.8_ENSG00000106049.8
0,1,GTEX-1192X-0011-R10a-SM-4RXXZ,0.0,0.0,0.380702,0.0,14.847375,0.761404,0.0,0.0,...,0.0,0.0,0.0,8.756144,0.0,0.761404,0.0,0.0,0.0,0.0
1,2,GTEX-11H98-0011-R11b-SM-4SFLZ,0.0,0.0,5.359084,0.297727,18.459067,0.297727,0.0,0.595454,...,0.0,0.0,0.0,0.0,0.595454,2.679542,0.595454,0.297727,0.893181,0.297727
2,3,GTEX-11TTK-0011-R7b-SM-4TVFS,0.0,0.0,1.961873,0.0,17.656857,0.0,0.0,0.0,...,0.0,0.0,0.0,3.923746,0.0,0.0,0.0,0.0,0.980936,0.0


In [11]:
# Drop the first column (an unnamed numeric counter, see the preview above).
# The drop is guarded and the result is re-assigned instead of using inplace=True,
# so re-running this cell can never remove 'sample_id' by accident.
if novel_transcript_quant_transposed.columns[0] != 'sample_id':
    novel_transcript_quant_transposed = novel_transcript_quant_transposed.drop(
        columns=novel_transcript_quant_transposed.columns[0])

In [12]:
novel_transcript_quant_transposed.head(3)

Unnamed: 0,sample_id,000187c4-a488-40f0-a69c-0a89582f3241_ENSG00000173848.18,00026598-3078-4e2f-8ac9-dd8f523396b9_ENSG00000102893.15_ENSG00000102893.15,0002a5e2-f01a-4690-a7db-7af726712a5e_ENSG00000071575.11_ENSG00000071575.11,000339f1-1769-4608-b369-59aa222cd7b7_ENSG00000166012.16_ENSG00000166012.16,0003706a-94a7-4419-a61d-6310d7a9c10c_ENSG00000101247.17_ENSG00000101247.17,0004a186-852c-448e-a74c-afdad4fdf319_ENSG00000204356.13_ENSG00000204356.13,0004af18-8dff-40d9-814d-32a45ded762e_ENSG00000007237.18_ENSG00000007237.18,0004bfb5-133e-4202-9562-4744a79252db_ENSG00000066697.14_ENSG00000066697.14,0005f2e9-ea77-45a6-9ae4-f8c2990b165e_ENSG00000100296.13_ENSG00000100296.13,...,fff834dd-554d-46c7-b30a-b92e4be6dd61_ENSG00000131069.19,fff8c888-5eac-4f53-af8e-30a9aa66be58_ENSG00000140750.16,fff8d294-c55c-4453-9aa7-7d8db1e114e2_ENSG00000188917.14_ENSG00000188917.14,fff941ee-d99e-4f68-94b0-4b5ab30a789c_ENSG00000232973.11_ENSG00000232973.11,fffa0051-00fd-4289-86c4-114fa1fd36a1_ENSG00000197444.9_ENSG00000197444.9,fffa90eb-bfc8-49d3-9936-cd1d4ea47708_ENSG00000141030.12_ENSG00000141030.12,fffabc59-08f8-4f01-9835-099e5dfe5bd6_ENSG00000113407.13_ENSG00000113407.13,fffc8c18-b616-42ce-8c53-71b184c0cab8_ENSG00000156261.12_ENSG00000156261.12,fffd4120-f790-4c5d-b903-52dc4a97c8e7_ENSG00000137171.14_ENSG00000137171.14,fffe94ff-bd9f-4c45-b693-96945bf0f3dc_ENSG00000106049.8_ENSG00000106049.8
0,GTEX-1192X-0011-R10a-SM-4RXXZ,0.0,0.0,0.380702,0.0,14.847375,0.761404,0.0,0.0,0.761404,...,0.0,0.0,0.0,8.756144,0.0,0.761404,0.0,0.0,0.0,0.0
1,GTEX-11H98-0011-R11b-SM-4SFLZ,0.0,0.0,5.359084,0.297727,18.459067,0.297727,0.0,0.595454,0.297727,...,0.0,0.0,0.0,0.0,0.595454,2.679542,0.595454,0.297727,0.893181,0.297727
2,GTEX-11TTK-0011-R7b-SM-4TVFS,0.0,0.0,1.961873,0.0,17.656857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.923746,0.0,0.0,0.0,0.0,0.980936,0.0


#### Dataframe of Annotated Transcripts

In [13]:
annotated_transcript_quant_transposed.head(3)

Unnamed: 0.1,Unnamed: 0,sample_id,ENST00000000233.9_ENSG00000004059.10,ENST00000000412.7_ENSG00000003056.7,ENST00000001008.5_ENSG00000004478.7,ENST00000001146.6_ENSG00000003137.8,ENST00000002125.8_ENSG00000003509.15,ENST00000002165.10_ENSG00000001036.13,ENST00000002501.10_ENSG00000003249.13,ENST00000002596.5_ENSG00000002587.9,...,ENST00000640621.1_ENSG00000262633.2,ENST00000640621.1-1_ENSG00000262633.2,ENST00000640674.1_ENSG00000278175.3,ENST00000640752.1_ENSG00000138796.16,ENST00000640769.1_ENSG00000176225.13,ENST00000640799.1_ENSG00000143612.19,ENST00000640815.1_ENSG00000164199.16,ENST00000640876.1_ENSG00000197563.10,ENST00000640893.1_ENSG00000087258.14,ENST00000640967.1_ENSG00000082212.12
0,1,GTEX-1192X-0011-R10a-SM-4RXXZ,373.468579,20.557903,22.080711,0.0,3.045615,3.045615,5.710529,1.142106,...,2.664913,0.0,3.045615,0.380702,0.0,0.380702,1.522808,0.0,7.233336,0.380702
1,2,GTEX-11H98-0011-R11b-SM-4SFLZ,369.181337,16.374979,24.115878,2.084088,0.595454,2.679542,1.786361,25.902239,...,1.786361,0.0,1.786361,0.297727,0.893181,0.893181,0.595454,0.0,7.145445,0.297727
2,3,GTEX-11TTK-0011-R7b-SM-4TVFS,256.024421,9.809365,7.847492,0.0,0.980936,6.866555,0.980936,0.980936,...,2.942809,0.0,2.942809,0.0,0.980936,0.0,0.980936,0.0,4.904682,0.0


In [14]:
# Drop the first column (an unnamed numeric counter, see the preview above).
# The drop is guarded and the result is re-assigned instead of using inplace=True,
# so re-running this cell can never remove 'sample_id' by accident.
if annotated_transcript_quant_transposed.columns[0] != 'sample_id':
    annotated_transcript_quant_transposed = annotated_transcript_quant_transposed.drop(
        columns=annotated_transcript_quant_transposed.columns[0])

In [15]:
annotated_transcript_quant_transposed.head(3)

Unnamed: 0,sample_id,ENST00000000233.9_ENSG00000004059.10,ENST00000000412.7_ENSG00000003056.7,ENST00000001008.5_ENSG00000004478.7,ENST00000001146.6_ENSG00000003137.8,ENST00000002125.8_ENSG00000003509.15,ENST00000002165.10_ENSG00000001036.13,ENST00000002501.10_ENSG00000003249.13,ENST00000002596.5_ENSG00000002587.9,ENST00000003100.12_ENSG00000001630.15,...,ENST00000640621.1_ENSG00000262633.2,ENST00000640621.1-1_ENSG00000262633.2,ENST00000640674.1_ENSG00000278175.3,ENST00000640752.1_ENSG00000138796.16,ENST00000640769.1_ENSG00000176225.13,ENST00000640799.1_ENSG00000143612.19,ENST00000640815.1_ENSG00000164199.16,ENST00000640876.1_ENSG00000197563.10,ENST00000640893.1_ENSG00000087258.14,ENST00000640967.1_ENSG00000082212.12
0,GTEX-1192X-0011-R10a-SM-4RXXZ,373.468579,20.557903,22.080711,0.0,3.045615,3.045615,5.710529,1.142106,9.517548,...,2.664913,0.0,3.045615,0.380702,0.0,0.380702,1.522808,0.0,7.233336,0.380702
1,GTEX-11H98-0011-R11b-SM-4SFLZ,369.181337,16.374979,24.115878,2.084088,0.595454,2.679542,1.786361,25.902239,9.229533,...,1.786361,0.0,1.786361,0.297727,0.893181,0.893181,0.595454,0.0,7.145445,0.297727
2,GTEX-11TTK-0011-R7b-SM-4TVFS,256.024421,9.809365,7.847492,0.0,0.980936,6.866555,0.980936,0.980936,1.961873,...,2.942809,0.0,2.942809,0.0,0.980936,0.0,0.980936,0.0,4.904682,0.0


#### Dataframe of Sample IDs and Sample Tissue Types

Select only the sample ID and tissue type columns from the sample info data table.

In [16]:
# Lookup table: one tissue label per sample_id
sample_id_tissue = sample_info.loc[:, ['sample_id', 'tissue']]

In [17]:
sample_id_tissue.head(5)

Unnamed: 0,sample_id,tissue
0,LV1681,Heart - Left Ventricle
1,LV1702,Heart - Left Ventricle
2,LV1708,Heart - Left Ventricle
3,LV1723,Heart - Left Ventricle
4,GTEX-1192X-0011-R10a-SM-4RXXZ,Brain - Frontal Cortex (BA9)


## Part 2: Merge Dataframes and Further Cleaning

### Section 1: Merge novel transcript quant data table with tissue type information.

In [18]:
# Left join keeps every quantified sample and attaches its tissue label
novel_transcript_express_tissue = pd.merge(
    novel_transcript_quant_transposed, sample_id_tissue,
    how='left', on='sample_id')

In [19]:
novel_transcript_express_tissue.head(5)

Unnamed: 0,sample_id,000187c4-a488-40f0-a69c-0a89582f3241_ENSG00000173848.18,00026598-3078-4e2f-8ac9-dd8f523396b9_ENSG00000102893.15_ENSG00000102893.15,0002a5e2-f01a-4690-a7db-7af726712a5e_ENSG00000071575.11_ENSG00000071575.11,000339f1-1769-4608-b369-59aa222cd7b7_ENSG00000166012.16_ENSG00000166012.16,0003706a-94a7-4419-a61d-6310d7a9c10c_ENSG00000101247.17_ENSG00000101247.17,0004a186-852c-448e-a74c-afdad4fdf319_ENSG00000204356.13_ENSG00000204356.13,0004af18-8dff-40d9-814d-32a45ded762e_ENSG00000007237.18_ENSG00000007237.18,0004bfb5-133e-4202-9562-4744a79252db_ENSG00000066697.14_ENSG00000066697.14,0005f2e9-ea77-45a6-9ae4-f8c2990b165e_ENSG00000100296.13_ENSG00000100296.13,...,fff8c888-5eac-4f53-af8e-30a9aa66be58_ENSG00000140750.16,fff8d294-c55c-4453-9aa7-7d8db1e114e2_ENSG00000188917.14_ENSG00000188917.14,fff941ee-d99e-4f68-94b0-4b5ab30a789c_ENSG00000232973.11_ENSG00000232973.11,fffa0051-00fd-4289-86c4-114fa1fd36a1_ENSG00000197444.9_ENSG00000197444.9,fffa90eb-bfc8-49d3-9936-cd1d4ea47708_ENSG00000141030.12_ENSG00000141030.12,fffabc59-08f8-4f01-9835-099e5dfe5bd6_ENSG00000113407.13_ENSG00000113407.13,fffc8c18-b616-42ce-8c53-71b184c0cab8_ENSG00000156261.12_ENSG00000156261.12,fffd4120-f790-4c5d-b903-52dc4a97c8e7_ENSG00000137171.14_ENSG00000137171.14,fffe94ff-bd9f-4c45-b693-96945bf0f3dc_ENSG00000106049.8_ENSG00000106049.8,tissue
0,GTEX-1192X-0011-R10a-SM-4RXXZ,0.0,0.0,0.380702,0.0,14.847375,0.761404,0.0,0.0,0.761404,...,0.0,0.0,8.756144,0.0,0.761404,0.0,0.0,0.0,0.0,Brain - Frontal Cortex (BA9)
1,GTEX-11H98-0011-R11b-SM-4SFLZ,0.0,0.0,5.359084,0.297727,18.459067,0.297727,0.0,0.595454,0.297727,...,0.0,0.0,0.0,0.595454,2.679542,0.595454,0.297727,0.893181,0.297727,Brain - Cerebellar Hemisphere
2,GTEX-11TTK-0011-R7b-SM-4TVFS,0.0,0.0,1.961873,0.0,17.656857,0.0,0.0,0.0,0.0,...,0.0,0.0,3.923746,0.0,0.0,0.0,0.0,0.980936,0.0,Brain - Putamen (basal ganglia)
3,GTEX-1211K-0826-SM-7LDFQ,0.0,0.0,4.590649,1.020144,4.080577,0.0,0.0,0.0,0.0,...,0.510072,0.0,0.510072,0.0,0.0,0.0,0.510072,0.0,0.510072,Lung
4,GTEX-1313W-0011-R7b-SM-4ZL3U,0.0,0.0,1.793796,0.0,10.762776,0.0,0.0,1.793796,0.0,...,0.0,0.0,3.587592,0.0,0.0,0.0,0.0,0.0,0.0,Brain - Putamen (basal ganglia)


In [20]:
novel_transcript_express_tissue.shape

(92, 72275)

In [21]:
# shift column 'tissue' to second position
# (the merge appended it as the last column; pop() removes it from the frame in place and
# insert() puts it back at index 1, directly after 'sample_id')
novel_tissue_column = novel_transcript_express_tissue.pop('tissue')
novel_transcript_express_tissue.insert(1, 'tissue', novel_tissue_column)

In [22]:
novel_transcript_express_tissue.head(5)

Unnamed: 0,sample_id,tissue,000187c4-a488-40f0-a69c-0a89582f3241_ENSG00000173848.18,00026598-3078-4e2f-8ac9-dd8f523396b9_ENSG00000102893.15_ENSG00000102893.15,0002a5e2-f01a-4690-a7db-7af726712a5e_ENSG00000071575.11_ENSG00000071575.11,000339f1-1769-4608-b369-59aa222cd7b7_ENSG00000166012.16_ENSG00000166012.16,0003706a-94a7-4419-a61d-6310d7a9c10c_ENSG00000101247.17_ENSG00000101247.17,0004a186-852c-448e-a74c-afdad4fdf319_ENSG00000204356.13_ENSG00000204356.13,0004af18-8dff-40d9-814d-32a45ded762e_ENSG00000007237.18_ENSG00000007237.18,0004bfb5-133e-4202-9562-4744a79252db_ENSG00000066697.14_ENSG00000066697.14,...,fff834dd-554d-46c7-b30a-b92e4be6dd61_ENSG00000131069.19,fff8c888-5eac-4f53-af8e-30a9aa66be58_ENSG00000140750.16,fff8d294-c55c-4453-9aa7-7d8db1e114e2_ENSG00000188917.14_ENSG00000188917.14,fff941ee-d99e-4f68-94b0-4b5ab30a789c_ENSG00000232973.11_ENSG00000232973.11,fffa0051-00fd-4289-86c4-114fa1fd36a1_ENSG00000197444.9_ENSG00000197444.9,fffa90eb-bfc8-49d3-9936-cd1d4ea47708_ENSG00000141030.12_ENSG00000141030.12,fffabc59-08f8-4f01-9835-099e5dfe5bd6_ENSG00000113407.13_ENSG00000113407.13,fffc8c18-b616-42ce-8c53-71b184c0cab8_ENSG00000156261.12_ENSG00000156261.12,fffd4120-f790-4c5d-b903-52dc4a97c8e7_ENSG00000137171.14_ENSG00000137171.14,fffe94ff-bd9f-4c45-b693-96945bf0f3dc_ENSG00000106049.8_ENSG00000106049.8
0,GTEX-1192X-0011-R10a-SM-4RXXZ,Brain - Frontal Cortex (BA9),0.0,0.0,0.380702,0.0,14.847375,0.761404,0.0,0.0,...,0.0,0.0,0.0,8.756144,0.0,0.761404,0.0,0.0,0.0,0.0
1,GTEX-11H98-0011-R11b-SM-4SFLZ,Brain - Cerebellar Hemisphere,0.0,0.0,5.359084,0.297727,18.459067,0.297727,0.0,0.595454,...,0.0,0.0,0.0,0.0,0.595454,2.679542,0.595454,0.297727,0.893181,0.297727
2,GTEX-11TTK-0011-R7b-SM-4TVFS,Brain - Putamen (basal ganglia),0.0,0.0,1.961873,0.0,17.656857,0.0,0.0,0.0,...,0.0,0.0,0.0,3.923746,0.0,0.0,0.0,0.0,0.980936,0.0
3,GTEX-1211K-0826-SM-7LDFQ,Lung,0.0,0.0,4.590649,1.020144,4.080577,0.0,0.0,0.0,...,0.0,0.510072,0.0,0.510072,0.0,0.0,0.0,0.510072,0.0,0.510072
4,GTEX-1313W-0011-R7b-SM-4ZL3U,Brain - Putamen (basal ganglia),0.0,0.0,1.793796,0.0,10.762776,0.0,0.0,1.793796,...,0.0,0.0,0.0,3.587592,0.0,0.0,0.0,0.0,0.0,0.0


### Section 2: Merge annotated transcript quant data table with tissue type information.

In [23]:
# Left join keeps every quantified sample and attaches its tissue label
annotated_transcript_express_tissue = pd.merge(
    annotated_transcript_quant_transposed, sample_id_tissue,
    how='left', on='sample_id')
annotated_transcript_express_tissue.head(5)

Unnamed: 0,sample_id,ENST00000000233.9_ENSG00000004059.10,ENST00000000412.7_ENSG00000003056.7,ENST00000001008.5_ENSG00000004478.7,ENST00000001146.6_ENSG00000003137.8,ENST00000002125.8_ENSG00000003509.15,ENST00000002165.10_ENSG00000001036.13,ENST00000002501.10_ENSG00000003249.13,ENST00000002596.5_ENSG00000002587.9,ENST00000003100.12_ENSG00000001630.15,...,ENST00000640621.1-1_ENSG00000262633.2,ENST00000640674.1_ENSG00000278175.3,ENST00000640752.1_ENSG00000138796.16,ENST00000640769.1_ENSG00000176225.13,ENST00000640799.1_ENSG00000143612.19,ENST00000640815.1_ENSG00000164199.16,ENST00000640876.1_ENSG00000197563.10,ENST00000640893.1_ENSG00000087258.14,ENST00000640967.1_ENSG00000082212.12,tissue
0,GTEX-1192X-0011-R10a-SM-4RXXZ,373.468579,20.557903,22.080711,0.0,3.045615,3.045615,5.710529,1.142106,9.517548,...,0.0,3.045615,0.380702,0.0,0.380702,1.522808,0.0,7.233336,0.380702,Brain - Frontal Cortex (BA9)
1,GTEX-11H98-0011-R11b-SM-4SFLZ,369.181337,16.374979,24.115878,2.084088,0.595454,2.679542,1.786361,25.902239,9.229533,...,0.0,1.786361,0.297727,0.893181,0.893181,0.595454,0.0,7.145445,0.297727,Brain - Cerebellar Hemisphere
2,GTEX-11TTK-0011-R7b-SM-4TVFS,256.024421,9.809365,7.847492,0.0,0.980936,6.866555,0.980936,0.980936,1.961873,...,0.0,2.942809,0.0,0.980936,0.0,0.980936,0.0,4.904682,0.0,Brain - Putamen (basal ganglia)
3,GTEX-1211K-0826-SM-7LDFQ,115.786376,22.443174,8.161154,0.0,0.510072,25.503607,0.510072,9.691371,19.382741,...,0.0,1.020144,0.0,1.020144,0.510072,0.510072,0.510072,0.0,0.0,Lung
4,GTEX-1313W-0011-R7b-SM-4ZL3U,263.688009,26.90694,23.319348,0.0,5.381388,5.381388,1.793796,1.793796,5.381388,...,0.0,0.0,1.793796,3.587592,1.793796,1.793796,0.0,10.762776,0.0,Brain - Putamen (basal ganglia)


In [24]:
annotated_transcript_express_tissue.shape

(92, 21359)

In [25]:
# shift column 'tissue' to second position
# (the merge appended it as the last column; pop() removes it from the frame in place and
# insert() puts it back at index 1, directly after 'sample_id')
annotated_tissue_column = annotated_transcript_express_tissue.pop('tissue')
annotated_transcript_express_tissue.insert(1, 'tissue', annotated_tissue_column)

In [26]:
annotated_transcript_express_tissue.head(5)

Unnamed: 0,sample_id,tissue,ENST00000000233.9_ENSG00000004059.10,ENST00000000412.7_ENSG00000003056.7,ENST00000001008.5_ENSG00000004478.7,ENST00000001146.6_ENSG00000003137.8,ENST00000002125.8_ENSG00000003509.15,ENST00000002165.10_ENSG00000001036.13,ENST00000002501.10_ENSG00000003249.13,ENST00000002596.5_ENSG00000002587.9,...,ENST00000640621.1_ENSG00000262633.2,ENST00000640621.1-1_ENSG00000262633.2,ENST00000640674.1_ENSG00000278175.3,ENST00000640752.1_ENSG00000138796.16,ENST00000640769.1_ENSG00000176225.13,ENST00000640799.1_ENSG00000143612.19,ENST00000640815.1_ENSG00000164199.16,ENST00000640876.1_ENSG00000197563.10,ENST00000640893.1_ENSG00000087258.14,ENST00000640967.1_ENSG00000082212.12
0,GTEX-1192X-0011-R10a-SM-4RXXZ,Brain - Frontal Cortex (BA9),373.468579,20.557903,22.080711,0.0,3.045615,3.045615,5.710529,1.142106,...,2.664913,0.0,3.045615,0.380702,0.0,0.380702,1.522808,0.0,7.233336,0.380702
1,GTEX-11H98-0011-R11b-SM-4SFLZ,Brain - Cerebellar Hemisphere,369.181337,16.374979,24.115878,2.084088,0.595454,2.679542,1.786361,25.902239,...,1.786361,0.0,1.786361,0.297727,0.893181,0.893181,0.595454,0.0,7.145445,0.297727
2,GTEX-11TTK-0011-R7b-SM-4TVFS,Brain - Putamen (basal ganglia),256.024421,9.809365,7.847492,0.0,0.980936,6.866555,0.980936,0.980936,...,2.942809,0.0,2.942809,0.0,0.980936,0.0,0.980936,0.0,4.904682,0.0
3,GTEX-1211K-0826-SM-7LDFQ,Lung,115.786376,22.443174,8.161154,0.0,0.510072,25.503607,0.510072,9.691371,...,4.080577,0.0,1.020144,0.0,1.020144,0.510072,0.510072,0.510072,0.0,0.0
4,GTEX-1313W-0011-R7b-SM-4ZL3U,Brain - Putamen (basal ganglia),263.688009,26.90694,23.319348,0.0,5.381388,5.381388,1.793796,1.793796,...,3.587592,0.0,0.0,1.793796,3.587592,1.793796,1.793796,0.0,10.762776,0.0


### Section 3: Further Data Cleaning

In [27]:
np.unique(novel_transcript_express_tissue['tissue'])

array(['Adipose - Subcutaneous',
       'Brain - Anterior cingulate cortex (BA24)',
       'Brain - Caudate (basal ganglia)', 'Brain - Cerebellar Hemisphere',
       'Brain - Frontal Cortex (BA9)', 'Brain - Putamen (basal ganglia)',
       'Breast - Mammary Tissue', 'Cells - Cultured fibroblasts',
       'Heart - Atrial Appendage', 'Heart - Left Ventricle', 'K562',
       'Liver', 'Lung', 'Muscle - Skeletal', 'Pancreas'], dtype=object)

In [28]:
len(np.unique(novel_transcript_express_tissue['tissue']))

15

We will remove K562 samples from the dataset.

In [29]:
# Keep every row whose tissue label is anything other than 'K562'
novel_transcript_express_tissue_clean = novel_transcript_express_tissue.loc[
    novel_transcript_express_tissue['tissue'].ne('K562')
]

In [30]:
novel_transcript_express_tissue_clean.shape

(88, 72275)

In [31]:
# Keep every row whose tissue label is anything other than 'K562'
annotated_transcript_express_tissue_clean = annotated_transcript_express_tissue.loc[
    annotated_transcript_express_tissue['tissue'].ne('K562')
]

In [32]:
annotated_transcript_express_tissue_clean.shape

(88, 21359)

In [33]:
novel_transcript_express_tissue_clean.head(5)

Unnamed: 0,sample_id,tissue,000187c4-a488-40f0-a69c-0a89582f3241_ENSG00000173848.18,00026598-3078-4e2f-8ac9-dd8f523396b9_ENSG00000102893.15_ENSG00000102893.15,0002a5e2-f01a-4690-a7db-7af726712a5e_ENSG00000071575.11_ENSG00000071575.11,000339f1-1769-4608-b369-59aa222cd7b7_ENSG00000166012.16_ENSG00000166012.16,0003706a-94a7-4419-a61d-6310d7a9c10c_ENSG00000101247.17_ENSG00000101247.17,0004a186-852c-448e-a74c-afdad4fdf319_ENSG00000204356.13_ENSG00000204356.13,0004af18-8dff-40d9-814d-32a45ded762e_ENSG00000007237.18_ENSG00000007237.18,0004bfb5-133e-4202-9562-4744a79252db_ENSG00000066697.14_ENSG00000066697.14,...,fff834dd-554d-46c7-b30a-b92e4be6dd61_ENSG00000131069.19,fff8c888-5eac-4f53-af8e-30a9aa66be58_ENSG00000140750.16,fff8d294-c55c-4453-9aa7-7d8db1e114e2_ENSG00000188917.14_ENSG00000188917.14,fff941ee-d99e-4f68-94b0-4b5ab30a789c_ENSG00000232973.11_ENSG00000232973.11,fffa0051-00fd-4289-86c4-114fa1fd36a1_ENSG00000197444.9_ENSG00000197444.9,fffa90eb-bfc8-49d3-9936-cd1d4ea47708_ENSG00000141030.12_ENSG00000141030.12,fffabc59-08f8-4f01-9835-099e5dfe5bd6_ENSG00000113407.13_ENSG00000113407.13,fffc8c18-b616-42ce-8c53-71b184c0cab8_ENSG00000156261.12_ENSG00000156261.12,fffd4120-f790-4c5d-b903-52dc4a97c8e7_ENSG00000137171.14_ENSG00000137171.14,fffe94ff-bd9f-4c45-b693-96945bf0f3dc_ENSG00000106049.8_ENSG00000106049.8
0,GTEX-1192X-0011-R10a-SM-4RXXZ,Brain - Frontal Cortex (BA9),0.0,0.0,0.380702,0.0,14.847375,0.761404,0.0,0.0,...,0.0,0.0,0.0,8.756144,0.0,0.761404,0.0,0.0,0.0,0.0
1,GTEX-11H98-0011-R11b-SM-4SFLZ,Brain - Cerebellar Hemisphere,0.0,0.0,5.359084,0.297727,18.459067,0.297727,0.0,0.595454,...,0.0,0.0,0.0,0.0,0.595454,2.679542,0.595454,0.297727,0.893181,0.297727
2,GTEX-11TTK-0011-R7b-SM-4TVFS,Brain - Putamen (basal ganglia),0.0,0.0,1.961873,0.0,17.656857,0.0,0.0,0.0,...,0.0,0.0,0.0,3.923746,0.0,0.0,0.0,0.0,0.980936,0.0
3,GTEX-1211K-0826-SM-7LDFQ,Lung,0.0,0.0,4.590649,1.020144,4.080577,0.0,0.0,0.0,...,0.0,0.510072,0.0,0.510072,0.0,0.0,0.0,0.510072,0.0,0.510072
4,GTEX-1313W-0011-R7b-SM-4ZL3U,Brain - Putamen (basal ganglia),0.0,0.0,1.793796,0.0,10.762776,0.0,0.0,1.793796,...,0.0,0.0,0.0,3.587592,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
annotated_transcript_express_tissue_clean.head(5)

Unnamed: 0,sample_id,tissue,ENST00000000233.9_ENSG00000004059.10,ENST00000000412.7_ENSG00000003056.7,ENST00000001008.5_ENSG00000004478.7,ENST00000001146.6_ENSG00000003137.8,ENST00000002125.8_ENSG00000003509.15,ENST00000002165.10_ENSG00000001036.13,ENST00000002501.10_ENSG00000003249.13,ENST00000002596.5_ENSG00000002587.9,...,ENST00000640621.1_ENSG00000262633.2,ENST00000640621.1-1_ENSG00000262633.2,ENST00000640674.1_ENSG00000278175.3,ENST00000640752.1_ENSG00000138796.16,ENST00000640769.1_ENSG00000176225.13,ENST00000640799.1_ENSG00000143612.19,ENST00000640815.1_ENSG00000164199.16,ENST00000640876.1_ENSG00000197563.10,ENST00000640893.1_ENSG00000087258.14,ENST00000640967.1_ENSG00000082212.12
0,GTEX-1192X-0011-R10a-SM-4RXXZ,Brain - Frontal Cortex (BA9),373.468579,20.557903,22.080711,0.0,3.045615,3.045615,5.710529,1.142106,...,2.664913,0.0,3.045615,0.380702,0.0,0.380702,1.522808,0.0,7.233336,0.380702
1,GTEX-11H98-0011-R11b-SM-4SFLZ,Brain - Cerebellar Hemisphere,369.181337,16.374979,24.115878,2.084088,0.595454,2.679542,1.786361,25.902239,...,1.786361,0.0,1.786361,0.297727,0.893181,0.893181,0.595454,0.0,7.145445,0.297727
2,GTEX-11TTK-0011-R7b-SM-4TVFS,Brain - Putamen (basal ganglia),256.024421,9.809365,7.847492,0.0,0.980936,6.866555,0.980936,0.980936,...,2.942809,0.0,2.942809,0.0,0.980936,0.0,0.980936,0.0,4.904682,0.0
3,GTEX-1211K-0826-SM-7LDFQ,Lung,115.786376,22.443174,8.161154,0.0,0.510072,25.503607,0.510072,9.691371,...,4.080577,0.0,1.020144,0.0,1.020144,0.510072,0.510072,0.510072,0.0,0.0
4,GTEX-1313W-0011-R7b-SM-4ZL3U,Brain - Putamen (basal ganglia),263.688009,26.90694,23.319348,0.0,5.381388,5.381388,1.793796,1.793796,...,3.587592,0.0,0.0,1.793796,3.587592,1.793796,1.793796,0.0,10.762776,0.0


#### Export Dataframes

In [35]:
os.getcwd()

'C:\\Users\\15082\\OneDrive\\Desktop\\thesis_research'

In [36]:
#novel_transcript_express_tissue_clean.to_csv('gtex_v9_data\\data_for_analysis\\gtex_database_data\\flair_novel_transcript_quant_clean.csv', sep=',')

In [37]:
#annotated_transcript_express_tissue_clean.to_csv('gtex_v9_data\\data_for_analysis\\gtex_database_data\\flair_annotated_transcript_quant_clean.csv', sep=',')

### Section 4: Data Filtering

Keep transcripts with expression above 5 TPM in at least 3 samples.

In [38]:
def transcript_filter(df,TPM_threshold,sample_count_min):
    """Keep only the transcripts expressed above a threshold in enough samples.

    Every column other than 'sample_id' and 'tissue' is treated as a transcript
    expression column. A transcript is kept when its value is strictly greater
    than ``TPM_threshold`` in at least ``sample_count_min`` rows (samples).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample; must contain the columns 'sample_id' and 'tissue'.
    TPM_threshold : float
        Expression cut-off; the comparison is strict (``>``).
    sample_count_min : int
        Minimum number of rows that must exceed the cut-off (inclusive, ``>=``).

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with 'sample_id' first, 'tissue' second and the
        retained transcript columns after them. The row index of ``df`` is
        preserved and ``df`` itself is left untouched.

    Raises
    ------
    KeyError
        If ``df`` lacks a 'sample_id' or 'tissue' column.
    """
    meta_cols = ['sample_id', 'tissue']
    # Index.difference returns the remaining labels in sorted order, which fixes
    # the order of the transcript columns in the result.
    transcript_cols = df.columns.difference(meta_cols)
    # Number of samples in which each transcript exceeds the threshold
    n_samples_above = (df[transcript_cols] > TPM_threshold).sum(axis=0)
    columns_to_keep = n_samples_above[n_samples_above >= sample_count_min].index
    # Select everything in one step and copy explicitly: no columns are assigned
    # onto a derived slice, so no chained-assignment (SettingWithCopy) warning is
    # triggered and the caller receives a frame it can modify safely.
    return df.loc[:, meta_cols + list(columns_to_keep)].copy()

#### Novel Transcripts

In [39]:
# Novel transcripts expressed above 5 TPM in at least 3 samples
novel_df_filtered = transcript_filter(
    novel_transcript_express_tissue_clean,
    TPM_threshold=5,
    sample_count_min=3,
)

In [40]:
novel_df_filtered.shape

(88, 18219)

In [41]:
novel_df_filtered.head(3)

Unnamed: 0,sample_id,tissue,0002a5e2-f01a-4690-a7db-7af726712a5e_ENSG00000071575.11_ENSG00000071575.11,0003706a-94a7-4419-a61d-6310d7a9c10c_ENSG00000101247.17_ENSG00000101247.17,0004bfb5-133e-4202-9562-4744a79252db_ENSG00000066697.14_ENSG00000066697.14,0005f2e9-ea77-45a6-9ae4-f8c2990b165e_ENSG00000100296.13_ENSG00000100296.13,0016b099-8dcc-460d-9d7d-dc9443a490a1_ENSG00000131507.10_ENSG00000131507.10,0024f966-977a-4085-bfad-70aab00544c9_chr20:45970000_chr20:45970000,00296fb4-cf8b-4405-837e-2366b814b986_ENSG00000075884.13_ENSG00000075884.13,0030624f-ad69-4841-957f-e659b079b3ab_ENSG00000114019.14_ENSG00000114019.14,...,ffc1795c-a0a5-48c7-96b6-6d25829849b0_chr1:172366000_chr1:172366000,ffc35d5c-68b8-443f-86b0-6b5f4f57d199_ENSG00000168002.11_ENSG00000168002.11,ffc48bde-53f4-45df-8912-cfb9fa25bcfa-1_ENSG00000104611.11_ENSG00000104611.11,ffc52f34-a6c6-4dac-af97-6d2f4f27db65_ENSG00000168003.16_ENSG00000168003.16,ffcab65e-81fc-4819-9a34-9bd7179b9fb5_ENSG00000117425.13_ENSG00000117425.13,ffcbb15a-0409-471f-97e2-909c083084ad_ENSG00000066044.14_ENSG00000066044.14,ffcd666a-24a9-4d16-8cc3-9922b6bdd8d6_chr8:24953000_ENSG00000277586.2,ffdd9470-700c-4445-b294-12f7640a4779_ENSG00000082153.17_ENSG00000082153.17,ffe59936-8224-4b94-8444-53aa6229ee91_ENSG00000011009.10_ENSG00000011009.10,fff2fd1d-3961-4657-bf61-1637498fce2a_ENSG00000143624.13
0,GTEX-1192X-0011-R10a-SM-4RXXZ,Brain - Frontal Cortex (BA9),0.380702,14.847375,0.0,0.761404,57.866691,0.761404,0.380702,0.380702,...,5.710529,100.886008,1.142106,1.522808,2.284211,0.0,4.568423,1.90351,1.142106,3.807019
1,GTEX-11H98-0011-R11b-SM-4SFLZ,Brain - Cerebellar Hemisphere,5.359084,18.459067,0.595454,0.297727,54.781747,2.084088,0.0,0.0,...,39.597676,160.177064,0.0,8.336353,10.122714,2.977269,2.381815,0.595454,7.145445,10.718168
2,GTEX-11TTK-0011-R7b-SM-4TVFS,Brain - Putamen (basal ganglia),1.961873,17.656857,0.0,0.0,102.99833,1.961873,0.980936,0.0,...,29.428094,117.712378,0.0,5.885619,4.904682,4.904682,1.961873,0.0,0.0,16.67592


#### Annotated Transcripts

In [42]:
# Annotated transcripts expressed above 5 TPM in at least 3 samples
annotated_df_filtered = transcript_filter(
    annotated_transcript_express_tissue_clean,
    TPM_threshold=5,
    sample_count_min=3,
)

In [43]:
annotated_df_filtered.shape

(88, 12048)

In [44]:
annotated_df_filtered.head(3)

Unnamed: 0,sample_id,tissue,ENST00000000233.9_ENSG00000004059.10,ENST00000000412.7_ENSG00000003056.7,ENST00000001008.5_ENSG00000004478.7,ENST00000001146.6_ENSG00000003137.8,ENST00000002125.8_ENSG00000003509.15,ENST00000002165.10_ENSG00000001036.13,ENST00000002501.10_ENSG00000003249.13,ENST00000002596.5_ENSG00000002587.9,...,ENST00000640051.1_ENSG00000262633.2,ENST00000640099.1_ENSG00000089818.17,ENST00000640218.1_ENSG00000153187.18,ENST00000640443.1_ENSG00000108433.16,ENST00000640608.1_ENSG00000262633.2,ENST00000640610.1_ENSG00000007372.21,ENST00000640621.1_ENSG00000262633.2,ENST00000640769.1_ENSG00000176225.13,ENST00000640815.1_ENSG00000164199.16,ENST00000640893.1_ENSG00000087258.14
0,GTEX-1192X-0011-R10a-SM-4RXXZ,Brain - Frontal Cortex (BA9),373.468579,20.557903,22.080711,0.0,3.045615,3.045615,5.710529,1.142106,...,17.131586,2.284211,4.568423,4.187721,3.045615,0.380702,2.664913,0.0,1.522808,7.233336
1,GTEX-11H98-0011-R11b-SM-4SFLZ,Brain - Cerebellar Hemisphere,369.181337,16.374979,24.115878,2.084088,0.595454,2.679542,1.786361,25.902239,...,5.954538,4.465903,8.63408,7.443172,1.786361,2.679542,1.786361,0.893181,0.595454,7.145445
2,GTEX-11TTK-0011-R7b-SM-4TVFS,Brain - Putamen (basal ganglia),256.024421,9.809365,7.847492,0.0,0.980936,6.866555,0.980936,0.980936,...,5.885619,0.0,2.942809,2.942809,0.980936,0.0,2.942809,0.980936,0.980936,4.904682


#### Export dataframe

In [45]:
#novel_df_filtered.to_csv('gtex_v9_data\\data_for_analysis\\gtex_database_data\\flair_novel_transcript_quant_clean_filtered.csv', sep=',')

In [46]:
#annotated_df_filtered.to_csv('gtex_v9_data\\data_for_analysis\\gtex_database_data\\flair_annotated_transcript_quant_clean_filtered.csv', sep=',')