# Get relevant gene list.

### Read and manipulate gtf file

In [35]:
import pandas as pd
import re
import numpy as np
import json

In [36]:
# Annotation file (GTF) from which gene_id / gene_type are extracted below.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
PATH_GTF_FILE = "/homes/mcolombari/AI_for_Bioinformatics_Project/Personal/gencode.v47.annotation.gtf"
# Folder that receives 'gene_id_protein_coding.json'.
OUTPUT_FOLDER_GENE_ID = "."
# When True, the list of protein-coding gene ids is written to OUTPUT_FOLDER_GENE_ID.
SAVE_GENE_ID = True

In [37]:
# Load the annotation as a raw, header-less, tab-separated table;
# lines starting with '#' are treated as comments and skipped.
gtf = pd.read_csv(
    PATH_GTF_FILE,
    sep="\t",
    comment='#',
    header=None,
)

In [38]:
# The table was read without a header: name its nine columns.
gtf.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']

# Promote selected keys of the free-text 'attribute' column (entries of the
# form `key "value"`) to real columns. `str.extract` is vectorized and returns
# the first match of the capture group, or NaN when the key is absent, which
# is what the previous per-row `re.findall(...)[0]` lambda produced.
parameters = ['gene_id', 'gene_type']
for p in parameters:
    gtf[p] = gtf['attribute'].str.extract(rf'{p} "([^"]*)"', expand=False)

# The raw attribute text is no longer needed; rebind instead of mutating in place.
gtf = gtf.drop(columns='attribute')

print(gtf)

        seqname   source     feature  start    end score strand frame  \
0          chr1   HAVANA        gene  11121  24894     .      +     .   
1          chr1   HAVANA  transcript  11121  14413     .      +     .   
2          chr1   HAVANA        exon  11121  11211     .      +     .   
3          chr1   HAVANA        exon  12010  12227     .      +     .   
4          chr1   HAVANA        exon  12613  12721     .      +     .   
...         ...      ...         ...    ...    ...   ...    ...   ...   
4105480    chrM  ENSEMBL  transcript  15888  15953     .      +     .   
4105481    chrM  ENSEMBL        exon  15888  15953     .      +     .   
4105482    chrM  ENSEMBL        gene  15956  16023     .      -     .   
4105483    chrM  ENSEMBL  transcript  15956  16023     .      -     .   
4105484    chrM  ENSEMBL        exon  15956  16023     .      -     .   

                   gene_id gene_type  
0        ENSG00000290825.2    lncRNA  
1        ENSG00000290825.2    lncRNA  
2     

In [39]:
# Sanity check of the column layout after the attribute parsing step.
print(gtf.columns)

Index(['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand',
       'frame', 'gene_id', 'gene_type'],
      dtype='object')


In [40]:
# Keep only the annotation rows whose gene_type is exactly 'protein_coding'
# (rows with a missing gene_type compare as False and are dropped).
is_protein_coding = gtf['gene_type'].eq('protein_coding')
gtf_pc = gtf.loc[is_protein_coding]
print(gtf_pc)

        seqname   source      feature  start    end score strand frame  \
2486       chr1   HAVANA         gene  65419  71585     .      +     .   
2487       chr1   HAVANA   transcript  65419  71585     .      +     .   
2488       chr1   HAVANA         exon  65419  65433     .      +     .   
2489       chr1   HAVANA         exon  65520  65573     .      +     .   
2490       chr1   HAVANA          CDS  65565  65573     .      +     0   
...         ...      ...          ...    ...    ...   ...    ...   ...   
4105474    chrM  ENSEMBL         gene  14747  15887     .      +     .   
4105475    chrM  ENSEMBL   transcript  14747  15887     .      +     .   
4105476    chrM  ENSEMBL         exon  14747  15887     .      +     .   
4105477    chrM  ENSEMBL          CDS  14747  15887     .      +     0   
4105478    chrM  ENSEMBL  start_codon  14747  14749     .      +     0   

                   gene_id       gene_type  
2486     ENSG00000186092.7  protein_coding  
2487     ENSG00000186

### Save the gene IDs of the protein-coding genes

In [41]:
# Many rows (gene, transcript, exon, ...) share one gene_id:
# collapse them to the set of distinct ids and report how many there are.
gtf_pc_set = set(gtf_pc['gene_id'])
print(len(gtf_pc_set))

20092


The value printed above matches the estimated number of protein-coding genes in the human genome.
source: [link](https://www.genome.gov/genetics-glossary/Gene#:~:text=And%20genes%20are%20the%20part,of%20the%20entire%20human%20genome.)

In [42]:
# Optionally persist the protein-coding gene ids as a JSON array.
if SAVE_GENE_ID:
    with open(OUTPUT_FOLDER_GENE_ID + "/" + 'gene_id_protein_coding.json', 'w', encoding='utf-8') as f:
        # A set has no stable iteration order, so dumping list(set) produced a
        # differently ordered file on every run. Sorting makes the artifact
        # reproducible and diff-friendly; the content is the same ids.
        json.dump(sorted(gtf_pc_set), f, ensure_ascii=False, indent=4)

# Actual preprocessing

### Now load the data and parse it

In [46]:
import torch
import os

In [47]:
# Root folder that is scanned for the per-case gene-expression files.
# NOTE(review): absolute, cluster-specific path — adjust before running elsewhere.
PATH_FOLDER_GENE = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/GeneExpression"
# JSON file keyed by case id; each entry is read for its ['files']['gene']
# file name and its ['os'] value.
PATH_CASE_ID_STRUCTURE = "./case_id_and_structure.json"

In [48]:
# Load the case-id -> metadata mapping. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open(PATH_CASE_ID_STRUCTURE, 'r', encoding='utf-8') as file:
    file_parsed = json.load(file)

In [53]:
# Reverse lookups keyed by the gene-expression file name of each case, so a
# file found on disk can be mapped back to its case id and to its 'os' value.
file_to_case_id = {
    case_info['files']['gene']: case_id
    for case_id, case_info in file_parsed.items()
}
file_to_os = {
    case_info['files']['gene']: case_info['os']
    for case_info in file_parsed.values()
}

In [67]:
# One row per case: its id, its 'os' value and the filtered expression table.
datastructure = pd.DataFrame(columns=['case_id', 'os', 'values'])

# Count columns kept from each expression file. Other columns available:
#     'stranded_first', 'stranded_second',
#     'tpm_unstranded', 'fpkm_unstranded', 'fpkm_uq_unstranded'
feature_to_save = ['unstranded']

# Gene ids carry a version suffix ("ENSG00000186092.7"). The GTF used as the
# reference is release v47; if the expression files were quantified against a
# different release (NOTE(review): confirm), an exact string match silently
# drops every protein-coding gene whose version number changed. Matching is
# therefore done on the part before the first '.'.
# The file's own 'gene_type' column is deliberately not used: the GTF is the
# reference for what counts as protein coding.
gtf_pc_base_ids = {
    gene_id.split('.', 1)[0]
    for gene_id in gtf_pc_set
    if isinstance(gene_id, str)
}

index = 0
# A single recursive walk. Each file is opened at the directory where it was
# actually found (the previous nested walk rebuilt the path from the top-level
# directory name only, which breaks for files nested more than one level deep
# and never looked at files directly inside PATH_FOLDER_GENE).
for root, dirs, files in os.walk(PATH_FOLDER_GENE):
    dirs.sort()  # in-place sort => deterministic traversal order
    for file in sorted(files):
        if file not in file_to_case_id:
            continue

        # Line 1 is used as the header; line 0 and lines 2-5 are skipped
        # (presumably a comment line and summary rows — verify on a sample file).
        parsed_file = pd.read_csv(os.path.join(root, file),
                                  sep='\t', header=0, skiprows=lambda x: x in (0, 2, 3, 4, 5))
        parsed_file = parsed_file[['gene_id'] + feature_to_save]

        # Keep protein-coding genes only; the stored 'gene_id' column keeps
        # its original (versioned) value.
        base_ids = parsed_file['gene_id'].str.split('.', n=1).str[0]
        parsed_file = parsed_file[base_ids.isin(gtf_pc_base_ids)]

        datastructure.loc[index] = [
            file_to_case_id[file],
            file_to_os[file],
            parsed_file
        ]
        index += 1

In [68]:
# Show the filtered expression table stored for the first case.
datastructure.loc[0, 'values']

Unnamed: 0,gene_id,unstranded
1,ENSG00000000005.6,2
3,ENSG00000000457.14,882
4,ENSG00000000460.17,427
5,ENSG00000000938.13,195
7,ENSG00000001036.14,1727
...,...,...
60650,ENSG00000288661.1,0
60655,ENSG00000288669.1,0
60657,ENSG00000288671.1,0
60658,ENSG00000288674.1,7
