In [2]:
import os
import re
from collections import OrderedDict
import numpy as np


In [43]:
########################################
##### gencode.v26.long_noncoding_RNAs
# Reduce the GENCODE v26 lncRNA GTF to one tab-separated row per `gene`
# record: gene ID, gene name, biotype, first tag ('NA' when absent), source.

# `with` flushes and closes the output even if a malformed line raises.
with open('gencode.v26.long_noncoding_RNAs.clean.txt', 'wt') as fh, \
        open('gencode.v26.long_noncoding_RNAs.gtf') as f:
    fh.write('geneID\tgeneName\tbiotype\tgeneTag\tSource\n')
    for line in f:
        line = line.rstrip()
        if line.startswith('#'):
            continue
        lst = line.split('\t')
        if lst[2] != 'gene':
            continue

        source = lst[1]
        geneInfo = lst[-1].split(';')
        # The first three attributes are read by position:
        # gene_id, biotype, gene_name.
        geneId = geneInfo[0].split('"')[1]
        geneName = geneInfo[2].split('"')[1]
        biotype = geneInfo[1].split('"')[1]

        # Look the tag up by key in the attribute column only. Searching the
        # whole line for the substring 'tag' and then reading attribute #4
        # picks the wrong attribute (or raises) whenever the optional
        # attributes are ordered differently.
        tagMatch = re.search(r'\btag "([^"]*)"', lst[-1])
        geneTag = tagMatch.group(1) if tagMatch else 'NA'

        fh.write('\t'.join([geneId, geneName, biotype, geneTag, source])+'\n')

In [45]:
# Annotate every ID in TCGA_gene_list.txt with the gene name and biotype
# from the cleaned GENCODE v26 lncRNA table ('NA' for both when unmatched).
gencode = OrderedDict()

with open('gencode.v26.long_noncoding_RNAs.clean.txt') as f:
    for line in f:
        lst = line.rstrip().split('\t')

        if lst[0] == 'geneID':          # header row
            continue

        # Keyed on the ID with its '.version' suffix removed.
        gencode[lst[0].split('.')[0]] = lst[1]+'\t'+lst[2]

# Both handles are context-managed so the output is flushed and closed even
# if a row raises part-way through.
with open('TCGA_gene_list.txt') as f, \
        open('TCGA.gencode.v26.long_noncoding_RNAs.clean.txt', 'wt') as fh:
    fh.write('Ensembl\tSymbol\tBiotype\n')
    for line in f:
        lst = line.rstrip().split('\t')

        gene = lst[0].split('.')[0]

        # Rows whose ID starts with '__' are not genes and are dropped.
        if gene.startswith('__'):
            continue

        # The original (versioned) ID is written; the lookup uses the
        # unversioned one.
        fh.write(lst[0]+'\t'+gencode.get(gene, 'NA\tNA')+'\n')

In [2]:
# Lookup table: gene ID without its '.version' suffix -> gene name, read
# from the cleaned GENCODE v26 lncRNA table.
gencode = OrderedDict()

with open('gencode.v26.long_noncoding_RNAs.clean.txt') as f:
    for row in f:
        fields = row.rstrip().split('\t')

        # Every row except the 'geneID' header contributes one entry.
        if fields[0] != 'geneID':
            gencode[fields[0].split('.')[0]] = fields[1]

In [9]:
# Insert a gene-symbol column directly after the gene-ID column of the
# differential-expression table. Relies on the `gencode` ID -> name map.
# Handles are context-managed so the output is closed even on error.
with open('Table_S1.Differentially_expressed_lncRNAs.txt') as f, \
        open('Table_S1.Differentially_expressed_lncRNAs.symbol.txt', 'wt') as fh:
    for line in f:
        line = line.rstrip()
        lst = line.split('\t')

        if lst[0].startswith('logFC'):
            # The header starts at 'logFC' (no label for the ID column), so
            # an empty cell plus 'symbol' is prepended to match the data rows.
            fh.write('\t'+'symbol'+'\t'+line+'\n')

        else:
            gene = lst[0].split('.')[0]
            # IDs missing from the annotation get 'NA' instead of aborting
            # the whole export with a KeyError.
            symbol = gencode.get(gene, 'NA')
            fh.write(lst[0]+'\t'+symbol+'\t'+'\t'.join(lst[1:])+'\n')

In [10]:
# Insert a gene-symbol column directly after the gene-ID column of the
# univariate Cox regression table. Relies on the `gencode` ID -> name map.
# Handles are context-managed so the output is closed even on error.
with open('NEW/NEW.lncRNA.univariate.coxph.txt') as f, \
        open('NEW/NEW.lncRNA.univariate.coxph.symbol.txt', 'wt') as fh:
    for line in f:
        line = line.rstrip()
        lst = line.split('\t')

        if lst[0].startswith('coef'):
            # The header starts at 'coef' (no label for the ID column), so
            # an empty cell plus 'symbol' is prepended to match the data rows.
            fh.write('\t'+'symbol'+'\t'+line+'\n')

        else:
            gene = lst[0].split('.')[0]
            # IDs missing from the annotation get 'NA' instead of aborting
            # the whole export with a KeyError.
            symbol = gencode.get(gene, 'NA')
            fh.write(lst[0]+'\t'+symbol+'\t'+'\t'.join(lst[1:])+'\n')

In [8]:
############# input for survival analysis

def therapy_info(x):
    """Collapse a table of therapy annotations to one YES/NO flag per column.

    x is a sequence of equal-length rows. A column is flagged 'YES' when any
    of its entries equals 'YES' or 'Chemotherapy'; every other column is 'NO'.
    Returns a list with one flag per column.
    """
    table = np.array(x)
    n_columns = table.shape[1]
    flags = []
    for col in range(n_columns):
        entries = table[:, col]
        treated = 'YES' in entries or 'Chemotherapy' in entries
        flags.append('YES' if treated else 'NO')
    return flags

def therapy_info2(x):
    """Collapse a table of therapy annotations to YES/NO/NA per column.

    x is a sequence of equal-length rows. For each column:
      * 'YES' if any entry equals 'YES' or 'Chemotherapy';
      * otherwise 'NO' if any entry equals 'NO';
      * otherwise 'NA'.
    Returns a list with one value per column.
    """
    table = np.array(x)

    def summarise(entries):
        # Precedence matters: a positive answer wins over an explicit 'NO'.
        if 'YES' in entries or 'Chemotherapy' in entries:
            return 'YES'
        if 'NO' in entries:
            return 'NO'
        return 'NA'

    return [summarise(table[:, col]) for col in range(table.shape[1])]


def max_trait(x):
    """Return the per-column maximum of a table of numeric strings.

    x is a sequence of equal-length rows whose entries are numeric strings
    or 'NA'. 'NA' entries are ignored; a column made up only of 'NA' yields
    'NA'. Whole numbers are rendered without the trailing '.0'
    (e.g. '365'); other values keep their decimals (e.g. '10.05').
    Returns a list of strings, one per column.
    """
    import warnings

    # Work on an object array: writing NaN into a fixed-width string array
    # truncates 'nan' to the array's item width (e.g. 'na' for '<U2'), which
    # then fails to convert to float.
    raw = np.array(x, dtype=object)
    values = np.full(raw.shape, np.nan, dtype=float)
    known = raw != 'NA'
    values[known] = raw[known].astype(float)

    with warnings.catch_warnings():
        # An all-'NA' column is expected; nanmax returns NaN for it but
        # would otherwise emit a RuntimeWarning.
        warnings.simplefilter('ignore', category=RuntimeWarning)
        best = np.nanmax(values, axis=0)

    result = []
    for v in best:
        if np.isnan(v):
            result.append('NA')
            continue
        text = str(float(v))
        # Strip only a trailing '.0'; replacing '.0' anywhere would turn
        # '10.05' into '105'.
        result.append(text[:-2] if text.endswith('.0') else text)
    return result


# Build the cleaned clinical table for TCGA-CESC. Selected trait rows are
# copied through as-is; fields that occur in several rows of the merged
# file (therapy flags, follow-up / death times) are collected and collapsed
# to a single output row each.

survivalDa = OrderedDict()

for trt in ['chemo','radiation','targeted_molecular','days_to_last_followup','days_to_death',
           'primary_therapy_outcome_success','new_tumor_event_after_initial_treatment']:
    survivalDa[trt]=[]


# Rows copied through unchanged (apart from removing spaces).
traits = ['gender','ethnicity','race','pathologic_stage', 
          'pathologic_T', 'pathologic_N', 'pathologic_M',
          'residual_tumor','tobacco_smoking_history',
          'number_pack_years_smoked','days_to_new_tumor_event_after_initial_treatment',
          'kras_mutation_found'] #,'person_neoplasm_cancer_status',

# (row-name prefix, survivalDa key). Rows are matched by prefix, so several
# rows sharing a prefix all accumulate under the same key.
survivalPrefixes = [
    ('days_to_last_followup', 'days_to_last_followup'),
    ('days_to_death', 'days_to_death'),
    ('therapy_type', 'chemo'),
    ('radiation_therapy', 'radiation'),
    ('targeted_molecular_therapy', 'targeted_molecular'),
    ('primary_therapy_outcome_success', 'primary_therapy_outcome_success'),
    ('new_tumor_event_after_initial_treatment', 'new_tumor_event_after_initial_treatment'),
]

# Both handles are context-managed so the output is flushed and closed even
# if parsing or one of the helper functions raises.
with open('Merged_clinical_data.TCGA-CESC.txt') as f, \
        open('Clinical_data.clean.TCGA-CESC.txt', 'wt') as fh:
    for line in f:
        line = line.rstrip()
        lst = line.split('\t')

        if lst[0] == '':
            # Header row: keep only the cells that look like TCGA IDs.
            samples = [sam for sam in lst if sam.startswith('TCGA')]
            fh.write('\t'+'\t'.join(samples)+'\n')
        elif lst[0] == 'age_at_initial_pathologic_diagnosis':
            fh.write('age'+'\t'+'\t'.join(lst[1:])+'\n')
        elif lst[0] in traits:
            lst = [trt.replace(' ', '') for trt in lst]
            fh.write('\t'.join(lst)+'\n')
        else:
            for prefix, ke in survivalPrefixes:
                if lst[0].startswith(prefix):
                    survivalDa[ke].append(lst[1:])
                    break

    for ke in ['chemo','radiation','targeted_molecular']:
        fh.write(ke+'\t'+'\t'.join(therapy_info2(survivalDa[ke]))+'\n')

    # 'primary_therapy_outcome_success' rows are collected but not exported.
    fh.write('neoplasm_recurrence\t'+'\t'.join(therapy_info2(survivalDa['new_tumor_event_after_initial_treatment']))+'\n')

    # Collapse days_to_death once; it feeds both its own row and vital_status.
    deathDays = max_trait(survivalDa['days_to_death'])
    fh.write('days_to_death'+'\t'+'\t'.join(deathDays)+'\n')
    fh.write('days_to_last_followup'+'\t'+'\t'.join(max_trait(survivalDa['days_to_last_followup']))+'\n')

    # vital_status is 1 when any days_to_death value is recorded, else 0.
    vital_status = [0 if x == 'NA' else 1 for x in deathDays]
    fh.write('vital_status'+'\t'+'\t'.join(map(str,vital_status))+'\n')



In [9]:
# Restrict the cleaned clinical table to the samples listed in
# basic_clinical.CESC.txt whose last barcode field starts with '01', and
# relabel the columns with the full sample barcodes.

patients = []
samples = []
with open('basic_clinical.CESC.txt') as f:
    for line in f:
        line = line.rstrip()
        lst = line.split('\t')
        if lst[0].startswith('TCGA'):
            samInfo = lst[0].split('-')
            if samInfo[-1].startswith('01'):
                # Patient ID = first three barcode fields. This equals
                # lst[0][:-4] for a three-character sample field and stays
                # correct for any other length.
                patients.append('-'.join(samInfo[0:3]))
                samples.append(lst[0])

samIndex = None
with open('Clinical_data.clean.TCGA-CESC.txt') as f, \
        open('Clinical_data.final.TCGA-CESC.txt', 'wt') as fh:
    for line in f:
        line = line.rstrip()
        lst = line.split('\t')

        if lst[0] == '':
            # First column position of every header cell (same choice as
            # list.index, without rescanning the header per patient).
            columnOf = {}
            for pos, name in enumerate(lst):
                columnOf.setdefault(name, pos)

            # Keep patient and sample paired, so the header names exactly
            # the columns that the data rows will contain.
            matched = [(columnOf[pat], sam)
                       for pat, sam in zip(patients, samples) if pat in columnOf]
            samIndex = [pos for pos, _ in matched]
            finalSam = [lst[i] for i in samIndex]

            fh.write('\t'+'\t'.join(sam for _, sam in matched)+'\n')
        else:
            if samIndex is None:
                raise ValueError('data row encountered before the header row '
                                 'in Clinical_data.clean.TCGA-CESC.txt')
            trait = [lst[i] for i in samIndex]
            fh.write(lst[0]+'\t'+'\t'.join(trait)+'\n')
            

In [20]:
# Sanity check: print 'NO' for every position where the patient list and
# the selected clinical columns disagree (no output means they line up).
for i in range(len(patients)):
    if patients[i] != finalSam[i]:
        print('NO')

In [19]:
# Number of entries collected in `patients`.
len(patients)

501

In [12]:
# Check that dropping the last four characters of a sample barcode leaves
# the first three dash-separated fields.
'TCGA-77-8156-02A'[:-4]

'TCGA-77-8156'

In [4]:
# Report patients that appear more than once in basic_clinical.txt, ignoring
# rows whose last barcode field is exactly '11A'. `sam` ends up holding each
# patient ID once, in first-seen order.
sam = []
with open('basic_clinical.txt') as f:
    for line in f:
        barcode = line.rstrip().split('\t')[0]
        if not barcode.startswith('TCGA'):
            continue

        samInfo = barcode.split('-')
        if samInfo[-1] == '11A':
            continue

        # Patient ID = first three barcode fields.
        patient = '-'.join(samInfo[0:3])
        if patient in sam:
            print (patient)
        else:
            sam.append(patient)

TCGA-21-1076


In [8]:
# Annotate every ID in TCGA_gene_list.txt with the symbol and biotype from
# the Ensembl GRCh38.86 biotype table ('NA' for both when unmatched).
ensembl = OrderedDict()

with open('Homo_sapiens.GRCh38.86.biotype.txt') as f:
    for line in f:
        lst = line.rstrip().split('\t')

        if lst[0] == 'geneID':          # header row
            continue

        # Keyed on the ID with any '.version' suffix removed.
        ensembl[lst[0].split('.')[0]] = lst[1]+'\t'+lst[2]

# Both handles are context-managed so the output is flushed and closed even
# if a row raises part-way through.
with open('TCGA_gene_list.txt') as f, \
        open('TCGA.Homo_sapiens.GRCh38.86.biotype.txt', 'wt') as fh:
    fh.write('Ensembl\tSymbol\tBiotype\n')
    for line in f:
        lst = line.rstrip().split('\t')

        gene = lst[0].split('.')[0]

        # Rows whose ID starts with '__' are not genes and are dropped.
        if gene.startswith('__'):
            continue

        # The original (versioned) ID is written; the lookup uses the
        # unversioned one.
        fh.write(lst[0]+'\t'+ensembl.get(gene, 'NA\tNA')+'\n')

In [2]:
# Copy the LUSC count matrix, dropping the rows whose ID starts with '__'
# (each dropped ID is printed) and removing one column.

# 0-based index, after splitting on tabs, of the column to remove.
# NOTE(review): why this particular column is excluded is not recorded in
# the notebook - confirm before reusing on other data.
dropColumn = 513

# Both handles are context-managed so the output is flushed and closed even
# if a row raises part-way through.
with open('RNAseq.counts_only.TCGA-LUSC.txt') as f, \
        open('RNAseq.counts_only.TCGA-LUSC.clean.txt', 'wt') as fh:
    for line in f:
        lst = line.rstrip().split('\t')

        if lst[0].startswith('__'):
            print (lst[0])
            continue

        fh.write('\t'.join(lst[0:dropColumn]+lst[dropColumn+1:])+'\n')
        

__no_feature
__ambiguous
__too_low_aQual
__not_aligned
__alignment_not_unique


In [3]:
import os
import re
from collections import OrderedDict
import numpy as np

# NOTE(review): absolute, machine-specific working directory. Relative paths
# opened after this call resolve against it, so the cells that follow only
# run where this directory exists.
os.chdir('/home/ruidong/Documents/Research/share/lncRNA-LUSC-LUAD/NEW')

In [8]:
# Lookup table built from the Ensembl GRCh38.86 biotype table, with two
# kinds of keys:
#   gene ID (any '.version' suffix removed) -> 'symbol<TAB>biotype'
#   symbol                                  -> 'biotype'
ensembl = OrderedDict()

with open('../Homo_sapiens.GRCh38.86.biotype.txt') as f:
    for row in f:
        fields = row.rstrip().split('\t')

        if fields[0] == 'geneID':       # header row
            continue

        ensId, symbol, biotypeName = fields[0].split('.')[0], fields[1], fields[2]
        ensembl[ensId] = symbol+'\t'+biotypeName
        ensembl[symbol] = biotypeName

In [15]:
# For every row of MEM.gene3.RNAseq.txt, write its second field next to
# what `ensembl` holds for it ('NA' when absent).
# Both handles are context-managed so the output is closed even on error.
with open('MEM.gene3.RNAseq.txt') as f, open('test.txt','wt') as fh:
    for line in f:
        lst = line.rstrip().split('\t')
        fh.write(lst[1]+'\t'+ensembl.get(lst[1], 'NA')+'\n')

In [7]:
# Spot-check the lookup table for a single gene ID.
ensembl['ENSG00000273066']

'RP11-216L13.19\tlong_non_coding'

In [27]:
# Build a per-gene table of (gene name, biotype) across every '*gtf.txt'
# file in Ensembl/, one column pair per file, 'NA' where a gene is absent.

# Raw strings: '\S' inside a plain literal is an invalid escape sequence.
# The patterns expect the double quotes to have been stripped already.
geneIdPat = re.compile(r'gene_id (\S+);')
geneNamePat = re.compile(r'gene_name (\S+);')
geneTypePat = re.compile(r'gene_biotype (\S+);')

# Files are processed in reverse lexicographic order of their names.
files = [fl for fl in sorted(os.listdir('Ensembl/'), reverse=True)
         if fl.endswith('gtf.txt')]

# Two columns per file; derived from the file count so that adding a file
# cannot overflow a fixed-size row.
nColumns = 2 * len(files)

ensembl = OrderedDict()
i = -1
for fl in files:
    i += 2          # i indexes this file's biotype column, i-1 its name column
    print (i)
    with open('Ensembl/'+fl) as f:
        for line in f:
            line = line.rstrip()

            line = line.replace('"','')
            if line.startswith('#'):
                continue

            attributes = line.split('\t')[-1]
            geneID = geneIdPat.search(attributes).group(1)
            geneName = geneNamePat.search(attributes).group(1)
            geneType = geneTypePat.search(attributes).group(1)

            if geneID not in ensembl:
                ensembl[geneID] = ['NA']*nColumns

            ensembl[geneID][i-1] = geneName
            ensembl[geneID][i] = geneType

# Context-managed so the table is flushed and closed even on error.
with open('Ensembl/test.txt', 'wt') as fh:
    for ke, val in ensembl.items():
        fh.write(ke+'\t'+'\t'.join(val)+'\n')

1
3
5
7
9
11
13
15


In [16]:

# Record the gene name of every gene in the Ensembl release-90 GTF extract
# in column 0 of a seven-column row; the other six columns stay 'NA'.
ensembl = OrderedDict()
i = 0       # column filled by this file

# Raw strings: '\S' inside a plain literal is an invalid escape sequence.
# The patterns expect the double quotes to have been stripped already.
geneIdPat = re.compile(r'gene_id (\S+);')
geneNamePat = re.compile(r'gene_name (\S+);')

with open('Ensembl/Homo_sapiens.GRCh38.90.gtf.txt') as f:
    for line in f:
        line = line.rstrip()
        line = line.replace('"','')
        if line.startswith('#'):
            continue

        attributes = line.split('\t')[-1]
        geneID = geneIdPat.search(attributes).group(1)
        geneName = geneNamePat.search(attributes).group(1)

        if geneID not in ensembl:
            ensembl[geneID] = ['NA']*7

        ensembl[geneID][i] = geneName
        

In [25]:
# Dump the `ensembl` table: one row per key, followed by its tab-joined
# values. Context-managed so the file is flushed and closed even on error.
with open('Ensembl/test.txt', 'wt') as fh:
    for ke, val in ensembl.items():
        fh.write(ke+'\t'+'\t'.join(val)+'\n')

In [16]:
# Build a per-gene table of gene names across every '*gtf.txt' file in
# Ensembl/19/, one column per file, 'NA' where a gene is absent.

# Raw strings: '\S' inside a plain literal is an invalid escape sequence.
# The patterns expect the double quotes to have been stripped already.
geneIdPat = re.compile(r'gene_id (\S+);')
geneNamePat = re.compile(r'gene_name (\S+);')

# Files are processed in reverse lexicographic order of their names.
files = [fl for fl in sorted(os.listdir('Ensembl/19/'), reverse=True)
         if fl.endswith('gtf.txt')]

# One column per file; derived from the file count so that adding a file
# cannot overflow a fixed-size row.
nColumns = len(files)

ensembl = OrderedDict()
i = -1
for fl in files:
    i += 1          # column filled by this file
    print (i)
    with open('Ensembl/19/'+fl) as f:
        for line in f:
            line = line.rstrip()

            line = line.replace('"','')
            if line.startswith('#'):
                continue

            attributes = line.split('\t')[-1]
            geneID = geneIdPat.search(attributes).group(1)
            geneName = geneNamePat.search(attributes).group(1)

            if geneID not in ensembl:
                ensembl[geneID] = ['NA']*nColumns

            ensembl[geneID][i] = geneName

# Context-managed so the table is flushed and closed even on error.
with open('Ensembl/test.txt', 'wt') as fh:
    for ke, val in ensembl.items():
        fh.write(ke+'\t'+'\t'.join(val)+'\n')

0
1
2
3
4
5
6
7
8
9


In [5]:
########################################
##### GENCODE v27 / v20 / v19 gene-name comparison
# For every gene ID (version suffix removed), record the gene name used by
# each of three GENCODE releases, in the column order v27, v20, v19.
# A column stays 'NA' when the gene is absent from that release.

gencode = OrderedDict()

# (annotation file, output column)
releases = [
    ('gencode/gencode.v27.annotation.gtf', 0),
    ('gencode/gencode.v20.annotation.gtf', 1),
    ('gencode/gencode.v19.annotation.gtf_withproteinids', 2),
]

# Attributes are read by key, so one loop serves all three files even
# though gene_name sits at a different position in the older releases.
geneIdPat = re.compile(r'\bgene_id "([^"]*)"')
geneNamePat = re.compile(r'\bgene_name "([^"]*)"')

for path, column in releases:
    with open(path) as f:
        for line in f:
            line = line.rstrip()
            if line.startswith('#'):
                continue
            lst = line.split('\t')
            if lst[2] != 'gene':
                continue

            geneId = geneIdPat.search(lst[-1]).group(1).split('.')[0]
            geneName = geneNamePat.search(lst[-1]).group(1)

            if geneId not in gencode:
                gencode[geneId] = ['NA']*len(releases)

            gencode[geneId][column] = geneName

# Context-managed so the table is flushed and closed even on error.
with open('gencode/gencode.v27-20-19.comparison.txt', 'wt') as fh:
    for ke, val in gencode.items():
        fh.write(ke+'\t'+'\t'.join(val)+'\n')

In [12]:
# Count and print the genes whose first and second recorded names differ,
# skipping any gene whose second or third name is 'NA'.
i = 0
for ke, val in gencode.items():
    if val[0] == val[1]:
        continue
    if 'NA' in (val[2], val[1]):
        continue
    i += 1
    print (ke+'\t'+'\t'.join(val))

ENSG00000243485	MIR1302-2HG	MIR1302-11	MIR1302-11
ENSG00000268020	AL627309.6	OR4G4P	OR4G4P
ENSG00000238009	AL627309.1	RP11-34P13.7	RP11-34P13.7
ENSG00000239945	AL627309.3	RP11-34P13.8	RP11-34P13.8
ENSG00000268903	AL627309.7	RP11-34P13.15	RP11-34P13.15
ENSG00000269981	AL627309.8	RP11-34P13.16	RP11-34P13.16
ENSG00000239906	AL627309.2	RP11-34P13.14	RP11-34P13.14
ENSG00000241860	AL627309.5	RP11-34P13.13	RP11-34P13.13
ENSG00000241599	AL627309.4	RP11-34P13.9	RP11-34P13.9
ENSG00000228463	AP006222.1	AP006222.2	AP006222.2
ENSG00000236679	RPL23AP24	RP4-669L17.1	RP4-669L17.1
ENSG00000236601	AL732372.1	RP4-669L17.2	RP4-669L17.2
ENSG00000237094	AL732372.2	RP4-669L17.10	RP4-669L17.10
ENSG00000250575	AL732372.3	RP4-669L17.8	RP4-669L17.8
ENSG00000230021	AL669831.3	RP5-857K21.4	RP5-857K21.4
ENSG00000235146	AC114498.1	RP5-857K21.2	RP5-857K21.2
ENSG00000237973	MTCO1P12	MIR6723	hsa-mir-6723
ENSG00000229344	MTCO2P12	RP5-857K21.7	RP5-857K21.7
ENSG00000198744	MTCO3P12	RP5-857K21.11	RP5-857K21.11
ENSG00000228

In [13]:
# Display the counter `i` (presumably the tally left by the preceding
# comparison loop - depends on cell execution order).
i

21363