# Feature selection

In [1]:
import pandas as pd
import numpy as np
import torch
import os
# Every relative path used in this notebook ('../annotations/...', '../matrices/...')
# is resolved against this working directory.
# NOTE(review): machine-specific absolute path — consider making it configurable.
os.chdir('/home/jiageng/Documents/fhr/pipeline/')
import snf
from sklearn.feature_selection import *

In [2]:
# Load the FHR annotation table keyed by PUBLIC_ID, drop the rows whose
# `risk` is -1, and shift the remaining risk labels up by one.
# The shift is done with .assign inside the chain (instead of `+=` on the
# filtered frame) so no column is set on a filtered copy, which would emit
# SettingWithCopyWarning on pandas versions without copy-on-write.
fhr_data = (
    pd.read_csv('../annotations/fhr-annotations.tsv', sep='\t')
    .set_index('PUBLIC_ID')
    .query('risk != -1')
    .assign(risk=lambda annotations: annotations['risk'] + 1)
)

In [63]:
# RNA class to run feature selection on. Accepted values:
#   'gene' (protein-coding genes), 'mirna', 'lncrna'
subtype = 'mirna'

In [64]:
# Load the expression matrix (`rnamat`, indexed by PUBLIC_ID, SAMPLE column
# dropped) and the gene annotation table (`ensgid`, indexed by
# 'Gene stable ID') for the selected subtype.
# All paths are relative to the working directory set at the top of the notebook.
subtype_files = {
    # subtype: (expression matrix file, annotation file)
    'gene': ('gene_exp_matrix.tsv', 'ensgid-autosomal-proteincoding.txt'),
    'mirna': ('mirna_exp_matrix.tsv', 'ensgid-autosomal-mirna.txt'),
    'lncrna': ('lncrna_exp_matrix.tsv', 'ensgid-autosomal-lncrna.txt'),
}

if subtype not in subtype_files:
    # Raise a real exception class: raising a bare tuple fails with
    # "TypeError: exceptions must derive from BaseException" and hides the message.
    raise ValueError(f'Unknown subtype: {subtype!r}')

matrix_file, annotation_file = subtype_files[subtype]
rnamat = (
    pd.read_csv(f'../matrices/{matrix_file}', sep='\t')
    .set_index('PUBLIC_ID')
    .drop(columns=['SAMPLE'])
)
ensgid = (
    pd.read_csv(f'../annotations/{annotation_file}', sep='\t')
    .set_index('Gene stable ID')
)

print(subtype, rnamat.shape)

mirna (806, 553)


In [43]:
# Samples that have both an FHR label and an expression profile.
# Sorted so the row order of X / y is identical on every run; iterating a
# plain set of strings yields a different order in each kernel session.
public_ids = sorted(set(fhr_data.index) & set(rnamat.index))
print(len(public_ids))

701


In [5]:
def stdNormalize(df):
    """Z-score every numeric column of ``df`` and pass the other columns through.

    Each numeric column is centred on its mean and divided by its standard
    deviation (pandas default, i.e. sample std). Columns whose standard
    deviation is undefined (NaN, e.g. a single row) or exactly zero (constant
    column) are only centred, so they come out as zeros instead of NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; may mix numeric and non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``. Non-numeric columns come first (unchanged),
        followed by the normalized numeric columns, so the column order can
        differ from the input.
    """
    numeric_df = df.select_dtypes(include=[np.number])
    # Scale of 1 for NaN std and for zero std: avoids 0/0 -> NaN, which the
    # downstream scikit-learn selectors would reject.
    scale = numeric_df.std().fillna(1).replace(0, 1)
    centered = numeric_df - numeric_df.mean()
    numeric_df_norm = centered / scale
    nonnumeric_df = df.select_dtypes(exclude=[np.number])
    df_norm = pd.concat([nonnumeric_df, numeric_df_norm], axis=1)
    return df_norm

In [65]:
# Features: z-scores are computed over every sample in rnamat, and only then
# are the rows restricted to the labelled samples.
X = stdNormalize(rnamat).loc[public_ids, :]
# Target: risk label of the same samples, in the same order.
y = fhr_data['risk'].loc[public_ids]

Feature selection controlling the family-wise error rate (FWE) at a significance level of 0.05

For miRNA this selects only 3 genes, which is too stringent.

In [60]:
# Univariate selection with f_classif scores, keeping the features that pass
# the family-wise error threshold alpha.
fwe = SelectFwe(score_func=f_classif, alpha=0.05)
fwe.fit(X, y)
print(fwe.get_feature_names_out().shape[0])
# Display the annotation rows of the selected genes.
ensgid.loc[fwe.get_feature_names_out(), :]

3


Unnamed: 0_level_0,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Gene name
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000207744,2,176150303,176150412,MIR10B
ENSG00000264349,1,154975693,154975783,MIR4258
ENSG00000284419,20,26208186,26208278,MIR663A


In [61]:
# Export the FWE-selected columns of rnamat (values as loaded, not the
# z-scored X, and all samples in rnamat, not only the labelled ones).
rnamat.loc[:, fwe.get_feature_names_out()].to_csv(
    f'../matrices/{subtype}_exp_matrix_fwe.tsv',
    sep='\t',
)

Feature selection controlling the false discovery rate (FDR) at 0.05

In [66]:
# Univariate selection with f_classif scores, keeping the features that pass
# the false-discovery-rate threshold alpha.
fdr = SelectFdr(score_func=f_classif, alpha=0.05)
fdr.fit(X, y)
print(fdr.get_feature_names_out().shape[0])
# Display the annotation rows of the selected genes.
ensgid.loc[fdr.get_feature_names_out(), :]

4


Unnamed: 0_level_0,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Gene name
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000207744,2,176150303,176150412,MIR10B
ENSG00000264349,1,154975693,154975783,MIR4258
ENSG00000274466,16,24203116,24203231,MIR1273H
ENSG00000284419,20,26208186,26208278,MIR663A


In [67]:
# Export the FDR-selected columns of rnamat (values as loaded, not the
# z-scored X, and all samples in rnamat, not only the labelled ones).
rnamat.loc[:, fdr.get_feature_names_out()].to_csv(
    f'../matrices/{subtype}_exp_matrix_fdr.tsv',
    sep='\t',
)

Feature selection controlling the false positive rate (FPR) at 0.05

In [68]:
# Univariate selection with f_classif scores, keeping the features that pass
# the false-positive-rate threshold alpha.
fpr = SelectFpr(score_func=f_classif, alpha=0.05)
fpr.fit(X, y)
print(fpr.get_feature_names_out().shape[0])
# Display the annotation rows of the selected genes.
ensgid.loc[fpr.get_feature_names_out(), :]

42


Unnamed: 0_level_0,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Gene name
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000199023,7,1022933,1023026,MIR339
ENSG00000199082,14,100109655,100109753,MIR342
ENSG00000199090,11,75335092,75335186,MIR326
ENSG00000207590,1,220117853,220117962,MIR215
ENSG00000207626,2,232172653,232172747,MIR562
ENSG00000207696,22,37847678,37847774,MIR659
ENSG00000207744,2,176150303,176150412,MIR10B
ENSG00000207821,19,19435063,19435158,MIR640
ENSG00000207874,11,28056815,28056910,MIR610
ENSG00000207974,1,168375524,168375621,MIR557


In [69]:
# Export the FPR-selected columns of rnamat (values as loaded, not the
# z-scored X, and all samples in rnamat, not only the labelled ones).
rnamat.loc[:, fpr.get_feature_names_out()].to_csv(
    f'../matrices/{subtype}_exp_matrix_fpr.tsv',
    sep='\t',
)

Feature selection by the K-best method (keep the k highest-scoring features)

In [72]:
# Number of top-scoring features to keep; also embedded in the output file name.
k = 20
kbest = SelectKBest(score_func=f_classif, k=k)
kbest.fit(X, y)
# Export the selected columns of rnamat (values as loaded, all samples in rnamat).
rnamat.loc[:, kbest.get_feature_names_out()].to_csv(
    f'../matrices/{subtype}_exp_matrix_k{k}.tsv',
    sep='\t',
)