# Find the top model per cohort and platform, and create its feature list

In [1]:
import pandas as pd

In [2]:
# Load the large combined feature matrix (tab-separated).
# low_memory=False has pandas parse the file in a single pass instead of in
# chunks, which avoids mixed-type inference within a column.
ft_matrix_path = '../../../../08_manuscript/featureSetML_TCGA/src/classifier_metrics_20210821/collected_features_matrix.tsv'
ft_df = pd.read_csv(ft_matrix_path, sep='\t', low_memory=False)

In [3]:
# Write one feature-list file per (cancer cohort, platform) pair.
#
# For each pair, the top model and its feature-set ID are looked up in the
# per-platform `skgrid_best_models_*` table; the feature IDs flagged '1' in the
# matching `<feature-set ID>_<cancer>` column of `ft_df` are then written, one
# per line, to ../data/src/training_data/<cancer>_<platform>_featurelist.txt.
cancer_list = [
    'ACC', 'BRCA', 'BLCA', 'CESC',
    'COADREAD', 'ESCC', 'GEA', 'HNSC',
    'KIRCKICH', 'KIRP', 'LGGGBM', 'LIHCCHOL',
    'LUAD', 'LUSC', 'MESO', 'OV',
    'PAAD', 'PCPG', 'PRAD', 'SARC',
    'SKCM', 'TGCT', 'THCA', 'THYM',
    'UCEC', 'UVM'
]
platform_list = ['OVERALL', 'CNVR', 'GEXP', 'METH', 'MIR', 'MUTA']

# Cohort/platform pairs for which no model was run:
#   KIRCKICH, LGGGBM, LIHCCHOL do not have a MIR-only model
#   MESO, OV, PAAD do not have a MUTA model
no_model = {
    'MIR': {'KIRCKICH', 'LGGGBM', 'LIHCCHOL'},
    'MUTA': {'MESO', 'OV', 'PAAD'},
}

# Number of leading rows of ft_df dropped before selecting features.
# NOTE(review): presumably these are non-feature header/metadata rows - confirm.
n_leading_rows = 3

# Read each per-platform best-model table once, rather than once per cancer.
best_models = {
    name: pd.read_csv('./skgrid_best_models_{}_2022-04-05_v1.tsv'.format(name), sep='\t')
    for name in platform_list
}

for cancer in cancer_list:

    # Find top model name and associated ftset name
    for platform in platform_list:

        # Not all cancers have all platforms - skip if no model
        if cancer in no_model.get(platform, ()):
            print('...skipping {} for {} because no model ran'.format(cancer, platform))
            continue

        print(platform)
        df = best_models[platform]
        top = df[df['Cohort'] == cancer]
        if top.empty:
            # Fail with context instead of a bare `KeyError: 0`.
            raise KeyError(
                'no best model listed for cohort {} on platform {}'.format(cancer, platform)
            )
        # The first matching row is taken as the top model.
        model = top['Model'].iloc[0]
        ftid = top['Features'].iloc[0]

        print(model)
        print(ftid)

        # Select the features BEFORE opening the output file, so that a missing
        # column raises without leaving an empty feature-list file behind.
        ft_column = ftid + '_' + cancer
        s1 = ft_df[['featureID', ft_column]].iloc[n_leading_rows:]
        # Test only the selection column; including featureID in the comparison
        # would wrongly select a feature whose ID happens to be '1'.
        # NOTE(review): the flag is compared to the string '1', which assumes the
        # column was read as strings - confirm.
        s1 = s1.loc[s1[ft_column] == '1']

        # Save ft names as a file
        output = '../data/src/training_data/{}_{}_featurelist.txt'.format(cancer, platform)
        with open(output, 'w') as out:
            out.writelines(a + '\n' for a in s1['featureID'])

OVERALL
ExtraTrees(criterion=gini,n_estimators=128)
skgrid_ACC_fbedeBIC_perplatformALL
CNVR
SGD(alpha=0.01,loss=squared_hinge,penalty=l1)
skgrid_ACC_rfe15_perplatformCNVR
GEXP
LogisticRegression(C=0.01,max_iter=500,solver=lbfgs)
skgrid_ACC_fbedeBIC_perplatformGEXP
METH
GaussianNB(var_smoothing=0.0001)
skgrid_ACC_fbedeBIC_perplatformMETH
MIR
LogisticRegression(C=0.01,max_iter=500,solver=lbfgs)
skgrid_ACC_fbedeBIC_perplatformMIR
MUTA
AdaBoost(learning_rate=0.01,n_estimators=1000)
skgrid_ACC_fbedeBIC_perplatformMUTA
OVERALL
RandomForest(criterion=entropy,n_estimators=200)
skgrid_BRCA_fbedeBIC_combined
CNVR
LogisticRegression(C=1.0,max_iter=500,solver=lbfgs)
skgrid_BRCA_fbedeBIC_perplatformCNVR
GEXP
RandomForest(criterion=entropy,n_estimators=200)
skgrid_BRCA_fbedeBIC_perplatformGEXP
METH
RandomForest(criterion=gini,n_estimators=200)
skgrid_BRCA_rfe15_perplatformMETH
MIR
LogisticRegression(C=100,max_iter=500,solver=newton-cg)
skgrid_BRCA_fbedeBIC_perplatformMIR
MUTA
BernoulliNB(alpha=0.3)


MIR
LogisticRegression(C=100,max_iter=500,solver=liblinear)
skgrid_OV_fbedeBIC_perplatformMIR
...skipping OV for MUTA because no model ran
OVERALL
LogisticRegression(C=100,max_iter=500,solver=lbfgs)
skgrid_PAAD_fbedeBIC_combined
CNVR
SGD(alpha=0.0001,loss=modified_huber,penalty=l1)
skgrid_PAAD_fbedeBIC_perplatformCNVR
GEXP
LogisticRegression(C=100,max_iter=500,solver=lbfgs)
skgrid_PAAD_fbedeBIC_perplatformGEXP
METH
SGD(alpha=0.0001,loss=log,penalty=l1)
skgrid_PAAD_rfe15_perplatformMETH
MIR
SVC(C=0.2,kernel=linear)
skgrid_PAAD_rfe15_perplatformMIR
...skipping PAAD for MUTA because no model ran
OVERALL
RandomForest(criterion=gini,n_estimators=120)
skgrid_PCPG_fbedeBIC_perplatformALL
CNVR
SVC(C=2,kernel=poly)
skgrid_PCPG_fbedeBIC_perplatformCNVR
GEXP
LogisticRegression(C=0.1,max_iter=500,solver=newton-cg)
skgrid_PCPG_fbedeBIC_perplatformGEXP
METH
SVC(C=2,kernel=rbf)
skgrid_PCPG_rfe15_perplatformMETH
MIR
LogisticRegression(C=1.0,max_iter=500,solver=lbfgs)
skgrid_PCPG_fbedeBIC_perplatformMI