In [1]:
# -*- coding: UTF-8 -*-
import os, re, sys
import pandas as pd
import numpy as np

import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.feature_selection import mutual_info_regression as MIR
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder

from utils import filter_feature_selection

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
workdir = "./"
indir = "./data"  # walked recursively for expression matrices
outdir = "./outdir/feature selection"  # one sub-directory per tumor type is created here
tail = '.tcga_gtex.tpm.updown.csv'  # literal filename suffix identifying an expression matrix

# ---------------------------------------------------------------------------
# Per-tumor-type feature selection
#
# For every "<tumor_type><tail>" file found under `indir`:
#   1. load the expression matrix (rows indexed by the "Gene" column, one
#      column per sample) and the matching "<tumor_type>.clinical.csv";
#   2. keep only samples present in both tables;
#   3. drop genes with a value > 0 in fewer than 50% of the samples;
#   4. drop genes with zero variance across samples;
#   5. run `filter_feature_selection` against the encoded "sample_type" label;
#   6. write the result and record its shape in `datasets_summary`.
# ---------------------------------------------------------------------------
datasets_summary = {}  # tumor type -> shape of the matrix after feature selection
for pardir, subdirs, curfiles in os.walk(indir):
    for subfile in curfiles:
        # `tail` is a literal suffix, not a regular expression: matching it with
        # re.search would let every "." match any character and would also accept
        # names that merely contain the suffix somewhere in the middle.
        if not subfile.endswith(tail):
            continue

        subfile_abspath = os.path.join(pardir, subfile)
        tumor_type = subfile.split(".")[0]
        os.makedirs(os.path.join(outdir, tumor_type), exist_ok=True)

        # Rows with any missing value are discarded before further filtering.
        tpm_df = pd.read_csv(subfile_abspath).dropna(how="any", axis=0)
        # Assign raw values so the index stays unnamed (keeps the CSV header
        # of the written file unchanged).
        tpm_df.index = tpm_df['Gene'].values
        tpm_df = tpm_df.drop(columns=['Gene'])

        # NOTE(review): the clinical table is looked up under indir/<tumor_type>/,
        # not next to the matrix (`pardir`) - confirm this matches the data layout.
        clinical_df = pd.read_csv(os.path.join(indir, tumor_type, tumor_type + ".clinical.csv"))
        clinical_df.index = clinical_df["sample_name"].values

        # Shared samples, kept in the matrix's own column order. A plain set
        # intersection has an arbitrary order that can change between runs,
        # which made the output column order (and `y`) non-reproducible.
        clinical_samples = set(clinical_df.index)
        samples = [sample for sample in tpm_df.columns if sample in clinical_samples]
        if not samples:
            # Nothing to learn from; the estimators below would otherwise fail
            # with an unrelated-looking error on an empty matrix.
            print(f"SKIP {tumor_type}: no samples shared between {subfile} and its clinical table")
            continue
        tpm_df = tpm_df[samples]
        clinical_df = clinical_df.loc[samples]

        # Integer-encoded class label, aligned with the columns of tpm_df.
        y = LabelEncoder().fit_transform(clinical_df.loc[:, "sample_type"])

        # Keep genes with a value > 0 in at least half of the samples.
        # Boolean-mask selection (instead of an in-place drop by label) builds a
        # new frame rather than mutating a subset of another one, and filters
        # row by row even if gene labels repeat.
        expressed_fraction = (tpm_df > 0).sum(axis=1) / tpm_df.shape[1]
        tpm_df = tpm_df.loc[(expressed_fraction >= 0.5).values]

        # Remove genes that are constant across samples (samples are the rows
        # of the transposed matrix handed to the selector).
        var_selector = VarianceThreshold(threshold=0).fit(tpm_df.T)
        tpm_df = tpm_df.loc[var_selector.get_support()]

        # Project helper from utils; presumably returns the matrix restricted to
        # the selected genes - verify against its implementation.
        tpm_df = filter_feature_selection(tpm_df, y, label_type='classif', methods='mutual_info')
        tpm_df.to_csv(os.path.join(outdir, tumor_type, os.path.splitext(subfile)[0] + ".feature_selection.csv"),
                      sep=',', index=True)

        datasets_summary[tumor_type] = tpm_df.shape

# One column per tumor type; rows are the two components of the recorded shape.
datasets_summary = pd.DataFrame(datasets_summary, index=["gene", "sample"])
# Ensure the output directory exists even when no dataset was processed.
os.makedirs(outdir, exist_ok=True)
datasets_summary.to_csv(os.path.join(outdir, "datasets.summary.csv"), sep=',', index=True)
print("DONE!")


DONE!
