In [None]:
from wmbio import * 

## **Preprocessing**

### **File Path**

In [None]:
# Move to the parent of the directory the kernel started in; the path constants
# defined later are all built from the working directory.
# The start directory is remembered the first time this cell runs so that
# re-running the cell does not climb one more level on every execution.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))
os.getcwd()

In [None]:
# Cancer cohort identifier passed to the data-loading and DEG helpers.
CANCER_TYPE = "LIHC"

# Every location below is derived from the current working directory, which is
# read once here instead of once per constant.
_cwd = os.getcwd()

# Input locations.
RAW_file_path = f"{_cwd}/RAW_DATA/"
PKL_file_path = f"{_cwd}/pkl/"

# Output locations.
MODEL_PATH = f"{_cwd}/models/"
TENSORBOARD_PATH = f"{_cwd}/log"  # note: no trailing slash, unlike the others
GROUP_PHTH = f"{_cwd}/group/"  # (sic) spelling kept; later cells use this name
PNG_PATH = f"{_cwd}/png/"
GROUP_VALIDATION_PATH = f"{_cwd}/group_validation/"
DEG_PATH = f"{_cwd}/deg/"

* **Data-Load**

In [None]:
# Load the dataset for the configured cancer type from the pickle / raw-data
# directories (norm=True presumably requests normalised values -- confirm in wmbio).
omics = load_tcga_dataset(
    pkl_path=PKL_file_path,
    raw_path=RAW_file_path,
    cancer_type=CANCER_TYPE,
    norm=True,
)

# Hold out 20% of the rows as X_test; the fixed random_state keeps the shuffled
# split identical between runs.
X_train, X_test = train_test_split(
    omics,
    test_size=0.2,
    random_state=21,
    shuffle=True,
)

## **Best Auto-Encoder & K-Means Clustering**

In [None]:
# One pass = fit three auto-encoders, let best_ae_model pick a grouping, score
# that grouping, append the scores to the validation CSV, and extract DEGs.
# The range bound sets how many passes are run.
for _ in range(1):
    # Score accumulators are rebuilt on every pass, so each pass contributes
    # exactly one row to the CSV written further down.
    log_pvalue_l, silhouette_score_l, rna_anovar_f1, rna_rf_f1 = [], [], [], []
    mirna_anovar_f1, mirna_rf_f1, mt_anovar_f1, mt_rf_f1 = [], [], [], []
    file_name = []

    # Timestamp that identifies this pass in every artefact name.
    FILE_NAME = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    # Presumably the group file produced by best_ae_model -- confirm in wmbio.
    SAMPLE_GROUP = GROUP_PHTH + CANCER_TYPE + "_GROUP_" + FILE_NAME + ".txt"
    file_name.append(FILE_NAME)
    print(FILE_NAME)

    ## AE (vanilla, sparse, denoising) - model compile & fit
    encoder_vanilla = run_ae(X_train=X_train, X_test=X_test, tensorboard_path=TENSORBOARD_PATH)
    encoder_sparse = run_ae_sparse(X_train=X_train, X_test=X_test, tensorboard_path=TENSORBOARD_PATH)
    encoder_denoisy = run_ae_denoisy(X_train=X_train, X_test=X_test, tensorboard_path=TENSORBOARD_PATH)

    # best_ae_model receives all three encoders plus the full dataset and
    # returns the sample grouping together with its silhouette score.
    group, silhouette_score = best_ae_model(
        model_list=[encoder_vanilla, encoder_sparse, encoder_denoisy],
        o=omics,
        group_path=GROUP_PHTH,
        model_path=MODEL_PATH,
        cancer_type=CANCER_TYPE,
        file_name=FILE_NAME,
    )

    ## Sub-group evaluation
    ### Load preprocessed data for the same cohort as every other step
    ### (uses CANCER_TYPE rather than a hard-coded literal).
    omics_preprocess = load_preprocess_tcga_dataset(
        pkl_path=PKL_file_path,
        raw_path=RAW_file_path,
        group=group,
        norm=True,
        cancer_type=CANCER_TYPE,
    )

    ### Feature selection (ANOVA, RandomForest) for SVM
    feature_result = feature_selection_svm(data_type=["rna", "mirna", "mt"], o=omics_preprocess)

    ### Survival analysis - log-rank test
    log_pvalue = log_rank_test(df=omics_preprocess["omics"], png_path=PNG_PATH, file_name=FILE_NAME)

    ### Scores
    log_pvalue_l.append(log_pvalue)
    silhouette_score_l.append(silhouette_score)

    # Judging by the target list names, entry [0] is the ANOVA result, entry [1]
    # the RandomForest result, and element [2] of each is an F1 score --
    # TODO confirm against feature_selection_svm.
    rna_anovar_f1.append(feature_result["rna"][0][2])
    rna_rf_f1.append(feature_result["rna"][1][2])

    mirna_anovar_f1.append(feature_result["mirna"][0][2])
    mirna_rf_f1.append(feature_result["mirna"][1][2])

    mt_anovar_f1.append(feature_result["mt"][0][2])
    mt_rf_f1.append(feature_result["mt"][1][2])

    # Run a garbage-collection pass before the next stage.
    gc.collect()

    # Build the one-row score table for this pass.
    score_df = pd.DataFrame({
        'FILENAME' : file_name,
        'Log Rank Test' : log_pvalue_l,
        'Silhouette' : silhouette_score_l,
        'RNA_ANOVA_F1' : rna_anovar_f1,
        'RNA_RF_F1' : rna_rf_f1,
        'miRNA_ANOVA_F1' : mirna_anovar_f1,
        'miRNA_RF_F1' : mirna_rf_f1,
        'Methylation_ANOVA_F1' : mt_anovar_f1,
        'Methylation_RF_F1' : mt_rf_f1
    })

    # Append the row to the shared score CSV. Whether to write the header is
    # decided from the CSV file itself (missing or empty), not from the
    # directory, so a pre-existing directory without the file still produces a
    # CSV that starts with column names.
    Path(GROUP_VALIDATION_PATH).mkdir(parents=True, exist_ok=True)
    score_csv = GROUP_VALIDATION_PATH + "test_validation.csv"
    write_header = not os.path.exists(score_csv) or os.path.getsize(score_csv) == 0
    score_df.to_csv(score_csv, index=False, mode='a', header=write_header)

    # DEG extraction for this pass's sample grouping.
    deg_list = deg_extract(
        log_fc=1.5,
        fdr=0.05,
        cancer_type=CANCER_TYPE,
        sample_group=SAMPLE_GROUP,
        deg_path=DEG_PATH,
        file_name=FILE_NAME,
    )