In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import os
from tqdm.notebook import tqdm
import ml_utils
import pickle
from sklearn.ensemble import RandomForestClassifier

In [3]:
selected_gene_types = [
    "lncRNA",
    "TEC",
    "protein_coding",
    "transcribed_unitary_pseudogene",
    "transcribed_unprocessed_pseudogene",
    "transcribed_processed_pseudogene",
    "miRNA",
]

In [4]:
df_master=None
for gene_type in tqdm(selected_gene_types):
    fpath = os.path.join("..","output",gene_type,"dataset.in.csv")
    df = pd.read_csv(fpath)
    if df_master is None:
        df_master = df
    else:
        df_master = df_master.merge(df, on=['uuid', 'dataset', 'cancer_type'], how='outer')
df_master.fillna(0, inplace=True)
df_master=df_master.sample(frac=1).reset_index(drop=True)

  0%|          | 0/7 [00:00<?, ?it/s]

In [5]:
df_master

Unnamed: 0,cancer_type,uuid,A1BG-AS1,A2M-AS1,A2ML1-AS1,A2ML1-AS2,AA06,AADACL2-AS1,AATBC,ABALON,...,MIRLET7B,MIRLET7C,MIRLET7D,MIRLET7E,MIRLET7F1,MIRLET7F2,MIRLET7G,MIRLET7I,SNORD138,Z97988.1
0,colon,91a95c38-5ab7-4166-b4e7-64454cb3d06c,0.0783,0.1344,0.0000,0.0000,0.0000,0.2080,0.2168,0.0000,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0
1,colon,56c23214-8903-47cc-8290-e8bd59f24d11,0.3757,0.9890,0.0000,0.0000,0.0000,0.0000,0.6762,0.0000,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0
2,prostate-gland,d3819816-a957-49a8-923c-a52825a5ad61,0.2808,1.6070,0.0000,0.0000,0.0000,0.0000,3.2076,0.0000,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0
3,thyroid_gland,a06414d2-981e-4ddb-b34b-af5d504d606f,2.2755,0.6010,0.0000,0.0000,0.0000,0.0000,0.3194,0.0000,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,4.4740,0.0,0.0
4,kidney,cd6333d6-6973-4278-8c7d-ad89e71bd508,0.0962,2.4859,0.0000,0.0000,0.0000,0.0000,0.5995,0.0487,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,1.1043,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8793,breast,0d669e80-cb31-4afb-bd3f-07d69f10eb3f,1.4448,1.0461,0.0000,0.0000,0.3148,0.3133,0.2041,0.0000,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0
8794,corpus_uteri,8fe816e4-57fc-4aac-a726-f0172f72d206,4.5223,0.6319,0.0000,0.0000,0.0000,0.4191,28.9393,0.0000,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0
8795,corpus_uteri,53f894dc-239b-47f9-9211-055d867791c8,0.4976,0.9018,0.0000,0.0000,0.0000,0.0000,3.2153,0.0000,...,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0
8796,kidney,b1c42fbb-018d-4267-bb1e-4b34470f84d5,0.1208,3.1944,0.0000,0.1056,0.0000,0.0000,0.1673,0.0000,...,0.0,0.4853,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0


In [6]:
import gc
del df 
gc.collect()

17

In [7]:
clf_class=RandomForestClassifier
clf_name="RandomForestClassifier"
for nc in tqdm([10*i for i in range(6,7)]):
    pca = ml_utils.build_pca(df_master, n_components=nc)
    df_pca = pd.DataFrame(pca.transform(df_master.drop(columns=['cancer_type', 'uuid', 'dataset'])))
    pca_columns = [f'pca_{i}' for i, _ in enumerate(df_pca.columns)]
    df_pca.columns = pca_columns
    df_pca = df_pca.assign(uuid=df_master.uuid, cancer_type=df_master.cancer_type, dataset=df_master.dataset)
    df_pca = df_pca[['uuid', 'cancer_type', 'dataset'] + pca_columns]
    mm = ml_utils.build_minmax_scaler(df_pca)
    model = ml_utils.build_classifier(df_master, pca, mm, clf_class=clf_class, clf_kws={'max_depth':15, 'n_estimators':100})
    df_out = ml_utils.add_predictions(df_master, pca, model, mm)
    df_pca.to_csv(f"../output/FINAL/dataset.pca.{nc}.csv", index=False)
    df_out[['uuid', 'dataset', 'cancer_type','logreg_prediction']].to_csv(f"../output/FINAL/dataset.out.{clf_name}.{nc}.csv", index=False)    
    with open(f"../output/FINAL/pca.{nc}.pickle", "wb") as f:
        pickle.dump(pca, f)
    with open(f"../output/FINAL/mm.{nc}.pickle", "wb") as f:
        pickle.dump(mm, f)
    with open(f"../output/FINAL/{clf_name}.{nc}.pickle", "wb") as f:
        pickle.dump(model, f)

    del df_pca, df_out, model, pca
    gc.collect()

  0%|          | 0/1 [00:00<?, ?it/s]



In [8]:
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
accuracies = []
for nc in [10*i for i in range(6,7)]:
    df = pd.read_csv(f"../output/FINAL/dataset.out.{clf_name}.{nc}.csv")
    
    acc_validation = accuracy_score(
        df.query('dataset=="Validation"').cancer_type,
        df.query('dataset=="Validation"').logreg_prediction,
    )
    
    acc_train = accuracy_score(
        df.query('dataset=="Train"').cancer_type,
        df.query('dataset=="Train"').logreg_prediction,
    )
    
    accuracies.append({
        'n_components' : nc,
        'dataset' : 'Validation',
        'accuracy' : acc_validation
    })
    
    accuracies.append({
        'n_components' : nc,
        'dataset' : 'Train',
        'accuracy' : acc_train
    })
    
accuracies = pd.DataFrame.from_records(accuracies)
accuracies

Unnamed: 0,n_components,dataset,accuracy
0,60,Validation,0.911364
1,60,Train,0.999432


In [10]:
with open("../output/FINAL/logreg.60.pickle", "rb") as f:
    test_model = pickle.load(f)
with open("../output/FINAL/mm.60.pickle", "rb") as f:
    test_minmax = pickle.load(f)
test_model, test_minmax

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


(LogisticRegression(max_iter=10000), MinMaxScaler(feature_range=(-5, 5)))

In [11]:
test_df = pd.read_csv("../output/FINAL/dataset.pca.60.csv")
test_df

Unnamed: 0,uuid,cancer_type,dataset,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,...,pca_50,pca_51,pca_52,pca_53,pca_54,pca_55,pca_56,pca_57,pca_58,pca_59
0,91a95c38-5ab7-4166-b4e7-64454cb3d06c,colon,Train,11116.327849,-4535.312618,3282.756671,11320.706363,-7640.183424,-8659.425454,-2576.237867,...,938.809618,-498.070822,1102.941365,1360.931199,-331.004839,2027.541847,1329.466185,-479.650224,-166.312040,539.906311
1,56c23214-8903-47cc-8290-e8bd59f24d11,colon,Train,-47596.905649,-5125.847259,7967.496136,12541.003674,-4253.270128,1751.043937,4284.585998,...,3257.823471,521.177226,-4093.034726,950.633369,-576.036237,-4934.753290,1102.578919,-1351.500471,-40.892105,3485.929594
2,d3819816-a957-49a8-923c-a52825a5ad61,prostate-gland,Train,-36409.560811,-4682.121549,-4617.529361,7282.824417,179.573405,2115.676074,-7436.772711,...,-1756.581667,-2116.683736,2299.719520,183.952496,5254.732802,-4980.041926,-1407.080391,2445.488887,-1882.822810,781.783152
3,a06414d2-981e-4ddb-b34b-af5d504d606f,thyroid_gland,Validation,14688.574658,-5997.755157,-4899.433783,98.238342,6353.374971,5396.243363,1838.561970,...,-2285.261931,-941.302568,1255.983144,-354.867747,-5.162154,1122.728819,-596.203922,-154.534465,-1568.706151,1036.847187
4,cd6333d6-6973-4278-8c7d-ad89e71bd508,kidney,Validation,12610.900990,-2697.536268,-2671.663354,-11935.833585,3645.348712,3367.306147,930.952027,...,-2501.723509,1078.666155,1047.283555,1071.225379,-1124.024892,1484.707636,-429.815363,-251.766988,-449.403561,-859.590556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8793,0d669e80-cb31-4afb-bd3f-07d69f10eb3f,breast,Train,-26759.813847,-2405.437925,245.561521,5544.955488,5098.902771,514.537776,-3288.602580,...,138.113918,32.836971,-43.641504,-475.378371,-1045.067057,-564.384808,-401.850596,-510.859170,398.411601,-479.860530
8794,8fe816e4-57fc-4aac-a726-f0172f72d206,corpus_uteri,Train,-19856.532171,-5811.758787,-5714.779229,-3051.171149,-6884.176509,4692.817605,2094.856011,...,-1135.778967,-599.920993,693.437418,-1121.018778,1342.580003,-441.453113,879.070270,811.409138,-633.534859,220.992428
8795,53f894dc-239b-47f9-9211-055d867791c8,corpus_uteri,Test,-29516.618731,-6408.506983,-6187.196030,36.577665,-86.590487,2110.959668,2496.585147,...,-2657.437940,-1296.022638,-5.449439,-1508.718062,-1698.758341,-1715.467508,-358.338625,949.617387,-154.513844,-990.386561
8796,b1c42fbb-018d-4267-bb1e-4b34470f84d5,kidney,Validation,66338.232480,43.657791,6767.924438,-4610.005973,20207.487683,14985.846982,7403.806749,...,1519.464298,-620.475941,810.267244,275.707107,-331.396013,-329.678606,-78.566328,-388.350833,-557.119852,-872.583494


In [12]:
test_df__ = test_df.assign(preds=test_model.predict(test_minmax.transform(test_df.drop(columns=['uuid','cancer_type','dataset']))))
test_df__

Unnamed: 0,uuid,cancer_type,dataset,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,...,pca_51,pca_52,pca_53,pca_54,pca_55,pca_56,pca_57,pca_58,pca_59,preds
0,91a95c38-5ab7-4166-b4e7-64454cb3d06c,colon,Train,11116.327849,-4535.312618,3282.756671,11320.706363,-7640.183424,-8659.425454,-2576.237867,...,-498.070822,1102.941365,1360.931199,-331.004839,2027.541847,1329.466185,-479.650224,-166.312040,539.906311,colon
1,56c23214-8903-47cc-8290-e8bd59f24d11,colon,Train,-47596.905649,-5125.847259,7967.496136,12541.003674,-4253.270128,1751.043937,4284.585998,...,521.177226,-4093.034726,950.633369,-576.036237,-4934.753290,1102.578919,-1351.500471,-40.892105,3485.929594,colon
2,d3819816-a957-49a8-923c-a52825a5ad61,prostate-gland,Train,-36409.560811,-4682.121549,-4617.529361,7282.824417,179.573405,2115.676074,-7436.772711,...,-2116.683736,2299.719520,183.952496,5254.732802,-4980.041926,-1407.080391,2445.488887,-1882.822810,781.783152,breast
3,a06414d2-981e-4ddb-b34b-af5d504d606f,thyroid_gland,Validation,14688.574658,-5997.755157,-4899.433783,98.238342,6353.374971,5396.243363,1838.561970,...,-941.302568,1255.983144,-354.867747,-5.162154,1122.728819,-596.203922,-154.534465,-1568.706151,1036.847187,breast
4,cd6333d6-6973-4278-8c7d-ad89e71bd508,kidney,Validation,12610.900990,-2697.536268,-2671.663354,-11935.833585,3645.348712,3367.306147,930.952027,...,1078.666155,1047.283555,1071.225379,-1124.024892,1484.707636,-429.815363,-251.766988,-449.403561,-859.590556,breast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8793,0d669e80-cb31-4afb-bd3f-07d69f10eb3f,breast,Train,-26759.813847,-2405.437925,245.561521,5544.955488,5098.902771,514.537776,-3288.602580,...,32.836971,-43.641504,-475.378371,-1045.067057,-564.384808,-401.850596,-510.859170,398.411601,-479.860530,breast
8794,8fe816e4-57fc-4aac-a726-f0172f72d206,corpus_uteri,Train,-19856.532171,-5811.758787,-5714.779229,-3051.171149,-6884.176509,4692.817605,2094.856011,...,-599.920993,693.437418,-1121.018778,1342.580003,-441.453113,879.070270,811.409138,-633.534859,220.992428,breast
8795,53f894dc-239b-47f9-9211-055d867791c8,corpus_uteri,Test,-29516.618731,-6408.506983,-6187.196030,36.577665,-86.590487,2110.959668,2496.585147,...,-1296.022638,-5.449439,-1508.718062,-1698.758341,-1715.467508,-358.338625,949.617387,-154.513844,-990.386561,breast
8796,b1c42fbb-018d-4267-bb1e-4b34470f84d5,kidney,Validation,66338.232480,43.657791,6767.924438,-4610.005973,20207.487683,14985.846982,7403.806749,...,-620.475941,810.267244,275.707107,-331.396013,-329.678606,-78.566328,-388.350833,-557.119852,-872.583494,breast


In [13]:
accuracy_score(
    test_df__.query('dataset=="Train"').cancer_type,
    test_df__.query('dataset=="Train"').preds,
)

0.3329070758738278

In [14]:
test_df__xx = pd.read_csv("../output/FINAL/dataset.out.RandomForestClassifier.60.csv")
test_df__xx

Unnamed: 0,uuid,dataset,cancer_type,logreg_prediction
0,91a95c38-5ab7-4166-b4e7-64454cb3d06c,Train,colon,colon
1,56c23214-8903-47cc-8290-e8bd59f24d11,Train,colon,colon
2,d3819816-a957-49a8-923c-a52825a5ad61,Train,prostate-gland,prostate-gland
3,a06414d2-981e-4ddb-b34b-af5d504d606f,Validation,thyroid_gland,thyroid_gland
4,cd6333d6-6973-4278-8c7d-ad89e71bd508,Validation,kidney,kidney
...,...,...,...,...
8793,0d669e80-cb31-4afb-bd3f-07d69f10eb3f,Train,breast,breast
8794,8fe816e4-57fc-4aac-a726-f0172f72d206,Train,corpus_uteri,corpus_uteri
8795,53f894dc-239b-47f9-9211-055d867791c8,Test,corpus_uteri,corpus_uteri
8796,b1c42fbb-018d-4267-bb1e-4b34470f84d5,Validation,kidney,kidney


In [15]:
test_df__xx[['uuid','cancer_type','logreg_prediction']].merge(test_df__, on='uuid').drop(columns=[col for col in test_df__.columns if col.startswith('pca')])[['logreg_prediction', 'preds']].drop_duplicates().sort_values(by='preds').to_csv("/mnt/c/Users/azbal/Desktop/investigate.csv")

OSError: Cannot save file into a non-existent directory: '\mnt\c\Users\azbal\Desktop'

In [None]:
!echo $WINDOWSHOME

In [None]:
sns.lineplot(
    data=accuracies,
    x = 'n_components',
    y = 'accuracy',
    hue='dataset',
    linewidth=2,
    marker="D",
    markersize=5
)
plt.grid()

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
df = pd.read_csv(f"../output/FINAL/dataset.out.20.csv")

In [None]:
pd.DataFrame(
    confusion_matrix(
        df.query('dataset=="Validation"').logreg_prediction,
        df.query('dataset=="Validation"').cancer_type
    )
)

In [None]:
xx = pd.read_csv("../data/train_val_test_split.csv")
xx

In [None]:
xxx = xx.groupby('cancer_type').dataset.value_counts().reset_index()
xxx

In [None]:
with plt.rc_context(rc={'font.size':15, 'font.weight':900}):
    plt.figure(figsize=[15,15])
    plt.pie(
        xxx.query('dataset=="Test"')['count'],
        labels=xxx.query('dataset=="Test"')['cancer_type'],
        autopct="%.2f%%",
        rotatelabels=True,
        pctdistance=0.8
    )
    None

In [None]:
len(test_df)

In [None]:
test_df.groupby(['cancer_type','dataset']).uuid.nunique().reset_index().pivot_table(index='cancer_type', columns='dataset', values='uuid').astype(int)

In [None]:
gene_types = list(pd.read_csv("../data/gene_types.txt").GENE_TYPE)

In [None]:
num_genes = 0
for g in gene_types:
    num_genes+=len(pd.read_csv(f"../output/{g}/dataset.in.csv", nrows=5).drop(columns=['dataset', 'uuid', 'cancer_type']).columns)
num_genes