In [1]:
import pandas as pd
pd.options.display.max_columns = 50
import numpy as np
import sys
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score

from train_utils import my_GridSearchCV
sys.path.append('/cellar/users/y8qin/Data/my_utils/')
from file_utils import *
import slurm_utils as slurm

%load_ext autoreload
%autoreload 2

In [2]:
# Load the lung cell table and split it by the `sample` column:
# samples 1 and 2 are used for training, sample 3 is held out for testing.
# NOTE(review): load_obj arrives via the wildcard file_utils import; the .pkl
# extension suggests it unpickles -- only point it at trusted files.
lung = load_obj('/cellar/users/y8qin/Data/image_hackathon/morpho-type/Yue/lung.pkl')
test_lung = lung.loc[lung['sample'] == 3]
train_lung = lung.loc[lung['sample'].isin([1, 2])]

In [3]:
# Peek at the first two training rows to check the column layout.
train_lung.iloc[:2]

Unnamed: 0,CellID,Label,Immune,Stroma,Tumor,Area,Eccentricity,Solidity,Extent,EulerNumber,Perimeter,MajorAxisLength,MinorAxisLength,Orientation,Neighbor_1,Neighbor_2,Neighbor_3,Neighbor_4,Neighbor_5,X_position,Y_position,sample
0,1,Stroma,0.084226,0.913542,0.002232,236,0.957049,0.810997,0.428312,1,78.7,34.044535,9.870373,-29.378275,2,570,598,643,654,13.957627,467.186441,1
1,2,Stroma,0.279038,0.719192,0.00177,201,0.638133,0.922018,0.744444,1,53.45,18.572493,14.299456,-46.545586,1,3,4,226,227,7.666667,525.970149,1


In [4]:
# Morphology columns fed to the classifier as input features.
morpho_feat = [
    'Area', 'Eccentricity', 'Solidity', 'Extent',
    'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
]

In [5]:
# Integer encoding of the cell-type labels used as classification targets.
label_map = {'Stroma':0, 'Immune':1, 'Tumor':2}

# Feature matrices: only the morphology columns, as plain arrays.
train_X, test_X = (
    frame[morpho_feat].values for frame in (train_lung, test_lung))

# Targets: 'Label' strings translated through label_map.
# Series.map yields NaN for any label that is not a key of label_map.
train_y, test_y = (
    frame['Label'].map(label_map).values for frame in (train_lung, test_lung))

In [6]:
# Random Forest
rf_kwargs = {
    'n_estimators': 200,
    'class_weight': 'balanced', 
    'n_jobs':8}
rf_tuning = {
    'min_samples_split': [.05, .1, .2],
    'max_features': ['sqrt', .5, .75]}
rf = Pipeline(
    [('classification', my_GridSearchCV(
         RandomForestClassifier,
         rf_kwargs, rf_tuning, scoring='f1_macro'))])

In [8]:
# Fit on the first 200 training rows only.
# NOTE(review): looks like a quick local smoke test before the full cluster
# run further down -- confirm the 200-row cap is intentional.
rf.fit(X=train_X[:200], y=train_y[:200])

Pipeline(memory=None,
     steps=[('classification', GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_..._dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=0))])

In [9]:
# Score the held-out sample: per-class probabilities and hard class indices.
pred_proba = rf.predict_proba(test_X)
pred = rf.predict(test_X)

In [10]:
# Plain accuracy on the held-out sample (the grid search itself optimises f1_macro).
accuracy_score(y_true=test_y, y_pred=pred)

0.4697759374193117

In [18]:
# Inverse of label_map (class index -> cell-type name). Derived instead of
# re-typed so the encoding and decoding tables cannot drift apart.
label_idx_map = {idx: name for name, idx in label_map.items()}

# One probability column per class, in ascending class-index order, which is
# the column order scikit-learn classifiers use for predict_proba (classes_).
proba_columns = ['{}_proba'.format(label_idx_map[idx])
                 for idx in sorted(label_idx_map)]

# If a class was absent from the training rows, predict_proba has fewer
# columns and the names below would be misleading -- fail with a clear message.
if pred_proba.shape[1] != len(proba_columns):
    raise ValueError(
        'predict_proba returned {} columns but {} classes are defined in label_map'
        .format(pred_proba.shape[1], len(proba_columns)))

result = pd.DataFrame(pred_proba, columns=proba_columns)
result['CellID'] = test_lung['CellID'].values
result['Pred'] = [label_idx_map[x] for x in pred]

# Identifier and predicted label first, then the per-class probabilities.
column_order = ['CellID', 'Pred'] + proba_columns
result = result[column_order]

In [20]:
# Show only the first rows; displaying the whole frame bloats the notebook output.
test_lung.head(10)

Unnamed: 0,CellID,Label,Immune,Stroma,Tumor,Area,Eccentricity,Solidity,Extent,EulerNumber,Perimeter,MajorAxisLength,MinorAxisLength,Orientation,Neighbor_1,Neighbor_2,Neighbor_3,Neighbor_4,Neighbor_5,X_position,Y_position,sample
0,1,Stroma,0.043140,9.568500e-01,9.885892e-06,302,0.919386,0.909639,0.548094,1,77.326,31.589485,12.425949,23.836682,82,117,157,199,0,15.980132,21.615894,3
1,2,Immune,0.526713,4.732870e-01,4.685848e-07,139,0.451320,0.902597,0.763736,1,44.551,14.332834,12.790079,-7.778626,3,51,72,102,116,7.323741,440.561151,3
2,3,Stroma,0.097520,9.024687e-01,1.172531e-05,341,0.515709,0.909333,0.710417,1,73.282,23.538917,20.167292,-57.007481,2,4,51,72,102,9.580645,471.422287,3
3,4,Immune,0.629379,3.706196e-01,1.462787e-06,99,0.563297,0.860870,0.642857,1,41.020,12.721554,10.511238,-5.520296,3,5,6,7,51,8.242424,551.565657,3
4,5,Immune,0.593045,4.069543e-01,8.163475e-07,170,0.805834,0.918919,0.787037,1,50.051,19.553393,11.578378,-9.190809,4,6,7,51,72,8.917647,571.423529,3
5,6,Stroma,0.139360,8.606382e-01,1.545613e-06,234,0.544455,0.955102,0.809689,1,55.840,19.189461,16.095924,-35.502567,4,5,7,51,76,8.102564,601.094017,3
6,7,Stroma,0.046114,9.538843e-01,2.168371e-06,177,0.782697,0.931579,0.756410,1,49.596,19.484076,12.126937,-17.708564,4,5,6,76,78,8.361582,633.101695,3
7,8,Immune,0.528584,4.714114e-01,4.868467e-06,190,0.674201,0.863636,0.620915,1,57.436,19.028697,14.053608,-27.967277,9,65,78,127,200,7.584211,755.247368,3
8,9,Stroma,0.266014,7.339803e-01,5.987392e-06,173,0.838920,0.860697,0.569079,1,55.567,21.274985,11.579012,-24.212983,8,10,65,70,78,8.728324,775.560694,3
9,10,Stroma,0.047446,9.525521e-01,1.470525e-06,116,0.850924,0.928000,0.644444,1,44.816,17.615368,9.253169,-72.342125,9,11,70,128,188,4.482759,859.413793,3


In [19]:
# Show only the first rows of the prediction table rather than the full frame.
result.head(10)

Unnamed: 0,CellID,Pred,Stroma_proba,Immune_proba,Tumor_proba
0,1,Tumor,0.270445,0.357990,0.371565
1,2,Immune,0.243228,0.454388,0.302384
2,3,Tumor,0.165315,0.270030,0.564655
3,4,Immune,0.153781,0.500939,0.345280
4,5,Immune,0.245183,0.460618,0.294199
5,6,Immune,0.388551,0.444693,0.166756
6,7,Immune,0.339524,0.486810,0.173666
7,8,Immune,0.123345,0.501378,0.375277
8,9,Immune,0.095129,0.522158,0.382713
9,10,Stroma,0.557082,0.347579,0.095339


### Train Random Forest on all data

In [42]:
# Write one '--testSample <id>' line per sample into the parameter file
# (relative path, so it lands in the current working directory).
with open('train_rf_classifier.param', 'w') as paramf:
    paramf.writelines(
        '--testSample {}\n'.format(sample_id) for sample_id in [1,2,3])

In [44]:
# Hand the .sh, .param and .py paths under workdir to slurm.array_script.
workdir = '/cellar/users/y8qin/Data/image_hackathon/morpho-type/Yue'
sh_path = '{}/train_rf_classifier.sh'.format(workdir)
param_path = '{}/train_rf_classifier.param'.format(workdir)
py_path = '{}/train_rf_classifier.py'.format(workdir)

# NOTE(review): the positional 3 presumably is the number of array tasks and
# should match the number of lines in the .param file -- confirm.
# NOTE(review): gpu=True is requested although the model is a scikit-learn
# random forest -- confirm a GPU is actually needed.
slurm.array_script(sh_path, param_path, py_path,
                   3, mem=20, ncpu=8, gpu=True)