In [1]:
import pandas as pd
pd.options.display.max_columns = 50
import numpy as np
import sys
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score

from train_utils import my_GridSearchCV
sys.path.append('/cellar/users/y8qin/Data/my_utils/')
from file_utils import *
import slurm_utils as slurm

%load_ext autoreload
%autoreload 2

In [2]:
# Load the lung cell table and split it by the 'sample' column:
# samples 1 and 2 form the training set, sample 3 is held out for testing.
# NOTE(review): load_obj is presumably supplied by the wildcard import from
# file_utils; the '.pkl' path suggests pickle, so only load trusted files.
lung = load_obj('/cellar/users/y8qin/Data/image_hackathon/morpho-type/Yue/lung.pkl')
sample_id = lung['sample']
train_lung = lung.loc[sample_id.isin([1, 2])]
test_lung = lung.loc[sample_id == 3]

In [3]:
# Peek at the first two training rows to check the column layout.
train_lung.head(n=2)

Unnamed: 0,CellID,Label,Immune,Stroma,Tumor,Area,Eccentricity,Solidity,Extent,EulerNumber,Perimeter,MajorAxisLength,MinorAxisLength,Orientation,Neighbor_1,Neighbor_2,Neighbor_3,Neighbor_4,Neighbor_5,X_position,Y_position,sample
0,1,Stroma,0.084226,0.913542,0.002232,236,0.957049,0.810997,0.428312,1,78.7,34.044535,9.870373,-29.378275,2,570,598,643,654,13.957627,467.186441,1
1,2,Stroma,0.279038,0.719192,0.00177,201,0.638133,0.922018,0.744444,1,53.45,18.572493,14.299456,-46.545586,1,3,4,226,227,7.666667,525.970149,1


In [4]:
# Morphology columns of the lung table used as model inputs.
morpho_feat = [
    'Area', 'Eccentricity', 'Solidity', 'Extent',
    'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
]

In [5]:
# Feature matrices: the morphology columns only, as plain NumPy arrays.
train_X = train_lung[morpho_feat].values
test_X = test_lung[morpho_feat].values

# Integer class codes for the three cell-type labels.
label_map = {'Stroma':0, 'Immune':1, 'Tumor':2}


def _encode_labels(labels, split_name):
    """Translate a Series of label names into integer codes via ``label_map``.

    ``Series.map`` with a dict leaves NaN for every value that is not a key,
    so an unexpected or missing label would otherwise slip through and only
    fail later inside fitting or scoring. This check fails immediately.

    Parameters
    ----------
    labels : pandas.Series
        Label names to encode.
    split_name : str
        Name of the data split, used only in the error message.

    Returns
    -------
    numpy.ndarray
        Encoded labels, identical to ``labels.map(label_map).values``.

    Raises
    ------
    ValueError
        If any label is missing or absent from ``label_map``.
    """
    encoded = labels.map(label_map)
    missing = encoded.isna()
    if missing.any():
        unknown = sorted(labels[missing].astype(str).unique())
        raise ValueError(
            'Unmapped Label values in {} split: {}'.format(split_name, unknown))
    return encoded.values


train_y = _encode_labels(train_lung['Label'], 'train')
test_y = _encode_labels(test_lung['Label'], 'test')

In [6]:
# Random Forest
# Fixed estimator settings. random_state pins the forest's bootstrap and
# feature sampling so repeated runs of this cell give the same model.
# NOTE(review): assumes my_GridSearchCV forwards rf_kwargs unchanged to the
# RandomForestClassifier constructor - confirm in train_utils.
rf_kwargs = {
    'n_estimators': 200,
    'class_weight': 'balanced', 
    'n_jobs':8,
    'random_state': 0}
# Hyper-parameter grid. In scikit-learn, float values for min_samples_split
# and max_features are read as fractions of samples / features respectively.
rf_tuning = {
    'min_samples_split': [.05, .1, .2],
    'max_features': ['sqrt', .5, .75]}
# Single-step pipeline wrapping the grid search, scored with macro-averaged F1.
rf = Pipeline(
    [('classification', my_GridSearchCV(
         RandomForestClassifier,
         rf_kwargs, rf_tuning, scoring='f1_macro'))])

In [None]:
# Fit on the first 200 training rows only, not the full training set.
n_fit_rows = 200
rf.fit(train_X[:n_fit_rows], train_y[:n_fit_rows])

In [None]:
# Predicted class codes and class probabilities for the held-out test rows.
pred, pred_proba = (
    rf.predict(test_X),
    rf.predict_proba(test_X),
)

In [None]:
# Overall accuracy of the predictions on the held-out test labels.
accuracy_score(y_true=test_y, y_pred=pred)

### Train Random Forest on all data

In [42]:
# Write the parameter file: one '--testSample <id>' line per sample id.
# NOTE(review): the path is relative to the current working directory, while
# the submission cell passes '<workdir>/train_rf_classifier.param'; this
# assumes the notebook is run from that directory - confirm.
param_lines = ['--testSample {}\n'.format(sample_idx) for sample_idx in (1, 2, 3)]
with open('train_rf_classifier.param', 'w') as param_file:
    param_file.writelines(param_lines)

In [44]:
# Call slurm.array_script with the job script, parameter file and training
# script paths, all located under workdir.
workdir = '/cellar/users/y8qin/Data/image_hackathon/morpho-type/Yue'
job_script = os.path.join(workdir, 'train_rf_classifier.sh')
param_file_path = os.path.join(workdir, 'train_rf_classifier.param')
train_script = os.path.join(workdir, 'train_rf_classifier.py')
# The positional 3 presumably is the number of array tasks, matching the
# three lines of the parameter file - verify against slurm_utils.
# NOTE(review): gpu=True looks unnecessary if the script only trains a
# scikit-learn forest - confirm before submitting.
slurm.array_script(job_script,
                   param_file_path,
                   train_script,
                   3, mem=20, ncpu=8, gpu=True)