In [44]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pydicom
import random 
from tqdm import tqdm

import os
import keras
import pydicom

from sklearn.preprocessing import LabelEncoder
import pickle

def read_pickle(filename):    
    with open(filename, 'rb') as fp:
        return pickle.load(fp)
    
def save_pickle(data,filename):       
    with open(filename, 'wb') as fp:
        pickle.dump(data, fp)  
    
data_path = '../input/' # rsna-pneumonia-detection-challenge/

images_path = data_path + 'stage_2_train_images/'
labels_path = data_path + 'stage_2_train_labels.csv'

detailed_class_info_path = data_path + 'stage_2_detailed_class_info.csv'

class_encoder = LabelEncoder()
   
def merge_dataframes():
    df = pd.read_csv(labels_path)
    details_df = pd.read_csv(detailed_class_info_path)
    df = pd.concat([df,details_df.drop('patientId',1)], 1) 
    print(df.describe())
    print(df.shape[0], 'cases')

    return df


  
def load_ids_and_labels_from_file():
    ids = read_pickle('ids')
    labels = read_pickle('labels')
    return ids,labels
  
def get_ids_and_labels(num_class):
    df = merge_dataframes()
    df['class_id'] = class_encoder.fit_transform(df['class'])

    df.sort_values(by=['patientId', 'class_id'])
    
    ids = df.patientId.tolist()
    
    if(num_class==2):
        labels =  df.Target.tolist() 
    else:
        labels =  df.class_id.tolist() 
    
    save_pickle(ids, 'ids')
    save_pickle(labels, 'labels')

    return ids,labels

      

num_class = 2
ids,labels = get_ids_and_labels(num_class)


                 x            y      ...            height        Target
count  9555.000000  9555.000000      ...       9555.000000  30227.000000
mean    394.047724   366.839560      ...        329.269702      0.316108
std     204.574172   148.940488      ...        157.750755      0.464963
min       2.000000     2.000000      ...         45.000000      0.000000
25%     207.000000   249.000000      ...        203.000000      0.000000
50%     324.000000   365.000000      ...        298.000000      0.000000
75%     594.000000   478.500000      ...        438.000000      1.000000
max     835.000000   881.000000      ...        942.000000      1.000000

[8 rows x 5 columns]
30227 cases


In [45]:
feature_tensors = np.load('vgg16_features.npz')['arr_0']


In [14]:
from sklearn.model_selection import train_test_split

y = labels[:feature_tensors.shape[0]]

X_train, X_test, y_train, y_test = train_test_split(feature_tensors, y, test_size=0.2, random_state=42)


In [15]:
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve
import matplotlib.pyplot as plt


def precision_recall(name, clf):  
    y_pred = clf.predict(X_test)
    
    if(num_class==2):
        roc_score = roc_auc_score(y_test, y_pred)
        print('roc_auc_score', roc_score)

    
    if(num_class==3):
        report = classification_report(y_test, y_pred, target_names=class_encoder.classes_)
    else:
        report = classification_report(y_test, y_pred)
        
    print('classification report for', name)
    print( report )
    

def evaluate_classifier(clf,name):    
    clf.fit(X_train,y_train)

    save_pickle(clf,name)

    precision_recall(name, clf)
    
    score = clf.score(X_test,y_test)
    
    print('average_score', round(score,3))

In [16]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)
name = 'DecisionTreeClassifier'
evaluate_classifier(clf,name)     

roc_auc_score 0.7507587036619295
classification report for DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.86      0.77      0.81      4092
           1       0.60      0.73      0.66      1950

   micro avg       0.76      0.76      0.76      6042
   macro avg       0.73      0.75      0.74      6042
weighted avg       0.78      0.76      0.76      6042

average_score 0.758


In [17]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

name = 'RandomForestClassifier'

evaluate_classifier(clf,name) 



roc_auc_score 0.7751278291600873
classification report for RandomForestClassifier
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      4092
           1       0.81      0.62      0.70      1950

   micro avg       0.83      0.83      0.83      6042
   macro avg       0.82      0.78      0.79      6042
weighted avg       0.83      0.83      0.82      6042

average_score 0.831


In [18]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = LinearDiscriminantAnalysis()

name = 'LinearDiscriminantAnalysis'

evaluate_classifier(clf,name) 

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

clf = QuadraticDiscriminantAnalysis()

name = 'QuadraticDiscriminantAnalysis'

evaluate_classifier(clf,name) 



roc_auc_score 0.6076768929994736
classification report for LinearDiscriminantAnalysis
              precision    recall  f1-score   support

           0       0.73      0.89      0.81      4092
           1       0.59      0.32      0.42      1950

   micro avg       0.71      0.71      0.71      6042
   macro avg       0.66      0.61      0.61      6042
weighted avg       0.69      0.71      0.68      6042

average_score 0.708




roc_auc_score 0.5091920445146252
classification report for QuadraticDiscriminantAnalysis
              precision    recall  f1-score   support

           0       0.86      0.03      0.05      4092
           1       0.33      0.99      0.49      1950

   micro avg       0.34      0.34      0.34      6042
   macro avg       0.59      0.51      0.27      6042
weighted avg       0.69      0.34      0.19      6042

average_score 0.338


In [19]:
from sklearn.svm import LinearSVC

clf = LinearSVC()
name = 'LinearSVC'

evaluate_classifier(clf,name) 

roc_auc_score 0.5836318520189487
classification report for LinearSVC
              precision    recall  f1-score   support

           0       0.72      0.91      0.81      4092
           1       0.59      0.25      0.35      1950

   micro avg       0.70      0.70      0.70      6042
   macro avg       0.65      0.58      0.58      6042
weighted avg       0.68      0.70      0.66      6042

average_score 0.701




In [20]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

name = 'GaussianNB'

evaluate_classifier(clf,name) 

roc_auc_score 0.5664775547033611
classification report for GaussianNB
              precision    recall  f1-score   support

           0       0.72      0.78      0.75      4092
           1       0.43      0.35      0.39      1950

   micro avg       0.64      0.64      0.64      6042
   macro avg       0.58      0.57      0.57      6042
weighted avg       0.63      0.64      0.63      6042

average_score 0.643
roc_auc_score 0.5093382961124897
classification report for Dummy stratified
              precision    recall  f1-score   support

           0       0.68      0.69      0.69      4092
           1       0.34      0.33      0.33      1950

   micro avg       0.57      0.57      0.57      6042
   macro avg       0.51      0.51      0.51      6042
weighted avg       0.57      0.57      0.57      6042

average_score 0.575
roc_auc_score 0.5
classification report for Dummy prior
              precision    recall  f1-score   support

           0       0.68      1.00      0.81      

  'precision', 'predicted', average, warn_for)


In [50]:
num_class = 3
ids,labels = get_ids_and_labels(num_class)

                 x            y      ...            height        Target
count  9555.000000  9555.000000      ...       9555.000000  30227.000000
mean    394.047724   366.839560      ...        329.269702      0.316108
std     204.574172   148.940488      ...        157.750755      0.464963
min       2.000000     2.000000      ...         45.000000      0.000000
25%     207.000000   249.000000      ...        203.000000      0.000000
50%     324.000000   365.000000      ...        298.000000      0.000000
75%     594.000000   478.500000      ...        438.000000      1.000000
max     835.000000   881.000000      ...        942.000000      1.000000

[8 rows x 5 columns]
30227 cases


In [None]:
from keras.models import Sequential
from keras.layers import Dropout, Dense, Flatten,BatchNormalization,LeakyReLU
from keras.metrics import categorical_accuracy, binary_accuracy
import keras
from sklearn.model_selection import train_test_split



if(num_class==2):
    
    y = labels[:feature_tensors.shape[0]]
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(feature_tensors, y, test_size=0.2, random_state=42)
    
    model = Sequential()
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=[binary_accuracy])

else:
    y = labels[:feature_tensors.shape[0]]
    y = keras.utils.to_categorical(y, num_class)
    X_train, X_test, y_train, y_test = train_test_split(feature_tensors, y, test_size=0.2, random_state=42)
    
    model = Sequential()
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=[categorical_accuracy])



model.fit(X_train,y_train,
          epochs=12,
          batch_size=16,
          validation_data=(X_test, y_test)
         )

model.save('vgg16' + num_class + 'class' + '.h5') 