In [2]:
import matplotlib.patches as patches
import random
from sklearn.utils import shuffle
from tqdm import tqdm
from tqdm import tqdm_notebook

from numpy.random import seed
seed(101)
from tensorflow import set_random_seed
set_random_seed(101)

import pandas as pd
import numpy as np

from shutil import copyfile, move
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import initializers

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.layers import BatchNormalization

import os
import cv2

from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import shutil
import matplotlib.pyplot as plt
%matplotlib inline


# 读入表格

In [4]:
# Load the training labels and derive the columns consumed by flow_from_dataframe.
data = pd.read_csv('/kaggle/input/train_labels.csv')
train_path = '/kaggle/input/train/'
test_path = '/kaggle/input/test/'

# Build both columns with vectorised operations; looping over every row with
# iterrows() is needlessly slow for a table of this size.
data["filename"] = data["id"] + ".tif"
# The a_/b_ prefixes presumably exist so that the generator's class ordering
# puts "no tumor" at index 0 and "has tumor" at index 1 -- verify via class_indices.
data["class"] = np.where(data["label"] == 1, "b_has tumor", "a_no tumor")

# Small subset used for the (slow) per-image pixel statistics below.
baseline_data = data[:10000]
print(data['label'].value_counts())
print(data.head())

# data = data.sample(10000, random_state = 101)


0    130908
1     89117
Name: label, dtype: int64
                                         id     ...             class
0  f38a6374c348f90b587e046aac6079959adf3835     ...        a_no tumor
1  c18f2d887b7ae4f6742ee445113fa1aef383ed77     ...       b_has tumor
2  755db6279dae599ebb4d39a9123cce439965282d     ...        a_no tumor
3  bc3f0c64fb968ff4a8bd33af6971ecae77c75e08     ...        a_no tumor
4  068aba587a4950175d04c680d38943fd488d6a9d     ...        a_no tumor

[5 rows x 4 columns]


In [None]:
# Sanity check: total number of labelled samples (sum of the two class counts printed above).
130908+89117

# 数据清洗

In [None]:
def readImage(path):
    """Load an image from disk and return it with its channels in RGB order.

    Args:
        path: Path of the image file to read.

    Returns:
        The array produced by ``cv2.imread`` with the first and third channels
        swapped (OpenCV loads images as BGR).

    Raises:
        FileNotFoundError: If ``cv2.imread`` returned ``None``, i.e. the file is
            missing or could not be decoded.
    """
    bgr_img = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # loudly here rather than with an opaque error inside cv2.split.
    if bgr_img is None:
        raise FileNotFoundError('Could not read image: {0}'.format(path))
    b, g, r = cv2.split(bgr_img)
    return cv2.merge([r, g, b])

# Accumulate per-channel first and second moments over the baseline subset,
# while recording images that are almost entirely black or white.
shuffled_data = shuffle(baseline_data)

dark_th = 10      # an image whose brightest pixel is below this counts as "black"
bright_th = 245   # an image whose darkest pixel is above this counts as "white"
too_dark_idx = []
too_bright_idx = []

x_tot = np.zeros(3)    # running sum of per-image channel means
x2_tot = np.zeros(3)   # running sum of per-image channel mean squares
counted_ones = 0       # number of images that contributed to the sums
for idx in tqdm_notebook(shuffled_data['filename'],
                         'computing statistics...(10000 it total)',
                         total=len(shuffled_data)):
    path = os.path.join(train_path, idx)
    # Convert to float before any arithmetic: squaring 8-bit pixel data in its
    # native integer dtype wraps around and corrupts the second moment.
    imagearray = readImage(path).reshape(-1, 3).astype(np.float64)
    # is this too dark
    if imagearray.max() < dark_th:
        too_dark_idx.append(idx)
        continue  # do not include in statistics
    # is this too bright
    if imagearray.min() > bright_th:
        too_bright_idx.append(idx)
        continue  # do not include in statistics
    x_tot += imagearray.mean(axis=0)
    x2_tot += (imagearray ** 2).mean(axis=0)
    counted_ones += 1
    


In [None]:
# Per-channel mean and standard deviation, reported on a [0, 1] pixel scale.
# The variance must be computed with both moments on the SAME scale (0-255)
# before rescaling; mixing a [0, 1] mean with a 0-255 second moment is wrong.
channel_mean_255 = x_tot / counted_ones
channel_var_255 = x2_tot / counted_ones - channel_mean_255 ** 2
channel_avr = channel_mean_255 / 255
# Clamp tiny negative values caused by floating-point rounding before the sqrt.
channel_std = np.sqrt(np.maximum(channel_var_255, 0)) / 255
print(channel_avr,channel_std)

In [None]:
# Summarise how many outlier images were found, then list their filenames.
dark_summary = 'There was {0} extremely dark image'.format(len(too_dark_idx))
bright_summary = 'and {0} extremely bright images'.format(len(too_bright_idx))
print(dark_summary)
print(bright_summary)
for heading, filenames in (('Dark one:', too_dark_idx), ('Bright ones:', too_bright_idx)):
    print(heading)
    print(filenames)

# 训练验证集分解

## baseline的数据

In [None]:
# from sklearn.model_selection import train_test_split
# train_df = baseline_data

# #If removing outliers, uncomment the four lines below
# print('Before removing outliers we had {0} training samples.'.format(len(train_df)))

# for i in too_dark_idx:
#     train_df =  train_df[train_df['filename'] != i]
    
# for j in too_bright_idx:
#     train_df =  train_df[train_df['filename'] != j]

# print('After removing outliers we have {0} training samples.'.format(len(train_df)))

# train_names = train_df.id.values
# train_labels = np.asarray(train_df['label'].values)

# # split, this function returns more than we need as we only need the validation indexes for fastai
# df_train, df_val= train_test_split(train_df, test_size=0.1, stratify=train_labels, random_state=101)

In [None]:
# df_train = df_train.reset_index(drop=True)

## 全数据

In [None]:
# Build the training/validation split from the full label table.
train_df = data

print('Before removing outliers we had {0} training samples.'.format(len(train_df)))
# Drop every flagged image with a single boolean mask instead of re-filtering
# the whole frame once per outlier.
# NOTE: the outlier lists were collected from baseline_data only, so outliers
# in the rest of the data set are not removed here.
outlier_filenames = too_dark_idx + too_bright_idx
train_df = train_df[~train_df['filename'].isin(outlier_filenames)]

print('After removing outliers we have {0} training samples.'.format(len(train_df)))
train_df = train_df.reset_index(drop=True)
train_names = train_df.id.values
train_labels = np.asarray(train_df['label'].values)

# Stratified 90/10 split so both parts keep the same class balance.
df_train, df_val= train_test_split(train_df, test_size=0.1, stratify=train_labels, random_state=101)

# 读入数据集

In [None]:
# Batch sizes and step counts for the training / validation generators.
num_train_samples = len(df_train)
num_val_samples = len(df_val)
train_batch_size = 10
val_batch_size = 10

# Number of batches needed to see every sample once. np.ceil returns a float,
# so cast to int: the Keras step arguments are documented as integers.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

target_size = (96,96)

# Training images are rescaled and randomly augmented.
train_datagen = ImageDataGenerator(
    rescale=1. / 255,
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=40,
    zoom_range=0.2,
)

train_generator = train_datagen.flow_from_dataframe(
    dataframe = df_train,
    x_col='filename',
    y_col='class',
    directory=train_path,
    target_size=target_size,
    batch_size=train_batch_size,
    shuffle=True,
    class_mode='binary')

# Validation images are only rescaled; shuffle=False keeps the sample order
# aligned with generator.classes.
val_datagen = ImageDataGenerator(rescale=1. / 255)
val_generator = val_datagen.flow_from_dataframe(
    dataframe = df_val,
    x_col='filename',
    y_col='class',
    directory=train_path,
    target_size=target_size,
    shuffle=False,
    batch_size=val_batch_size,
    class_mode='binary')

# A second, independent pass over the validation split used for the final
# evaluation and prediction cells. It now really uses test_datagen (the
# original created test_datagen but then called val_datagen).
test_datagen = ImageDataGenerator(rescale=1. / 255)
test_generator = test_datagen.flow_from_dataframe(
    dataframe = df_val,
    x_col='filename',
    y_col='class',
    directory=train_path,
    target_size=target_size,
    shuffle=False,
    batch_size=val_batch_size,
    class_mode='binary')



In [None]:
# Display the class-name -> integer-label mapping chosen by the generator.
test_generator.class_indices

## 看下图片

In [None]:
def plot_random_samples(generator):
    """Show up to ten images from one randomly chosen batch of ``generator``.

    Args:
        generator: Indexable batch generator; ``generator[i]`` must return an
            ``(images, labels)`` pair and ``len(generator)`` the batch count.

    Each image is titled from its label. Label 0 is titled "no tumor" and
    label 1 "has tumor", matching the "a_no tumor" / "b_has tumor" class names
    used to build the generators.
    """
    batch_index = random.randint(0, len(generator) - 1)
    images, labels = generator[batch_index]

    # The last batch of a generator can hold fewer than ten images; never
    # index past the end of the batch.
    sample_number = min(10, len(images))
    fig = plt.figure(figsize = (20, 10))
    for i in range(sample_number):
        ax = fig.add_subplot(2, 5, i+1)
        ax.imshow(images[i])
        if labels[i] == 0:
            ax.set_title("no tumor")
        elif labels[i] == 1:
            ax.set_title("has tumor")
    plt.tight_layout()
    plt.show()

In [None]:
# Visual sanity check on a random validation batch.
plot_random_samples(val_generator)

# 创建模型

## 创建VGG16

In [None]:
# Hyper-parameters and the ImageNet-pretrained VGG16 convolutional base
# (the fully connected top is left out).
dropout_dense = 0.3
IMG_SIZE = (96, 96)
IN_SHAPE = (96, 96, 3)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=IN_SHAPE)
base_model.summary()

In [None]:
# Freeze everything except the last eight layers of the base network, then
# list each layer together with its trainable flag.
layers_to_freeze = base_model.layers[:-8]
for layer in layers_to_freeze:
    layer.trainable = False

for layer in base_model.layers:
    print(layer, layer.trainable)

In [None]:
# Classification head stacked on top of the VGG16 base:
# flatten -> dense(256, no bias) -> batch norm -> ReLU -> dropout -> sigmoid unit.
VGG16_model = Sequential([
    base_model,
    Flatten(),
    Dense(256, use_bias=False),
    BatchNormalization(),
    Activation("relu"),
    Dropout(dropout_dense),
    Dense(1, activation = "sigmoid"),
])
VGG16_model.summary()

## 创建Resnet50

In [None]:
# dropout_dense=0.3
# IMG_SIZE = (96, 96)
# IN_SHAPE = (96,96, 3)
# base_model = ResNet50(
#     weights='imagenet',
#     include_top=False,
#     input_shape=IN_SHAPE
# )


In [None]:
# base_model.summary()

In [None]:
# for layer in base_model.layers[:-6]:
#     layer.trainable = False

# for layer in base_model.layers:
#     print(layer, layer.trainable)

In [None]:
# resnet50_model = Sequential()
# resnet50_model.add(base_model)
# resnet50_model.add(Flatten())
# resnet50_model.add(Dense(256, use_bias=False))
# resnet50_model.add(BatchNormalization())
# resnet50_model.add(Activation("relu"))
# resnet50_model.add(Dropout(dropout_dense))
# resnet50_model.add(Dense(1, activation = "sigmoid"))
# resnet50_model.summary()

In [None]:
# resnet50_model.summary()

In [None]:
# print(val_generator.class_indices)

In [None]:
# Inspect the files currently present in the Kaggle working directory.
os.listdir('/kaggle/working')

In [None]:
# Delete a leftover ResNet50 checkpoint if there is one. Guarding the call
# keeps the cell from raising FileNotFoundError on a fresh kernel where the
# file was never written.
resnet_checkpoint_path = '/kaggle/working/ResNet50_model.h5'
if os.path.exists(resnet_checkpoint_path):
    os.remove(resnet_checkpoint_path)

## 生成resnet50的模型

In [None]:
# resnet50_model.compile(Adam(lr=0.0001), loss='binary_crossentropy', 
#               metrics=['accuracy'])

# sgd = keras.optimizers.SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
# model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [None]:
# filepath = "CNN3_model.h5"
# filepath = "CNN6_model.h5"
# filepath = "CNN9_model.h5"
# filepath = "VGG16_model.h5"
# filepath = "ResNet50_model.h5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, 
#                              save_best_only=True, mode='max')

# reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.5, patience=2, 
#                                    verbose=1, mode='max', min_lr=0.00001)
                              
                              
# callbacks_list = [checkpoint, reduce_lr]

# history = resnet50_model.fit_generator(train_generator, steps_per_epoch=train_steps, 
#                     validation_data=val_generator,
#                     validation_steps=val_steps,
#                     epochs=10, verbose=1,
#                    callbacks=callbacks_list)

In [None]:
# resnet50_model.load_weights('ResNet50_model.h5')

# val_loss, val_acc = \
# resnet50_model.evaluate_generator(test_generator, 
#                         steps=len(df_val))

# print('val_loss:', val_loss)
# print('val_acc:', val_acc)

### 准确率画图

In [None]:
# plt.plot(history.history['acc'])
# plt.plot(history.history['val_acc'])
# plt.title('Accuracy over epochs')
# plt.ylabel('Acc')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='best')
# plt.show()

### loss画图

In [None]:
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Loss over epochs')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='best')
# plt.show()

## 生成VGG16的模型

In [None]:
# Compile with Adam at a small learning rate and binary cross-entropy loss,
# tracking accuracy.
vgg16_optimizer = Adam(lr=0.0001)
VGG16_model.compile(
    vgg16_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# sgd = keras.optimizers.SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
# model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [None]:
# Train the VGG16 model, keeping the weights of the epoch with the best
# validation accuracy and halving the learning rate when it stops improving.
filepath = "VGG16_model.h5"

checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=2,
    verbose=1,
    mode='max',
    min_lr=0.00001,
)
callbacks_list = [checkpoint, reduce_lr]

fit_options = dict(
    steps_per_epoch=train_steps,
    validation_data=val_generator,
    validation_steps=val_steps,
    epochs=60,
    verbose=1,
    callbacks=callbacks_list,
)
VGG16_history = VGG16_model.fit_generator(train_generator, **fit_options)

In [None]:
# Reload the best checkpoint written during training and score it on the
# held-out split.
VGG16_model.load_weights('VGG16_model.h5')

# `steps` is a number of BATCHES. len(test_generator) is exactly one pass over
# the data, whereas len(df_val) (a sample count) would cycle through the
# split once per sample in a batch.
val_loss, val_acc = VGG16_model.evaluate_generator(test_generator,
                                                   steps=len(test_generator))

print('val_loss:', val_loss)
print('val_acc:', val_acc)

### 准确率画图

In [None]:
# Training vs. validation accuracy for every epoch.
fig, ax = plt.subplots()
for history_key in ('acc', 'val_acc'):
    ax.plot(VGG16_history.history[history_key])
ax.set_title('Accuracy over epochs')
ax.set_ylabel('Acc')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='best')
plt.show()

### loss画图

In [None]:
# Training vs. validation loss for every epoch.
fig, ax = plt.subplots()
for history_key in ('loss', 'val_loss'):
    ax.plot(VGG16_history.history[history_key])
ax.set_title('Loss over epochs')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='best')
plt.show()

## 预测

In [None]:
# Predict once over the held-out split. `steps` counts batches, so use
# len(test_generator): with len(df_val) the generator would be cycled several
# times and `predictions` would contain far more rows than
# test_generator.classes, breaking the metric cells that follow.
predictions = VGG16_model.predict_generator(test_generator, steps=len(test_generator), verbose=1)
predictions.shape

In [None]:
# Look up which integer index the generator assigned to each class.
test_generator.class_indices

In [None]:
# Wrap the raw model outputs in a single-column frame named after the class
# that the sigmoid unit scores.
df_preds = pd.DataFrame(predictions, columns=['b_has tumor'])
df_preds.head()

In [None]:
# Integer class label of every sample in the generator.
test_generator.classes

In [None]:
# Ground-truth labels and predicted scores used by the metric cells below.
y_true = test_generator.classes
y_pred = df_preds['b_has tumor']

# ROC score

In [None]:
from sklearn.metrics import roc_curve, auc

# The model ends in a sigmoid unit, so y_pred already holds probabilities.
# Applying exp() (only appropriate for log-probabilities) would leave the AUC
# unchanged but make `probs` and `thresholds` something other than probabilities.
probs = y_pred
# ROC curve, treating label 1 as the positive class
fpr, tpr, thresholds = roc_curve(y_true, probs, pos_label=1)

# Area under the ROC curve
roc_auc = auc(fpr, tpr)
print('ROC area is {0}'.format(roc_auc))

In [None]:
# ROC curve with the chance diagonal as reference.
fig, ax = plt.subplots()
roc_label = 'ROC curve (area = %0.2f)' % roc_auc
ax.plot(fpr, tpr, color='darkorange', label=roc_label)
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlim([-0.01, 1.0])
ax.set_ylim([0.0, 1.01])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")

# confusion matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm: 2-D confusion matrix array (rows are true labels, columns predicted).
        classes: Tick labels, one per class, in matrix order.
        normalize: If true, divide every row by its sum before printing/plotting.
        title: Title of the plot.
        cmap: Matplotlib colour map used for the heat map.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    # Write each cell value on the heat map, switching the text colour so it
    # stays readable on dark cells.
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            cell_value = cm[row, col]
            text_color = "white" if cell_value > half_max else "black"
            plt.text(col, row, format(cell_value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Turn the predicted probabilities into hard 0/1 labels with a 0.5 cut-off.
# A single vectorised comparison replaces the element-by-element Python loop.
predictions = (predictions.flatten() >= 0.5).astype(int)

test_labels = np.array(test_generator.classes)
cm = confusion_matrix(test_labels, predictions)
test_generator.class_indices

In [None]:
# Class names listed in index order (index 0 first), used as axis tick labels.
cm_plot_labels = ['a_no tumor', 'b_has tumor']

plot_confusion_matrix(cm, cm_plot_labels, title='Confusion Matrix')

# 分类报告

In [None]:
from sklearn.metrics import classification_report

# Generate a per-class precision / recall / F1 report.

# classification_report needs hard labels, not probabilities; `predictions`
# was already thresholded to 0/1 in the confusion-matrix cell.
y_pred_binary = predictions

report = classification_report(y_true, y_pred_binary, target_names=cm_plot_labels)

print(report)

# 提交

In [None]:
# flow_from_directory expects the images inside a sub-directory of the
# directory it is given, so copy the test images into <test_folder>/test.
src = "../input/test"

test_folder = "../test_folder"
dst = os.path.join(test_folder, "test")
# makedirs(exist_ok=True) creates both levels and does not fail when the cell
# is run a second time.
os.makedirs(dst, exist_ok=True)

file_list = os.listdir(src)
for filename in tqdm(file_list):
    copyfile(os.path.join(src, filename), os.path.join(dst, filename))

test_datagen = ImageDataGenerator(
    rescale=1. / 255)

# batch_size=1 and shuffle=False keep one prediction per file, in the same
# order as test_generator.filenames.
test_generator = test_datagen.flow_from_directory(
    directory=test_folder,
    target_size=target_size,
    batch_size=1,
    shuffle=False,
    class_mode='binary'
)

In [None]:
# Run the model over every image yielded by the test generator.
pred = VGG16_model.predict_generator(test_generator,verbose=1)

In [None]:
# Write the submission file: one "id,label" row per test image, where label is
# the predicted score. The with-block guarantees the file is flushed and
# closed even if a write fails part-way through.
with open("sample_submission.csv", "w") as csv_file:
    csv_file.write("id,label\n")
    for filename, prediction in zip(test_generator.filenames, pred):
        # Generator filenames include the sub-directory and extension;
        # keep only the bare image id.
        name = os.path.splitext(os.path.basename(filename))[0]
        csv_file.write(str(name)+","+str(prediction[0])+"\n")