# 偉裕生技 銀耳專案

In [1]:
import os
import fnmatch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Root data directory. Later cells read the per-folder `.xlsx` label sheets,
# the `image_<CanSN>.jpg` files and `T8_train.csv` from it. The trailing slash
# matters because the CSV path is built by plain string concatenation.
rootpath = '/media/share/data/Tremella_fuciformis/'

# Data cleaning

In [None]:
def label_cleaning(root_path, exclude=('T8-170804',)):
    """Collect the per-folder Excel label sheets under ``root_path`` into one table.

    Every sub-directory of ``root_path`` that is not listed in ``exclude`` is
    expected to hold at least one ``*.xlsx`` workbook. The first workbook
    returned by ``os.listdir`` is read with its second row as the header,
    four columns are kept, rows with a missing value in any of them are
    dropped, and the folder name is prepended as the first column.

    Parameters
    ----------
    root_path : str
        Directory whose sub-directories each hold a label workbook.
    exclude : str or iterable of str, optional
        Sub-directory name(s) to skip. Defaults to ``('T8-170804',)``, the
        folder the notebook's error notes list as unusable.

    Returns
    -------
    pandas.DataFrame
        Columns ``SN`` (folder name), ``CanSN``, ``Weight_mean``,
        ``Weight_std`` and ``Variety``. The index restarts at 0 for every
        folder, so callers needing a unique index must reset it. When no
        folder qualifies, an empty frame with these columns is returned.

    Raises
    ------
    IndexError
        If a processed sub-directory contains no ``*.xlsx`` file.
    """
    source_columns = ['號碼.1', '採收重量', 'Unnamed: 28', '新品種']
    output_columns = ['SN', 'CanSN', 'Weight_mean', 'Weight_std', 'Variety']

    if exclude is None:
        exclude = ()
    elif isinstance(exclude, str):
        exclude = (exclude,)
    skipped = set(exclude)

    frames = []
    for name in os.listdir(root_path):
        subdirpath = os.path.join(root_path, name)
        # The root also holds plain files (e.g. exported CSVs); only folders carry label sheets.
        if name in skipped or not os.path.isdir(subdirpath):
            continue

        workbooks = fnmatch.filter(os.listdir(subdirpath), '*.xlsx')
        sheet = pd.read_excel(os.path.join(subdirpath, workbooks[0]), header=1)

        labels = sheet[source_columns].dropna().reset_index(drop=True)
        # Tag every row with the folder it came from.
        labels.insert(0, 'sn', name)
        frames.append(labels)

    if not frames:
        return pd.DataFrame(columns=output_columns)

    # Concatenate once at the end instead of growing the frame inside the loop.
    df_all = pd.concat(frames, axis=0)
    df_all.columns = output_columns
    return df_all

In [None]:
# Build the label table from the per-folder workbooks.
df = label_cleaning(rootpath)

# Some CanSN cells are not plain digits; map each such value onto its number.
# Series.replace matches whole cell values here, so '十' does not touch '十一'.
cansn_fixes = {'九': '9', '十': '10', '十一': '11', '6無單': '6'}
df.CanSN = df.CanSN.replace(cansn_fixes)

# Working copy with a fresh 0..n-1 index (the raw table restarts at 0 per folder).
df_train = df.reset_index(drop=True)
df_train.head(5)

# EDA

In [None]:
# Summary statistics of the label table (pandas picks the described columns by dtype).
df.describe()

In [None]:
# Binary weight class used as the training target:
# 'F' when Weight_mean is below the cut-off, 'C' otherwise.
# The weights are read from df_train itself (not from `df`) so the labels stay
# aligned with the rows even if rows have since been dropped from df_train.
# NOTE(review): the column is called 'c50p' but the cut-off is 60 — confirm which is intended.
WEIGHT_THRESHOLD = 60

c50p = np.where(df_train['Weight_mean'] < WEIGHT_THRESHOLD, 'F', 'C').tolist()
df_train['c50p'] = c50p

## ERROR NOTES
- T8-170705：CanSN 'S3' 沒有對應的影像，排除此筆資料
- T8-170804：影像編號與檔案編號對不上，排除此資料夾

In [None]:
# Remove the row labelled 7 and renumber the index.
# NOTE(review): presumably this is the T8-170705 / 'S3' sample listed in the
# error notes — confirm before relying on the hard-coded position.
# Not idempotent: re-running this cell drops whichever row is labelled 7 at that time.
# `axis` is passed by keyword because recent pandas rejects it positionally.
df_train = df_train.drop([7], axis=0).reset_index(drop=True)

In [2]:
from skimage.transform import resize
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
# Fixed label encoding shared by every img_gen() call.
# LabelEncoder sorts its classes, so 'C' -> 0 and 'F' -> 1.
le = LabelEncoder().fit(['C', 'F'])

def img_gen(indexlist):
    """Load, resize and mean-centre the images for the given ``df_train`` rows.

    For every index label in ``indexlist`` the file
    ``<rootpath>/<SN>/image_<CanSN>.jpg`` is read, its two spatial axes are
    swapped when it is taller than wide, and it is resized to 256x256x3.

    Returns a tuple ``(images, labels)``: the stacked images with the single
    scalar mean of this batch subtracted, and the one-hot encoded ``c50p``
    classes (two columns).
    """
    images = []
    classes = []
    for row in indexlist:
        folder = os.path.join(rootpath, df_train['SN'][row])
        # Keep only the part of CanSN before a dot
        # (presumably numeric ids arrive as floats such as 5.0 — verify).
        can_id = str(df_train['CanSN'][row]).split('.')[0]
        picture = plt.imread(folder + '/image_' + can_id + '.jpg')

        height, width = picture.shape[0], picture.shape[1]
        if height > width:
            # Swap rows and columns so every image ends up wider than tall.
            picture = np.transpose(picture, (1, 0, 2))

        images.append(resize(picture, (256, 256, 3), mode='reflect'))
        classes.append(df_train['c50p'][row])

    # Centre on one scalar mean taken over every pixel of this batch.
    stacked = np.array(images)
    centred = stacked - np.mean(stacked)
    one_hot = to_categorical(le.transform(classes), num_classes=2)
    return centred, one_hot

Using TensorFlow backend.


In [None]:
 # Show the current training table for a visual check (renders every row).
 df_train

# Densenet

In [3]:
import keras.backend as K
from keras.applications import densenet
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import datetime

# Reset Keras' global state so re-running this cell starts from a clean graph.
K.clear_session()

# ImageNet-pretrained DenseNet121 without its original classifier head.
base_model = densenet.DenseNet121(
    weights='imagenet',
    include_top=False,
    input_shape=(256, 256, 3),
)

# New head: global average pooling followed by a 2-way softmax.
pooled_features = GlobalAveragePooling2D()(base_model.output)
class_probs = Dense(2, activation='softmax')(pooled_features)

model = Model(inputs=base_model.input, outputs=class_probs)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 256, 256, 3)  0                                            
__________________________________________________________________________________________________
zero_padding2d_1 (ZeroPadding2D (None, 262, 262, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, 128, 128, 64) 9408        zero_padding2d_1[0][0]           
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, 128, 128, 64) 256         conv1/conv[0][0]                 
__________________________________________________________________________________________________
conv1/relu

# Training

In [5]:
from sklearn.model_selection import StratifiedKFold
# Number of stratified folds (the misspelt name is kept in case other cells use it).
n_flod = 5
# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42

# NOTE(review): this replaces the df_train built above with a CSV that no
# visible cell writes — confirm how 'T8_train.csv' was produced.
df_train = pd.read_csv(rootpath + 'T8_train.csv')

skf = StratifiedKFold(n_splits=n_flod, shuffle=True, random_state=SPLIT_SEED)
# Only the first fold is used as the train/validation split.
train_idx, val_idx = next(skf.split(df_train, df_train['c50p']))

print(len(train_idx))

92


In [None]:
# Augmentation only: random rotation, zoom and horizontal/vertical flips.
# No `rescale` here. img_gen() already returns float images from skimage's
# resize (range [0, 1] before mean-centring), and the evaluation and
# prediction cells feed the model arrays that never pass through this
# generator. Rescaling only the training stream made training inputs 255x
# smaller than the inputs used at inference.
train_gen = ImageDataGenerator(rotation_range=45,
                               zoom_range=[0.8, 1.2],
                               horizontal_flip=True,
                               vertical_flip=True)

In [None]:
# Materialise both folds in memory as (images, one-hot labels).
# NOTE(review): img_gen() subtracts the mean of whatever batch it is given, so
# the two folds are centred with different means — confirm this is intended.
X_train, y_train = img_gen(train_idx)
X_val, y_val = img_gen(val_idx)

In [None]:
# Mean image of the training fold (average over the sample axis).
# `test` is not referenced by any later cell visible in this notebook.
test = np.mean(X_train, axis=0)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights, one per class in the sorted order of np.unique
# (same order as the LabelEncoder: 'C', then 'F').
# `classes` and `y` are passed by keyword because current scikit-learn
# rejects them as positional arguments.
# NOTE(review): the weights are computed over all of df_train rather than the
# training fold, and no visible cell passes them to the model — confirm.
class_weight = compute_class_weight(class_weight='balanced',
                                    classes=np.unique(df_train['c50p']),
                                    y=df_train['c50p'])

In [None]:
# Keep only the weights of the best epoch so far, judged by validation accuracy.
model_checkpoint = ModelCheckpoint('fungus2-dsnet-{epoch:02d}-{val_acc:.4f}.hdf5',
                                   monitor='val_acc', save_best_only=True, save_weights_only=True)

# Stop once validation accuracy has not improved for 16 consecutive epochs.
model_earlystop = EarlyStopping(patience=16, monitor='val_acc')

batch_size = 32

# Validation must not be randomly augmented, otherwise val_acc (which drives
# both callbacks above) changes from epoch to epoch for reasons unrelated to
# the model. Only the training generator's rescale factor is reused so both
# streams stay on the same scale.
val_gen = ImageDataGenerator(rescale=train_gen.rescale)
# Exactly one ordered pass over the validation fold per epoch.
validation_steps = int(np.ceil(len(X_val) / float(batch_size)))

# NOTE(review): steps_per_epoch=len(X_train) draws len(X_train) batches of 32
# per epoch, i.e. many augmented passes over the data — confirm this is intended.
train_history = model.fit_generator(train_gen.flow(X_train, y_train, batch_size),
                                    epochs=2**7, steps_per_epoch=len(X_train),
                                    validation_data=val_gen.flow(X_val, y_val, batch_size, shuffle=False),
                                    validation_steps=validation_steps,
                                    verbose=2, callbacks=[model_checkpoint, model_earlystop])

# Evaluation

In [None]:
# Accuracy per epoch; the series labelled 'test' is the validation accuracy.
fig, ax = plt.subplots()
for metric in ('acc', 'val_acc'):
    ax.plot(train_history.history[metric])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch; the series labelled 'test' is the validation loss.
fig, ax = plt.subplots()
for metric in ('loss', 'val_loss'):
    ax.plot(train_history.history[metric])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Restore weights from a saved checkpoint, then score the validation fold
# (returns the loss followed by the compiled 'acc' metric).
# NOTE(review): this file name starts with 'fungus-' while the training cell
# writes 'fungus2-...' checkpoints — confirm the file comes from the intended run.
model.load_weights('fungus-dsnet-48-0.8698.hdf5')
model.evaluate(X_val, y_val)

In [None]:
from sklearn.metrics import classification_report

# Collapse one-hot targets and softmax outputs to class indices, then report
# per-class precision / recall / F1 on the validation fold.
y_true = y_val.argmax(axis=-1)
y_pred = model.predict(X_val).argmax(axis=-1)
report = classification_report(y_true, y_pred, target_names=['A-60', 'B-60'])
print(report)

In [None]:
# Persist the whole model to one HDF5 file in the current working directory.
model.save('Tremella_fuciformis.hdf5')

# Prediction

In [None]:
# Build a single-image input for the model.
# NOTE(review): img_gen() additionally rotates portrait images and subtracts a
# batch mean; neither step is applied here — confirm the intended preprocessing.
img = plt.imread('/media/share/data/Tremella_fuciformis/T8-170705/image_5.jpg')
img = resize(img, (256, 256, 3), mode='reflect')
# The model takes batches of shape (n, 256, 256, 3), so add the leading batch axis.
img = img[np.newaxis, :, :, :]

In [None]:
# Softmax class probabilities for `img`. The model input is (n, 256, 256, 3),
# so `img` must carry a leading batch axis, i.e. shape (1, 256, 256, 3).
y_pred = model.predict(img)