In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
import csv

In [23]:
# Build the training set: merge every CSV under ./training into ./train.csv.
# (The pipeline `config` that points at ./train.csv is defined in a later cell.)
#
# sorted(): glob returns matches in arbitrary order, and the seeded train/test
# split that later reads ./train.csv depends on row order, so the file order
# is fixed here to keep the merged file reproducible.
interesting_files = sorted(glob.glob("./training/*.csv"))
if not interesting_files:
    # pd.concat on an empty sequence fails with an opaque
    # "No objects to concatenate"; say what is actually missing instead.
    raise FileNotFoundError("no CSV files found in ./training")
df = pd.concat((pd.read_csv(f, header=0) for f in interesting_files),
               ignore_index=True)
# Duplicates are judged on column values only, so rows repeated across
# files collapse to a single row.
df_deduplicated = df.drop_duplicates()
# index=False: the row index carries no information and would otherwise be
# written as an extra unnamed first column of train.csv.
df_deduplicated.to_csv("./train.csv", index=False)

In [15]:
def get_dataset(config):
    """Load the training CSV and split it into a training and a hold-out part.

    Reads ``config['train_path']``, replaces missing values with 0, then
    splits the ``config['feature_names']`` columns and the
    ``config['label_name']`` column with ``train_test_split`` using a
    hold-out fraction of ``config['test_size']`` and ``random_state=42``.

    Returns:
        dict with keys ``'x'`` / ``'y'`` (training part), ``'xtest'`` /
        ``'ytest'`` (hold-out part) and ``'features'`` (the feature names).
    """
    frame = pd.read_csv(config['train_path']).fillna(0)
    feature_cols = config['feature_names']
    label_col = config['label_name']

    x_fit, x_hold, y_fit, y_hold = train_test_split(
        frame[feature_cols],
        frame[label_col],
        test_size=config['test_size'],
        random_state=42,
    )

    return dict(x=x_fit, y=y_fit, xtest=x_hold, ytest=y_hold,
                features=feature_cols)


def get_test_samples(config):
    """Load every CSV file in ``config['test_path']`` as one test sample.

    Only regular files whose name ends in ``.csv`` (case-insensitive) are
    read; sub-directories and other files in the folder are skipped instead
    of being handed to ``pd.read_csv``. Files are visited in sorted name
    order because ``os.listdir`` returns entries in arbitrary order.

    Args:
        config: dict providing ``'test_path'`` (directory to scan),
            ``'feature_names'`` (list of feature columns), ``'label_name'``
            (label column) and ``'depth_name'`` (depth column).

    Returns:
        list of ``(file_name, z, x, y)`` tuples, one per CSV file, where
        ``z`` is the depth column, ``x`` the feature columns and ``y`` the
        label column, each with missing values replaced by 0.

    Raises:
        KeyError: if a CSV file lacks one of the configured columns.
    """
    features = config['feature_names']
    targets = config['label_name']
    depth_name = config['depth_name']
    test_path = config['test_path']

    test_samples = []
    for file_name in sorted(os.listdir(test_path)):
        full_path = os.path.join(test_path, file_name)
        is_csv = file_name.lower().endswith('.csv')
        if not (is_csv and os.path.isfile(full_path)):
            continue  # not a data file: directory, notes, OS metadata, ...
        df = pd.read_csv(full_path)
        test_samples.append((file_name,
                             df[depth_name].fillna(0),
                             df[features].fillna(0),
                             df[targets].fillna(0)))
    return test_samples


def train_rf(data, param):
    """Fit a random forest on the training split and report hold-out quality.

    ``param`` is forwarded unchanged as keyword arguments to
    ``RandomForestClassifier``. After fitting on ``data['x']`` /
    ``data['y']`` the importance of each name in ``data['features']`` is
    printed, followed by a classification report for ``data['xtest']``
    against ``data['ytest']``.

    Returns:
        the fitted classifier.
    """
    forest = RandomForestClassifier(**param)
    forest.fit(data['x'], data['y'])

    print('Importance of Features:')
    ranked = zip(data['features'], forest.feature_importances_)
    for name, weight in ranked:
        print('{}: {}'.format(name, weight))

    holdout_pred = forest.predict(data['xtest'])
    report = classification_report(data['ytest'], holdout_pred)
    print(report)
    return forest


def train_mlp(data, param):
    """Fit a multi-layer perceptron and report hold-out quality.

    ``param`` is forwarded unchanged as keyword arguments to
    ``MLPClassifier``. The network is fitted on ``data['x']`` / ``data['y']``
    and a classification report for ``data['xtest']`` against
    ``data['ytest']`` is printed.

    Returns:
        the fitted classifier.
    """
    network = MLPClassifier(**param)
    network.fit(data['x'], data['y'])

    holdout_pred = network.predict(data['xtest'])
    report = classification_report(data['ytest'], holdout_pred)
    print(report)
    return network


def smooth(series, w=3):
    """Remove isolated label flips with a two-sided majority-vote filter.

    For every position ``i`` that has ``w`` neighbours on both sides, the
    most common value among the ``w`` items before ``i`` is compared with the
    most common value among the ``w`` items after ``i``. When both agree and
    differ from ``series[i]``, ``series[i]`` is replaced by that value;
    otherwise it is left alone. The first and last ``w`` items are never
    modified.

    The pass runs left to right on the list itself, so the "before" window of
    a position already contains the smoothed values of earlier positions.

    Ties inside a window are resolved in favour of the value that occurs
    first in that window. (Picking from a ``set`` instead would make ties
    between string labels depend on hash randomisation, i.e. differ between
    interpreter runs.)

    Args:
        series: list of hashable labels; it is modified in place.
        w: number of neighbours inspected on each side. Values below 1 leave
            ``series`` untouched, since there is no window to vote with.

    Returns:
        the same list object that was passed in.
    """
    # Local import so this definition works regardless of the imports cell.
    from collections import Counter

    if w < 1:
        return series

    def window_mode(window):
        # most_common orders equal counts by first occurrence -> deterministic.
        return Counter(window).most_common(1)[0][0]

    for idx in range(w, len(series) - w):
        before = window_mode(series[idx - w:idx])
        after = window_mode(series[idx + 1:idx + w + 1])
        if before == after and before != series[idx]:
            series[idx] = before
    return series


def plot(clf, config, test_sample):
    """Predict one test file, save the predictions and a comparison figure.

    The features of ``test_sample`` are classified with ``clf``, the
    predicted labels are smoothed with ``smooth`` using
    ``config['window_size']``, and two files are written to
    ``config['output_path']`` (created if missing):

    * ``prediction_<file_name>``: depth, predicted label and true label,
      comma separated, written with ``np.savetxt``.
    * ``<file_name>.png``: a two-column scatter strip over depth, predictions
      at x=0 ("Predicted") and true labels at x=1 ("Real"), coloured by the
      numeric codes in ``config['mapping']``.

    Args:
        clf: fitted classifier exposing ``predict``.
        config: dict providing ``'window_size'``, ``'mapping'``,
            ``'output_path'`` and ``'depth_name'``.
        test_sample: ``(file_name, depth, x, y)`` tuple; ``depth`` and ``y``
            are used as pandas Series, ``x`` is passed to ``clf.predict``.

    Returns:
        None.
    """
    file_name, depth, features, labels = test_sample
    # Assumes every depth value in the file has the same sign as the last one.
    sign = np.sign(depth.iloc[-1])
    yhat = smooth(list(clf.predict(features)), config['window_size'])
    mapping = config['mapping']
    output_path = config['output_path']
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_path, exist_ok=True)
    np.savetxt(os.path.join(output_path, 'prediction_{}'.format(file_name)),
               np.array([depth, yhat, labels]).T, fmt="%s", delimiter=',',
               header=','.join([config['depth_name'], 'predicted', 'true']))

    true = labels.map(mapping)
    pred = pd.Series(yhat).map(mapping)
    # First half of each array: predictions at x=0; second half: truth at x=1.
    columns = np.append(np.zeros_like(true), np.ones_like(pred))
    codes = np.append(pred, true)
    levels = np.append(depth, depth)

    fig, ax1 = plt.subplots(figsize=(2, 8))
    plt.title(file_name)
    # Positional access (.iloc), consistent with depth.iloc[-1] above, so the
    # spacing does not depend on the Series' index labels.
    # NOTE(review): the tick range (sample count / 10, step of 100 samples)
    # looks tuned to one particular sampling interval -- confirm.
    step = depth.iloc[1] - depth.iloc[0]
    tick = np.arange(0, sign * depth.shape[0] / 10, step * 100)
    plt.xticks([0, 1], ["Predicted", "Real"])

    plt.yticks(tick)
    plt.ylabel(config['depth_name'])
    cmap = plt.get_cmap('viridis', len(mapping))
    cax = ax1.scatter(columns, levels, c=codes, s=999, marker='s', cmap=cmap)
    # NOTE(review): if several labels in `mapping` share one code, their
    # colour-bar tick labels land on the same position -- confirm intended.
    cbar = fig.colorbar(cax, ticks=list(mapping.values()))
    cbar.ax.set_yticklabels(list(mapping.keys()))
    plt.savefig(os.path.join(output_path, '{}.png'.format(file_name)),
                bbox_inches="tight")

In [None]:
# Pipeline configuration: a plain dict read by get_dataset, get_test_samples
# and plot.

config = {'train_path': './train.csv',
          'test_path': './testing',
          'output_path': './output',
          'feature_names': ['qc', 'fs', 'u2', 'Level'],
          'label_name': 'Label',
          'depth_name': 'Level',
          'test_size': 0.2,    # hold-out fraction for the train/test split
          'window_size': 8,    # neighbours per side used when smoothing
          # Label -> numeric colour code used in the plots. MD/DM share
          # code 0 and ALL-s/DS share code 4.
          'mapping': {
                        'MD': 0,
                        'DM': 0,
                        'ALL-c (pal)': 1,
                        'ALL-c (unw)': 2,
                        'ALL-c (int)': 3,
                        'ALL-s': 4,
                        'DS': 4,
                        'GRADE V': 5
                      }
          }

data = get_dataset(config)
# random_state pins the forest's randomness. The train/test split is already
# seeded, so without it the report and model would still change on every run.
rf_param = {'n_estimators': 100, 'max_depth': 100, 'random_state': 42}
clf = train_rf(data, rf_param)

####### Uncomment the below two lines for training on mlp ########
#mlp_param = {'hidden_layer_sizes': (10,10,10,10,10,10,10), 'solver': 'adam', 'learning_rate_init': 0.01}
#clf = train_mlp(data, mlp_param)
#################################################################

In [None]:
# Run the trained classifier over every test file: predict, smooth and plot.
test_samples = get_test_samples(config)
for test_sample in test_samples:
    # test_sample[0] is the file name (first element of each sample tuple).
    print('processing {}'.format(test_sample[0]))
    plot(clf, config, test_sample)  # smoothing window comes from config['window_size'] inside plot