In [1]:
import pandas as pd
import numpy as np
import os
import random
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
%matplotlib inline

# Chapter 2: build the raw dataset from the per-participant label and sensor files

In [10]:
def chapter2(path=r'C:/Users/MICK/Desktop/ML4QS/ML4QS/Python3Code/datasets/Assignment3/',
             filenames=('heart_rate', 'motion'),
             output_file='chapter2_result.csv'):
    """Build the 10-second-granularity dataset and write it to a CSV file.

    Every sub-directory of ``path`` is treated as one participant. For each
    participant, ``labels.csv`` is put on a 10-second grid and merged (inner
    join on the timestamp) with the 10-second mean and standard deviation of
    every sensor file listed in ``filenames``. The per-participant frames are
    concatenated, the ``label`` column is one-hot encoded, rows labelled
    ``-1.0`` are removed and label ``4.0`` is folded into ``label_3.0``.

    Parameters
    ----------
    path : str
        Directory holding one sub-directory per participant.
    filenames : sequence of str
        Base names (without ``.csv``) of the sensor files to aggregate.
    output_file : str
        Destination of the resulting CSV file.

    Raises
    ------
    ValueError
        If ``path`` contains no participant directories.
    KeyError
        If the labels ``-1.0``, ``3.0`` or ``4.0`` never occur in the data.
    """
    # Only directories are participants; sorting keeps the run order stable.
    person_ids = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )

    dfs = []
    for i, id_ in enumerate(person_ids):
        print(i, id_)
        person_dir = os.path.join(path, id_)

        df = pd.read_csv(os.path.join(person_dir, 'labels.csv'))
        df.index = pd.to_datetime(df['time'], unit='s')
        df = df[df['time'] > 0].drop(columns='time')
        # Put the labels on a 10 s grid; each label is spread to at most one
        # neighbouring slot on either side.
        df = df.resample('10s').asfreq().bfill(limit=1).ffill(limit=1)
        # Assign the id after resampling so that grid rows the limited fills
        # could not reach still carry their participant id.
        df['personid'] = id_

        for filename in filenames:
            dfx = pd.read_csv(os.path.join(person_dir, f'{filename}.csv'))
            dfx.index = pd.to_datetime(dfx['time'], unit='s')
            dfx = dfx[dfx['time'] > 0].drop(columns='time')

            windows = dfx.resample('10s')
            dfmean = windows.mean()
            dfstd = windows.std().add_suffix('_std')

            dfy = pd.merge(dfmean, dfstd, left_index=True, right_index=True)
            df = pd.merge(df, dfy, left_index=True, right_index=True)

        dfs.append(df)

    if not dfs:
        raise ValueError(f'No participant directories found in {path!r}')

    dataset = pd.concat(dfs, sort=False)
    # One-hot encode the label. Rows whose label is still missing produce an
    # all-zero 'label_nan' column, exactly as before.
    for label in dataset['label'].unique():
        dataset[f'label_{label}'] = np.where(dataset['label'] == label, 1, 0)

    dataset = dataset.drop(columns='label')
    # Drop the rows labelled -1.0.
    dataset = dataset[dataset['label_-1.0'] == 0].drop(columns='label_-1.0')
    # Fold label 4.0 into label 3.0.
    dataset['label_3.0'] = np.where(dataset['label_4.0'] == 1, 1, dataset['label_3.0'])
    dataset = dataset.drop(columns='label_4.0')
    dataset.reset_index().to_csv(output_file, index=False)

In [11]:
# Build the dataset; prints one "<index> <directory name>" line per processed entry
# and writes chapter2_result.csv.
chapter2()

0 1066528
1 1360686
2 1449548
3 1455390
4 1818471
5 2598705
6 2638030
7 3509524
8 3997827
9 4018081
10 4314139
11 4426783
12 46343
13 5132496
14 5383425
15 5498603
16 5797046
17 6220552
18 759667
19 7749105
20 781756
21 8000685
22 8173033
23 8258170
24 844359
25 8530312
26 8686948
27 8692923
28 9106476
29 9618981
30 9961348


In [None]:
# Directory with the intermediate results (hard-coded local path).
path = r'C:/Users/MICK/Desktop/ML4QS/ML4QS/Python3Code/intermediate_datafiles/Assignment3/'
# Load the chapter 5 result file that the modelling cells below work on.
df = pd.read_csv(f'{path}chapter5_result.csv')

In [None]:
# Summary statistics of the first ten columns only, to keep the output readable.
first_ten_columns = df.columns[:10]
df.loc[:, first_ten_columns].describe()

In [None]:
# Collapse the one-hot label columns into a single 'target' column that holds
# the name of the column with the highest value in each row.
labels = ['label_0', 'label_1', 'label_2', 'label_3', 'label_5', 'label_-1', 'label_4']
df['target'] = df[labels].idxmax(axis=1)
# Keyword form: passing the axis positionally is rejected by pandas >= 2.0.
df = df.drop(columns=labels)

In [None]:
# Hold out complete participants so that no person appears in both sets.
N_TEST_PERSONS = 11
TEST_SPLIT_SEED = 1001  # fixed seed: the same participants are held out on every run

test_ids = random.Random(TEST_SPLIT_SEED).sample(list(df['personid'].unique()), N_TEST_PERSONS)
is_test = df['personid'].isin(test_ids)

# drop() returns new frames, so no filtered slice is modified in place
# (avoids SettingWithCopyWarning and the removed positional-axis argument).
testset = df[is_test].drop(columns=['time', 'personid'])
df = df[~is_test].drop(columns=['time', 'personid'])

In [None]:
def kfold_lightgbm(train_df, test_df, num_folds, stratified = False, debug= False, seed = 1001):
    """Train a LightGBM classifier with K-fold cross-validation and score a held-out set.

    Every column of ``train_df`` except 'personid', 'time' and 'target' is
    used as a feature. One model is fitted per fold; its predictions fill the
    out-of-fold matrix for the validation rows, and the test-set predictions
    are averaged over all folds.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain a 'target' column.
    test_df : pandas.DataFrame
        Held-out data with the same feature columns and a 'target' column.
    num_folds : int
        Number of cross-validation folds.
    stratified : bool
        Use ``StratifiedKFold`` instead of ``KFold``.
    debug : bool
        Currently unused; kept for interface compatibility.
    seed : int
        Random state of the fold splitter.

    Returns
    -------
    tuple
        ``(split_importances, gain_importances, sub_preds, class_order)``.
        The two importance frames have the columns 'feature', 'importance'
        and 'fold'. ``sub_preds`` is the fold-averaged class-probability
        matrix for ``test_df``; its columns follow ``class_order``, the sorted
        unique values of ``train_df['target']``.
    """
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

    feats = [f for f in train_df.columns if f not in ['personid', 'time', 'target']]

    # Fix the class -> column mapping once, from the full training target, so
    # that every fold writes into the same columns even if a fold's training
    # part happens to miss a class.
    class_order = np.unique(train_df['target'])
    class_column = {label: col for col, label in enumerate(class_order)}
    n_classes = len(class_order)

    # Zero-initialised: test predictions are accumulated, and a class unseen
    # in a fold must read as probability 0 rather than uninitialised memory.
    oof_preds = np.zeros((train_df.shape[0], n_classes))
    sub_preds = np.zeros((test_df.shape[0], n_classes))
    split_importances = []
    gain_importances = []

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['target'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        # NOTE(review): `silent` and the `verbose` / `early_stopping_rounds`
        # fit arguments belong to the pre-4.0 LightGBM API - confirm the
        # installed version before upgrading.
        clf = LGBMClassifier(
            metric = 'multi_error',
            nthread=-1,
            n_estimators=10000,
            learning_rate=0.15,
            num_leaves=4,
            colsample_bytree=.1,
            subsample=0.4,
            max_depth=3,
            reg_alpha=1,
            reg_lambda=1.2,
            min_child_samples = 1000,
            min_child_weight = 120,
            min_data_per_group = 1000,
            min_data_in_bin = 500,
            silent=-1,
            verbose=-1)

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'multi_error', verbose= 100, early_stopping_rounds= 200)

        # Columns of this fold's probability output within the global class order.
        fold_cols = [class_column[label] for label in clf.classes_]

        oof_preds[np.ix_(valid_idx, fold_cols)] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)

        # Average the test predictions over the folds instead of keeping only
        # the last fold's output.
        sub_preds[:, fold_cols] += clf.predict_proba(
            test_df[feats], num_iteration=clf.best_iteration_) / num_folds

        split_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.booster_.feature_importance(importance_type = 'split'),
            "fold": n_fold + 1,
        }))
        gain_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.booster_.feature_importance(importance_type = 'gain'),
            "fold": n_fold + 1,
        }))

        # `labels` keeps log_loss well-defined when a subset lacks a class.
        print('Fold %2d Logloss : %.6f' % (n_fold + 1,
                                          log_loss(valid_y, oof_preds[valid_idx], labels=class_order)))
        print()

    # Concatenate once instead of growing a frame inside the loop.
    feature_importance_split_df = pd.concat(split_importances, axis=0)
    feature_importance_gain_df = pd.concat(gain_importances, axis=0)

    print('Full Logloss Training score %.6f' % log_loss(train_df['target'], oof_preds, labels=class_order))
    print('Full Logloss Test score %.6f' % log_loss(test_df['target'], sub_preds, labels=class_order))

    display_importances(feature_importance_split_df)
    display_importances(feature_importance_gain_df)
    return feature_importance_split_df, feature_importance_gain_df, sub_preds, class_order

# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Draw a horizontal bar plot of the most important features.

    ``feature_importance_df_`` needs the columns 'feature' and 'importance'.
    The 40 features with the highest mean importance are selected, and all of
    their rows are handed to seaborn's ``barplot``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index

    is_top_feature = feature_importance_df_.feature.isin(top_features)
    plot_data = feature_importance_df_.loc[is_top_feature].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_data)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()

In [None]:
# Run 5-fold cross-validation on the training participants and predict the held-out set.
# `preds` holds class probabilities for `testset`; `order` holds the class labels
# returned alongside them.
split, gain, preds, order = kfold_lightgbm(df, testset, 5)

In [None]:
# Map each column index of the probability matrix to its class label.
adict = dict(enumerate(order))

# Replace the probability matrix with the most likely class label per row.
best_columns = preds.argmax(axis=1)
preds = [adict[column] for column in best_columns]

In [None]:
# scikit-learn expects (y_true, y_pred): rows are the true classes, columns the predictions.
confusion_matrix(testset['target'], preds)