In [1]:
import os, gc
import numpy as np
import pandas as pd
from collections import deque
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import average_precision_score

## listup all data

In [2]:
train_defog = os.listdir("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog")
test_defog = os.listdir("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog")
train_tdcsfog = os.listdir("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog")
test_tdcsfog = os.listdir("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog")

There are no duplicate users collected on the learning side

In [3]:
set(train_defog) & set(train_tdcsfog)

set()

Don't know the test side, so just in case I put in the process

In [4]:
if len(set(test_defog) & set(test_tdcsfog)) > 0: # is there?
    test_tdcsfog = list(set(test_tdcsfog) - (set(test_defog) & set(test_tdcsfog)))

## Pre-Feature Engineering Learning Columns and Targets

In [5]:
target_cols = ["StartHesitation","Turn","Walking"]
train_cols = ["Time","AccV","AccML","AccAP"]

## Feature Engineering

function to increase learning columns

In [6]:
def feature_engineering(val, clfs, target=None):
    # Cluster and Dimensional mapping analysis for each data
    if clfs[0] is None:
        clfs[0] = MiniBatchKMeans(n_clusters=10, random_state=0, init="random").fit(val[:,1:])
    km = clfs[0].predict(val[:,1:])
    km_oh = np.zeros((val.shape[0],10), dtype=np.uint8) # discrete value change to One-hot
    for i in range(10):
        idx = np.where(km==0)[0]
        km_oh[idx,i] = 1
    if clfs[1] is None:
        clfs[1] = TruncatedSVD(n_components=2, n_iter=10, random_state=0).fit(val[:,1:])
    svd = clfs[1].transform(val[:,1:])
    # Per-user statistics
    cp = 0
    sp = 0
    usrm = np.zeros((val.shape[0], 5*val.shape[1]-5), dtype=np.float16)
    for i in range(val.shape[0]):
        if cp > val[i,0]:
            for t in range(val.shape[1]-1):
                usrm[sp:i,5*t] = np.mean(val[sp:i,t+1])
                usrm[sp:i,5*t+1] = np.std(val[sp:i,t+1])
                usrm[sp:i,5*t+2] = np.max(val[sp:i,t+1])
                usrm[sp:i,5*t+3] = np.min(val[sp:i,t+1])
                usrm[sp:i,5*t+4] = (i-sp)/val.shape[0]
            sp = i
        cp = val[i,0]
    for t in range(val.shape[1]-1):
        usrm[sp:,5*t] = np.mean(val[sp:,t+1])
        usrm[sp:,5*t+1] = np.std(val[sp:,t+1])
        usrm[sp:,5*t+2] = np.max(val[sp:,t+1])
        usrm[sp:,5*t+3] = np.min(val[sp:,t+1])
        usrm[sp:,5*t+4] = (val.shape[0]-sp)/val.shape[0]
    # Cluster and Dimensional mapping analysis for each user
    if clfs[2] is None:
        clfs[2] = MiniBatchKMeans(n_clusters=10, random_state=0, init="random").fit(usrm)
    kmu = clfs[2].predict(usrm)
    kmu_oh = np.zeros((val.shape[0],10), dtype=np.uint8) # discrete value change to One-hot
    for i in range(10):
        idx = np.where(kmu==0)[0]
        kmu_oh[idx,i] = 1
    if clfs[3] is None:
        clfs[3] = TruncatedSVD(n_components=2, n_iter=10, random_state=0).fit(usrm)
    svdu = clfs[3].transform(usrm)
    # Merge waypoints
    marged = np.hstack([val[:,1:],km_oh,svd])
    # Moving average and variance within the same user
    wnd = np.zeros((val.shape[0],60), dtype=np.float16)
    cp = val[0,0]
    window = deque([marged[0,:15]] * 5)
    for i in range(val.shape[0]):
        if cp > val[i,0]:
            window = deque([marged[i,:15]] * 5)
        else:
            window.popleft()
            window.append(marged[i,:15])
        cp = val[i,0]
        wnd[i] = np.hstack([np.mean(window, axis=0),np.std(window, axis=0),np.min(window, axis=0),np.max(window, axis=0)])
    # Analyze the entire merge data
    usrv = np.hstack([svd,svdu])
    if clfs[4] is None:
        clfs[4] = [LinearRegression().fit(usrv, target[:,i]) for i in range(3)]
    reg = np.stack([clfs[4][i].predict(usrv) for i in range(3)]).transpose((1,0))
    if clfs[5] is None:
        clfs[5] = TruncatedSVD(n_components=2, n_iter=10, random_state=0).fit(marged)
    svdm = clfs[5].transform(marged)
    # Marge all
    return np.hstack([marged,wnd,reg,svdm,usrm,kmu_oh,svdu])

## Optimize function to select learn feature

In [7]:
#from sklearn.tree import DecisionTreeRegressor
def get_regressor(totest=False):
    return Ridge(max_iter=200 if totest else 3000,random_state=0) #DecisionTreeRegressor(max_leaf_nodes=32, random_state=0)

# Optimize which features are used and which are not used among the features created by feature engineering
def optimize_use_feature(val, tgt):
    feature_nums = (3,10,2,15,15,15,15,1,1,1,2,15,10,2)
    feature_test = (0,0, 0,1, 1, 1, 1, 1,1,1,0,0, 0, 0)
    
    # Use less data for speed
    use_test = min(max(int(val.shape[0]*0.05), 100000), val.shape[0])
    kf = StratifiedKFold(n_splits=4, random_state=0, shuffle=True)
    index_scores = np.zeros((3,128))
    for c in range(3): # Use a different Feature for each target
        for fold, (train_index, test_index) in enumerate(kf.split(val,tgt[:,c])):
            train_index = train_index[:use_test]
            # Combination of features to optimize
            for i in range(128): # 7bit because it is a combination of 7 features
                feature_index = list(range(sum(feature_nums[:3])))
                curp = feature_index[-1]+1
                for j in range(7):
                    if ((i>>j)&1) != 0:
                        feature_index += list(range(curp,curp+feature_nums[j+3],1))
                    curp += feature_nums[j+3]
                feature_index += list(range(curp,curp+sum(feature_nums[-4:]),1))
                # run test training
                train_val = val[train_index][:, feature_index]
                train_tgt = tgt[train_index, c]
                test_val = val[test_index][:, feature_index]
                test_tgt = tgt[test_index, c]
                clf = get_regressor(totest=True)
                clf.fit(train_val, train_tgt)
                # make score
                pred = np.clip(np.nan_to_num(clf.predict(test_val)),0,1)
                index_scores[c,i] += average_precision_score(test_tgt, pred)
            break # only 1 fold
    # get best features for each target
    best_feature_index = []
    for c in range(3):
        best_index = np.argmax(index_scores[c])
        feature_index = list(range(sum(feature_nums[:3])))
        curp = feature_index[-1]+1
        for j in range(7):
            if ((best_index>>j)&1) != 0:
                feature_index += list(range(curp,curp+feature_nums[j+3],1))
            curp += feature_nums[j+3]
        feature_index += list(range(curp,curp+sum(feature_nums[-4:]),1))
        best_feature_index.append(feature_index)
    return best_feature_index

## Training and Prediction Function

In [8]:
def training(val, target, cols, grp):
    kfold = GroupKFold(n_splits=5)
    clfs = []
    for i, (train_index, test_index) in enumerate(kfold.split(val, target, grp)):
        clfs.append(get_regressor().fit(val[train_index][:, cols], target[train_index]))
    return clfs

def predict(clfs, val, cols):
    result = None
    for c in clfs:
        result = c.predict(val[:, cols]) if result is None else (c.predict(val[:, cols]) + result)
    return result / len(clfs)

## Get training datas

In [9]:
train_dfs = [pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog/"+i)[train_cols+target_cols] for i in train_defog]
train_grp = sum([[i]*len(d) for i,d in enumerate(train_dfs)], [])
train_val = [i[train_cols].values for i in train_dfs]
train_tgt = [i[target_cols].values for i in train_dfs]
del train_dfs
gc.collect()

0

In [10]:
train_val = np.vstack(train_val)
train_tgt = np.vstack(train_tgt)
gc.collect()

21

In [11]:
defog_trans = [None, None, None, None, None ,None]
train_val = feature_engineering(train_val, defog_trans, target=train_tgt)

  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


In [12]:
defog_features = optimize_use_feature(train_val, train_tgt)

## Learning

In [13]:
train_val = train_val.astype(np.float16) # reduce memory
gc.collect()
defog_clf = [training(train_val,train_tgt[:,i],defog_features[i],train_grp) for i in range(len(target_cols))]

In [14]:
del train_val, train_tgt, train_defog, train_grp
gc.collect()

21

## Get training datas

In [15]:
train_dfs = [pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/"+i)[train_cols+target_cols] for i in train_tdcsfog]
train_grp = sum([[i]*len(d) for i,d in enumerate(train_dfs)], [])
train_val = [i[train_cols].values for i in train_dfs]
train_tgt = [i[target_cols].values for i in train_dfs]
del train_dfs
gc.collect()

0

In [16]:
train_val = np.vstack(train_val)
train_tgt = np.vstack(train_tgt)
gc.collect()

21

In [17]:
tdcsfog_trans = [None, None, None, None, None, None]
train_val = feature_engineering(train_val, tdcsfog_trans, target=train_tgt)

  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


In [18]:
tdcsfog_features = optimize_use_feature(train_val, train_tgt)

## Learning

In [19]:
train_val = train_val.astype(np.float16) # reduce memory
gc.collect()
tdcsfog_clf = [training(train_val,train_tgt[:,i],tdcsfog_features[i],train_grp) for i in range(len(target_cols))]

In [20]:
del train_val, train_tgt, train_tdcsfog, train_grp
gc.collect()

21

## Get Prediction Datas

In [21]:
test_dfs = [pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/"+i)[train_cols] for i in test_defog]
test_val = [i.values for i in test_dfs]
gc.collect()

21

In [22]:
test_val = np.vstack(test_val)
test_val = feature_engineering(test_val, defog_trans)

# Run Predict

In [23]:
test_val = test_val.astype(np.float16) # reduce memory
gc.collect()
test_defog_preds = [np.clip(predict(c, test_val, defog_features[i]), 0, 1) for i,c in enumerate(defog_clf)]

In [24]:
defog_ids = []
for f,d in zip(test_defog,test_dfs):
    fid = f.split(".")[0]
    for t in d.Time.values:
        sid = f"{fid}_{t}"
        defog_ids.append(sid)

In [25]:
del test_defog, test_dfs, test_val
gc.collect()

42

## Get Prediction Datas

In [26]:
test_dfs = [pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog/"+i)[train_cols] for i in test_tdcsfog]
test_val = [i.values for i in test_dfs]
gc.collect()

21

In [27]:
test_val = np.vstack(test_val)
test_val = feature_engineering(test_val, tdcsfog_trans)

## Run Predict

In [28]:
test_val = test_val.astype(np.float16) # reduce memory
gc.collect()
test_tdcsfog_preds = [np.clip(predict(c, test_val, tdcsfog_features[i]), 0, 1) for i,c in enumerate(tdcsfog_clf)]

In [29]:
tdcsfog_ids = []
for f,d in zip(test_tdcsfog,test_dfs):
    fid = f.split(".")[0]
    for t in d.Time.values:
        sid = f"{fid}_{t}"
        tdcsfog_ids.append(sid)

In [30]:
del test_tdcsfog, test_dfs, test_val
gc.collect()

42

## Make Submission File

In [31]:
all_ids = defog_ids + tdcsfog_ids
all_starts = list(test_defog_preds[0]) + list(test_tdcsfog_preds[0])
all_turns = list(test_defog_preds[1]) + list(test_tdcsfog_preds[1])
all_walkings = list(test_defog_preds[2]) + list(test_tdcsfog_preds[2])

In [32]:
df = pd.DataFrame({"Id":all_ids,"StartHesitation":all_starts,"Turn":all_turns,"Walking":all_walkings})
df

Unnamed: 0,Id,StartHesitation,Turn,Walking
0,02ab235146_0,0.0,0.0,1.0
1,02ab235146_1,0.0,0.0,1.0
2,02ab235146_2,0.0,0.0,1.0
3,02ab235146_3,0.0,0.0,1.0
4,02ab235146_4,0.0,0.0,1.0
...,...,...,...,...
286365,003f117e14_4677,1.0,1.0,1.0
286366,003f117e14_4678,1.0,1.0,1.0
286367,003f117e14_4679,1.0,1.0,1.0
286368,003f117e14_4680,1.0,1.0,1.0


In [33]:
df.to_csv("submission.csv", index=False)