In [None]:
import numpy as np 
import pandas as pd 
import os
import glob
from sklearn import cluster
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm
# Root directory of the competition dataset; every path below is built from it.
base_path = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/'

# Entries directly under each training directory ('**' without recursive=True
# matches the same way as '*').
train_defog = glob.glob(f'{base_path}train/defog/**')
train_tdcsfog = glob.glob(f'{base_path}train/tdcsfog/**')

# Subject table, task table and the submission template.
subjects = pd.read_csv(f'{base_path}subjects.csv')
tasks = pd.read_csv(f'{base_path}tasks.csv')
sub = pd.read_csv(f'{base_path}sample_submission.csv')

# Per-recording metadata for each of the two datasets.
tdcsfog_metadata = pd.read_csv(f'{base_path}tdcsfog_metadata.csv')
defog_metadata = pd.read_csv(f'{base_path}defog_metadata.csv')


In [None]:
# Shared pipeline settings.
args = {
    'NFOLD': 5,          # number of cross-validation folds (read into NFOLDS)
    'WINDOW': 5_000,     # not read by any code visible here -- presumably a feature-window length; TODO confirm
    'STRIDE': 5_000,     # not read by any code visible here -- presumably a feature-window stride; TODO confirm
    'NSAMPLE': 1000000,  # maximum number of rows sampled from a fold's train/test frame in train_loop
}

# Fill missing values with each column's median. numeric_only=True restricts the
# median to numeric columns; without it pandas >= 2.0 raises a TypeError as soon as
# the frame holds a non-numeric column (e.g. Sex, which is only converted to
# category codes further down in this cell).
subjects = subjects.fillna(subjects.median(numeric_only=True))
# Remaining null count per column (not displayed: it is not the cell's last expression).
subjects.isnull().sum()

# Length of every task interval, then one row per recording Id with one column per
# Task holding the summed duration (0 where a recording has no such task).
tasks["Duration"] = tasks["End"] - tasks["Begin"]
tasks = tasks.pivot_table(values='Duration', index='Id', columns='Task', aggfunc='sum', fill_value=0)
def get_base_name(path):
    """Return the final component of `path`, cut off at its first '.'.

    Everything from the first dot onwards is dropped, so 'dir/abc.tar.gz'
    yields 'abc' and a name without a dot is returned unchanged.
    """
    file_name = os.path.basename(path)
    stem, _, _ = file_name.partition('.')
    return stem

# Cluster recordings by their per-task duration profile. The cluster id is stored
# as 't_kmeans' because that is the column name format_input selects from `tasks`.
tasks = tasks.reset_index()
task_predict_columns =tasks.columns[1:]
tasks['t_kmeans'] = cluster.KMeans(n_clusters=10, random_state=3).fit_predict(tasks[task_predict_columns])

# Cluster subjects on every column except the first one. Sex is converted to integer
# category codes first so that it can be used as a numeric feature. The cluster id is
# stored as 's_kmeans' because that is the name format_input selects from the metadata.
subect_predict_columns = subjects.columns[1:]
subjects['Sex'] = subjects['Sex'].astype('category').cat.codes
subjects['s_kmeans'] = cluster.KMeans(n_clusters=10, random_state=3).fit_predict(subjects[subect_predict_columns])

# Attach the subject attributes to each recording's metadata and keep only the
# recordings that have a file in the training directory.
# NOTE(review): the merge key is 'Subject' alone, so if `subjects` holds more than one
# row per Subject every matching metadata row is repeated -- confirm this is intended.
defog_metadata['Medication'] = defog_metadata['Medication'].astype('category').cat.codes
defog_metadata = defog_metadata.merge(subjects,how='left',on='Subject')
defog_train_files =[ get_base_name(file) for file in glob.glob(base_path+'train/defog/**')]
defog_meta_train = defog_metadata[defog_metadata['Id'].isin(defog_train_files)].reset_index(drop=True)


tdcsfog_metadata['Medication'] = tdcsfog_metadata['Medication'].astype('category').cat.codes
tdcsfog_metadata = tdcsfog_metadata.merge(subjects,how='left',on='Subject')
tdcsfog_train_files =[ get_base_name(file) for file in glob.glob(base_path+'train/tdcsfog/**')]
tdcsfog_meta_train = tdcsfog_metadata[tdcsfog_metadata['Id'].isin(tdcsfog_train_files)].reset_index(drop=True)

In [None]:


def format_input(df, f, mode):
    """Attach identifiers, metadata and computed features to one recording.

    Parameters
    ----------
    df : pandas.DataFrame
        Sensor rows of a single recording. The frame is modified in place
        (the 'Id' and 'Time_frac' columns are added) before being merged.
    f : str
        Path of the file `df` was read from; its base name becomes the 'Id'.
    mode : str
        'defog' or 'tdcsfog'; selects which metadata table is joined.

    Returns
    -------
    pandas.DataFrame or None
        The enriched frame, or None when any step fails. Failures are
        reported on stdout instead of being swallowed silently.

    Notes
    -----
    Requires a 't_kmeans' column in `tasks` and an 's_kmeans' column in the
    metadata tables. `fcollection` is not defined in the notebook cells shown
    with this function and must exist before the first call.
    NOTE(review): `fcollection` looks like a feature-collection object exposing
    `.calculate(...)` -- confirm where it is created.
    """
    try:
        df["Id"] = get_base_name(f)
        # Position of each row within the recording, relative to the largest index value.
        df['Time_frac'] = (df.index / df.index.max()).values
        # Recordings without a task/metadata match get -1 in the joined columns.
        df = pd.merge(df, tasks[['Id', 't_kmeans']], how='left', on='Id').fillna(-1)
        if mode == "defog":
            meta_cols = ['Id', 'Subject', 'Visit_x', 'Medication', 's_kmeans']
            df = pd.merge(df, defog_metadata[meta_cols], how='left', on='Id').fillna(-1)
        elif mode == "tdcsfog":
            meta_cols = ['Id', 'Subject', 'Visit_x', 'Test', 'Medication', 's_kmeans']
            df = pd.merge(df, tdcsfog_metadata[meta_cols], how='left', on='Id').fillna(-1)
        else:
            raise ValueError(f"mode must be 'defog' or 'tdcsfog', got {mode!r}")
        df_feats = fcollection.calculate(
            df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin"
        ).astype(np.float32)
        df = df.merge(df_feats, how="left", left_index=True, right_index=True)
        # Carry the last available value forward into rows the index join left empty.
        # DataFrame.ffill() replaces the deprecated fillna(method="ffill").
        df = df.ffill()
        return df
    except Exception as exc:
        # Best effort: callers treat None as "skip this recording", but the
        # reason is now visible instead of being discarded.
        print(f"format_input failed for {f!r} (mode={mode!r}): {type(exc).__name__}: {exc}")
        return None
def data_loader(f, mode):
    """Read one training recording and run it through format_input.

    Parameters
    ----------
    f : str
        Path of the CSV file; it must provide the columns 'Time', 'AccV',
        'AccML', 'AccAP', 'StartHesitation', 'Turn' and 'Walking'.
    mode : str
        'defog' or 'tdcsfog', forwarded to format_input.

    Returns
    -------
    pandas.DataFrame or None
        The formatted recording, or None when the file cannot be read or
        formatted. The failure is printed so that skipped files are visible.
    """
    wanted = ['Time', 'AccV', 'AccML', 'AccAP', 'StartHesitation', 'Turn', 'Walking']
    try:
        df = pd.read_csv(f, index_col="Time", usecols=wanted)
        return format_input(df, f, mode)
    except Exception as exc:
        # Keep the best-effort contract (None == skip) without hiding the cause.
        print(f"data_loader failed for {f!r} (mode={mode!r}): {type(exc).__name__}: {exc}")
        return None

In [None]:

from sklearn.model_selection import GroupKFold

# Integer codes written to the `event` column by make_batch, matched in order to the
# StartHesitation, Turn and Walking conditions (rows matching none of them get 0).
choices = [1, 2, 3]
# Number of cross-validation folds, taken from the shared settings.
NFOLDS = args["NFOLD"]

def make_batch(mode):
    """Write the training recordings of one dataset to a fold-partitioned parquet dataset.

    The recordings are split into NFOLDS groups with GroupKFold on the Subject
    column, so all recordings of a subject fall into the same fold. Each fold's
    recordings are loaded, given an integer `event` label and a `batch_id`, and
    written to '<mode>.pq' partitioned by `batch_id`.

    Parameters
    ----------
    mode : str
        'defog' or 'tdcsfog'.

    Returns
    -------
    list of int
        One entry per fold: the total number of rows written across all folds
        minus the rows of that fold, i.e. the rows left when that fold is held out.

    Raises
    ------
    ValueError
        If `mode` is not one of the two supported dataset names.
    """
    import shutil

    if mode == "defog":
        meta_train = defog_meta_train
    elif mode == "tdcsfog":
        meta_train = tdcsfog_meta_train
    else:
        raise ValueError(f"mode must be 'defog' or 'tdcsfog', got {mode!r}")

    # A partitioned write adds files to an existing dataset directory, so a re-run
    # would otherwise duplicate every row. Start from a clean directory.
    shutil.rmtree(f"{mode}.pq", ignore_errors=True)

    folds = list(GroupKFold(NFOLDS).split(meta_train, groups=meta_train.Subject))

    batch_sizes = []
    for batch, (_, held_out_idx) in enumerate(folds):
        # The split yields positional indices; a recording Id may appear on several
        # metadata rows, so duplicates are dropped before building the file list.
        record_ids = meta_train['Id'].iloc[held_out_idx].drop_duplicates()
        batch_list = [f"{mode}/{record_id}.csv" for record_id in record_ids]

        # data_loader returns None for recordings it could not process;
        # pd.concat silently skips those entries.
        train_batch = pd.concat([data_loader(base_path + "train/" + file, mode) for file in tqdm(batch_list)])
        conditions = [
            (train_batch['StartHesitation'] == 1),
            (train_batch['Turn'] == 1),
            (train_batch['Walking'] == 1)]

        # First matching condition wins; rows matching none are labelled 0.
        train_batch['event'] = np.select(conditions, choices, default=0)
        train_batch['batch_id'] = batch
        batch_sizes.append(train_batch.shape[0])
        print(train_batch.shape)
        train_batch.to_parquet(f"{mode}.pq", partition_cols=['batch_id'])

    total_records = sum(batch_sizes)
    return [total_records - batch_size for batch_size in batch_sizes]

# Build the fold-partitioned parquet dataset for each recording type and keep the
# list returned by each call.
defog_batch_sizes = make_batch("defog")
tdcsfog_batch_sizes = make_batch("tdcsfog")


In [None]:
# Disabled Optuna hyper-parameter search for the tdcsfog LightGBM model (kept for reference only).
# !pip install optuna
# import optuna  

# from sklearn.datasets import make_regression
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.model_selection import cross_val_score, cross_validate 
# from sklearn.metrics import mean_squared_error, average_precision_score
# from sklearn.model_selection import train_test_split
# from lightgbm import LGBMRegressor
# import lightgbm as lgb
# from optuna.integration import LightGBMPruningCallback

# from sklearn.model_selection import GroupKFold




# pred_cols = ['StartHesitation', 'Turn' , 'Walking']    

# def objective(trial):
#     lgbm_params ={
#             'boosting_type': 'gbdt',
#             'objective': 'multiclass',
#             'metric' : 'multi_logloss', 
#             'early_stopping_round': 100,
#             'num_class' : 4, 
#             'verbose': -1,
#             "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
#             "max_depth": trial.suggest_int("max_depth", 3, 12),
#             'num_iterations': trial.suggest_int("num_iterations", 1000, 20000, step=500),
#             'max_bin': trial.suggest_int("max_bin", 100, 255, step=20),
#             'num_leaves': trial.suggest_int("num_leaves", 5, 31),
#             "bagging_fraction": trial.suggest_float(
#                 "bagging_fraction", 0.1, 1.0, step=0.1
#             ),
#             "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
#             "feature_fraction": trial.suggest_float(
#                 "feature_fraction", 0.2, 0.95, step=0.1
#             ),
#             "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
#             "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
#            }
#     regressors = []
#     stats=[]
#     for batch in tqdm(range(1), total=1, desc="Folds"):
#         #Set dynamic filters based on batch_id
#         train_filter = [[('batch_id','=',b)] for b in range(NFOLDS) if b != batch]
#         test_filter = [('batch_id','=',batch)]

#         train=pd.read_parquet(f'{mode}.pq',filters=train_filter)
#         if(len(train) > args['NSAMPLE']):
#             train = train.sample(n=args['NSAMPLE']).reset_index(drop=True)
#         test=pd.read_parquet(f'{mode}.pq',filters=test_filter)
#         if(len(test) > args['NSAMPLE']):
#             test = test.sample(n=args['NSAMPLE']).reset_index(drop=True)
#         cols = train.columns.difference(['Id', 'Subject', 'Set', 'Time', 'StartHesitation', 'Turn', 'Walking', 'Valid', 'Task', 'Event', "event", "batch_id"]).tolist()

#         y_test = test['event']
#         y_pcols = test[pred_cols]
#         X_test = test[cols]
#         train_set = lgb.Dataset(train[cols], label=train['event'])
#         test_set = lgb.Dataset(X_test, label=y_test, free_raw_data=False)

#         lgbm_regress = lgb.train(
#             params = lgbm_params,
#             train_set = train_set,
#             valid_sets = [train_set, test_set],
#             callbacks=[
#                 LightGBMPruningCallback(trial, "multi_logloss", valid_name="valid_1")
#             ],  
#             )

#         regressors.append(lgbm_regress)
#         del train_set
#         cv=mean_squared_error(y_pcols, lgbm_regress.predict(X_test)[:,1:].clip(0.0,1.0))
#         stats.append(cv)
#         del test_set
#     return np.mean(stats)

# mode = "tdcsfog" #defog
# study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
# func = lambda trial: objective(trial)
# study.optimize(func, n_trials=50)
# for key, value in study.best_params.items():
#     print(f"\t\t{key}: {value}")
    
    

In [None]:
# Disabled Optuna hyper-parameter search for the defog LightGBM model (kept for reference only).
# !pip install optuna
# import optuna  

# from sklearn.datasets import make_regression
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.model_selection import cross_val_score, cross_validate 
# from sklearn.metrics import mean_squared_error, average_precision_score
# from sklearn.model_selection import train_test_split
# from lightgbm import LGBMRegressor
# import lightgbm as lgb
# from optuna.integration import LightGBMPruningCallback

# from sklearn.model_selection import GroupKFold




# pred_cols = ['StartHesitation', 'Turn' , 'Walking']    

# def objective(trial):
#     lgbm_params ={
#             'boosting_type': 'gbdt',
#             'objective': 'multiclass',
#             'metric' : 'multi_logloss', 
#             'early_stopping_round': 100,
#             'num_class' : 4, 
#             'verbose': -1,
#             "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
#             "max_depth": trial.suggest_int("max_depth", 3, 12),
#             'num_iterations': trial.suggest_int("num_iterations", 1000, 20000, step=500),
#             'max_bin': trial.suggest_int("max_bin", 100, 255, step=20),
#             'num_leaves': trial.suggest_int("num_leaves", 5, 31),
#             "bagging_fraction": trial.suggest_float(
#                 "bagging_fraction", 0.1, 1.0, step=0.1
#             ),
#             "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
#             "feature_fraction": trial.suggest_float(
#                 "feature_fraction", 0.2, 0.95, step=0.1
#             ),
#             "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
#             "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
#            }
#     regressors = []
#     stats=[]
#     for batch in tqdm(range(1), total=1, desc="Folds"):
#         #Set dynamic filters based on batch_id
#         train_filter = [[('batch_id','=',b)] for b in range(NFOLDS) if b != batch]
#         test_filter = [('batch_id','=',batch)]

#         train=pd.read_parquet(f'{mode}.pq',filters=train_filter)
#         if(len(train) > args['NSAMPLE']):
#             train = train.sample(n=args['NSAMPLE']).reset_index(drop=True)
#         test=pd.read_parquet(f'{mode}.pq',filters=test_filter)
#         if(len(test) > args['NSAMPLE']):
#             test = test.sample(n=args['NSAMPLE']).reset_index(drop=True)
#         cols = train.columns.difference(['Id', 'Subject', 'Set', 'Time', 'StartHesitation', 'Turn', 'Walking', 'Valid', 'Task', 'Event', "event", "batch_id"]).tolist()

#         y_test = test['event']
#         y_pcols = test[pred_cols]
#         X_test = test[cols]
#         train_set = lgb.Dataset(train[cols], label=train['event'])
#         test_set = lgb.Dataset(X_test, label=y_test, free_raw_data=False)

#         lgbm_regress = lgb.train(
#             params = lgbm_params,
#             train_set = train_set,
#             valid_sets = [train_set, test_set],
#             callbacks=[
#                 LightGBMPruningCallback(trial, "multi_logloss", valid_name="valid_1")
#             ],  
#             )

#         regressors.append(lgbm_regress)
#         del train_set
#         cv=mean_squared_error(y_pcols, lgbm_regress.predict(X_test)[:,1:].clip(0.0,1.0))
#         stats.append(cv)
#         del test_set
#     return np.mean(stats)

# mode = "defog" #defog
# study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
# func = lambda trial: objective(trial)
# study.optimize(func, n_trials=50)
# for key, value in study.best_params.items():
#     print(f"\t\t{key}: {value}")
    
    

In [None]:
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, cross_validate 
from sklearn.metrics import mean_squared_error, average_precision_score
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
import lightgbm as lgb

from sklearn.model_selection import GroupKFold

# Baseline LightGBM settings. train_loop applies mode-specific values for several
# of these keys before training.
lgbm_params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    metric='multi_logloss',
    early_stopping_round=100,
    num_class=4,  # event codes 0..3 produced by make_batch
    # verbose=-1,
    learning_rate=0.2,
    max_depth=6,
    num_iterations=5000,
    max_bin=240,
    num_leaves=12,
    bagging_fraction=0.75,
    bagging_freq=1,
    feature_fraction=0.50,
    lambda_l1=15,
    lambda_l2=90,
    min_child_weight=3.1,
    extra_trees=True,
)

# Names of the three target columns of the submission.
pred_cols = ['StartHesitation', 'Turn', 'Walking']
# Feature column list; assigned by the inference cell before predicting.
cols = None

# Fitted boosters per dataset, one appended per fold by train_loop.
defog_regressors, tdcsfog_regressors = [], []

# Containers for per-fold statistics (nothing in the visible cells fills them).
defog_stats, tdcsfog_stats = [], []

def train_loop(mode, seed=None):
    """Train one LightGBM multiclass booster per fold for the given dataset.

    For every fold the remaining folds are read from '<mode>.pq' as training
    data and the fold itself as validation data; both are down-sampled to at
    most args['NSAMPLE'] rows. The fitted booster is appended to
    `defog_regressors` or `tdcsfog_regressors`.

    Parameters
    ----------
    mode : str
        'defog' or 'tdcsfog'; selects the parquet dataset, the parameter
        overrides and the list the boosters are appended to.
    seed : int or None, optional
        Passed as `random_state` to the row sampling. The default None keeps
        the sampling unseeded, as before.

    Raises
    ------
    ValueError
        If `mode` is not one of the two supported dataset names.

    Notes
    -----
    The shared `lgbm_params` dict is no longer modified; the mode-specific
    values are applied to a copy. An earlier sketch of several resampling
    rounds per fold (continuing from the previous booster) was fixed at one
    round and never reached its continuation branch, so a single training
    call per fold is made here.
    """
    mode_overrides = {
        "defog": {
            "learning_rate": 0.15,
            "max_depth": 9,
            "num_iterations": 13000,
            "max_bin": 200,
            "num_leaves": 26,
            "bagging_fraction": 0.2,
            "bagging_freq": 1,
            "feature_fraction": 0.8,
            "lambda_l1": 60,
            "lambda_l2": 45,
        },
        "tdcsfog": {
            "learning_rate": 0.08,
            "max_depth": 6,
            "num_iterations": 13000,
            "max_bin": 220,
            "num_leaves": 22,
            "bagging_fraction": 0.6,
            "bagging_freq": 1,
            "feature_fraction": 0.7,
            "lambda_l1": 40,
            "lambda_l2": 15,
        },
    }
    if mode not in mode_overrides:
        raise ValueError(f"mode must be 'defog' or 'tdcsfog', got {mode!r}")

    # Work on a copy so one mode's settings never leak into the shared defaults.
    params = {**lgbm_params, **mode_overrides[mode]}
    fitted_models = defog_regressors if mode == "defog" else tdcsfog_regressors
    # Identifier, label and bookkeeping columns that must not be used as features.
    non_feature_cols = ['Id', 'Subject', 'Set', 'Time', 'StartHesitation', 'Turn', 'Walking',
                        'Valid', 'Task', 'Event', "event", "batch_id"]
    max_rows = args['NSAMPLE']

    for batch in tqdm(range(NFOLDS), total=NFOLDS, desc="Folds"):
        # Training data: every partition except the current fold (OR of single-fold filters).
        train_filter = [[('batch_id', '=', b)] for b in range(NFOLDS) if b != batch]
        test_filter = [('batch_id', '=', batch)]

        train = pd.read_parquet(f'{mode}.pq', filters=train_filter)
        if len(train) > max_rows:
            train = train.sample(n=max_rows, random_state=seed).reset_index(drop=True)
        test = pd.read_parquet(f'{mode}.pq', filters=test_filter)
        if len(test) > max_rows:
            test = test.sample(n=max_rows, random_state=seed).reset_index(drop=True)

        feature_cols = train.columns.difference(non_feature_cols).tolist()
        train_set = lgb.Dataset(train[feature_cols], label=train['event'])
        test_set = lgb.Dataset(test[feature_cols], label=test['event'])

        booster = lgb.train(
            params=params,
            train_set=train_set,
            valid_sets=[train_set, test_set],
            keep_training_booster=True,
        )
        fitted_models.append(booster)

        # Release the fold's datasets before the next fold is loaded.
        del train_set, test_set, train, test
# Train the fold models for each dataset; train_loop appends them to
# tdcsfog_regressors and defog_regressors respectively.
train_loop("tdcsfog")
train_loop("defog")




In [None]:

# Per-file prediction frames are collected here and concatenated at the end.
submissions = []
# Column layout of the submission file.
out_cols = ['Id', 'StartHesitation', 'Turn' , 'Walking']
# Entries directly under each test directory ('**' without recursive=True
# matches the same way as '*').
defog_test_inputs = glob.glob(f'{base_path}test/defog/**')
tdcsfog_test_inputs = glob.glob(f'{base_path}test/tdcsfog/**')

def regressor_mean(regressor, data=None, feature_cols=None):
    """Average the class predictions of several fitted models.

    Each model's `predict` output is expected to have one column per class;
    the first column is dropped, the rest is clipped to [0, 1] and rounded to
    three decimals, and the results are averaged across models.

    Parameters
    ----------
    regressor : sequence
        Fitted models exposing `predict(features)`.
    data : pandas.DataFrame, optional
        Frame holding the feature columns. Defaults to the module-level `df`,
        which is what existing single-argument calls rely on.
    feature_cols : list of str, optional
        Columns of `data` passed to the models. Defaults to the module-level `cols`.

    Returns
    -------
    numpy.ndarray
        Array of shape (rows, classes - 1) with the mean prediction per row.

    Raises
    ------
    ValueError
        If `regressor` contains no model.
    """
    if data is None:
        data = df
    if feature_cols is None:
        feature_cols = cols

    features = data[feature_cols]
    per_model = [
        np.round(model.predict(features)[:, 1:].clip(0.0, 1.0), 3)
        for model in regressor
    ]
    if not per_model:
        raise ValueError("regressor_mean needs at least one fitted model")
    # Stack along a new last axis and average over it (one slice per model).
    return np.mean(np.stack(per_model, axis=2), axis=2)

# Identifier, label and bookkeeping columns that must not be fed to the models.
non_feature_cols = ['Id', 'Subject', 'Set', 'Time', 'StartHesitation', 'Turn', 'Walking',
                    'Valid', 'Task', 'Event', "event", "batch_id"]

# Both datasets go through the same steps; only the mode label, the input files and
# the fold models differ.
for mode, test_files, regressors in (
    ("defog", defog_test_inputs, defog_regressors),
    ("tdcsfog", tdcsfog_test_inputs, tdcsfog_regressors),
):
    for file in test_files:
        df = pd.read_csv(file)
        df.set_index('Time', drop=True, inplace=True)
        df = format_input(df, file, mode)
        if df is None:
            # format_input signals failure with None; the final left merge fills
            # the rows of a skipped file with 0.0.
            print(f"Skipping {file}: input formatting failed, predictions default to 0.0")
            continue
        # regressor_mean reads the module-level `df` and `cols`, so both are set here.
        cols = df.columns.difference(non_feature_cols).tolist()
        combine_res = pd.DataFrame(regressor_mean(regressors), columns=pred_cols)
        df = pd.concat([df, combine_res], axis=1)
        # Submission ids have the form '<recording id>_<row index>'.
        df['Id'] = df['Id'].astype(str) + '_' + df.index.astype(str)
        submissions.append(df[out_cols])

sub['t'] = 0
if submissions:
    submissions = pd.concat(submissions)
else:
    # pd.concat rejects an empty list; fall back to an empty frame so the
    # submission is still written (all rows 0.0).
    print("No test file produced predictions; writing an all-zero submission")
    submissions = pd.DataFrame(columns=out_cols)
# Keep exactly the ids of the template; ids without a prediction get 0.0.
submissions = pd.merge(sub[['Id']], submissions, how='left', on='Id').fillna(0.0)
submissions[out_cols].to_csv('submission.csv', index=False)
submissions.head()