In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
# Print one path per line.
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/public_timeseries_testing_util.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/__init__.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide_310/competition.cpython-310-x86_64-linux-gnu.so
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide_310/__init__.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv
/kaggle/input/amp-parkinsons-disease-progression-pre

# Data preparation (as in project1)

In [2]:
# Directory holding the competition CSV files; kept in one place so the path is not repeated per file.
DATA_DIR = "/kaggle/input/amp-parkinsons-disease-progression-prediction"

# Load the three raw training tables that prepare_dataset later merges per visit.
train_proteins = pd.read_csv(os.path.join(DATA_DIR, "train_proteins.csv"))
train_peptides = pd.read_csv(os.path.join(DATA_DIR, "train_peptides.csv"))
train_clinical = pd.read_csv(os.path.join(DATA_DIR, "train_clinical_data.csv"))

In [3]:
# Binary medication flag: 1 where the recorded clinical state equals 'On', 0 for every other
# value (including missing ones).
train_clinical['on_medication'] = (
    train_clinical['upd23b_clinical_state_on_medication'].eq('On').astype('int64')
)

In [4]:
import warnings
# Ignore every Python warning raised from this point on (presumably to keep cell output readable).
# NOTE(review): this also hides deprecation warnings from pandas / LightGBM — consider a narrower filter.
warnings.filterwarnings("ignore")

In [5]:
def prepare_dataset(train_clinical, train_proteins, train_peptides):
    """Build one row per clinical visit with its protein and peptide measurements as columns.

    Protein `NPX` values are averaged per (visit_id, UniProt) and peptide
    `PeptideAbundance` values per (visit_id, Peptide); each long table is then
    turned into a wide one (one column per protein / peptide) and left-joined
    onto the selected clinical columns by `visit_id`.

    Returns a DataFrame whose columns are: visit_id, updrs_1..updrs_4,
    visit_month, on_medication, then all protein columns, then all peptide
    columns. Visits without measurements keep NaN in those columns.
    """

    def _long_to_wide(long_table, name_col, value_col):
        # Average duplicate measurements, then spread the names out into columns.
        averaged = long_table.groupby(['visit_id', name_col])[value_col].mean()
        return averaged.unstack(name_col).rename_axis(columns=None).reset_index()

    protein_wide = _long_to_wide(train_proteins, 'UniProt', 'NPX')
    peptide_wide = _long_to_wide(train_peptides, 'Peptide', 'PeptideAbundance')

    clinical_columns = ['visit_id', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4',
                        'visit_month', 'on_medication']
    merged = train_clinical[clinical_columns]
    # Left joins keep every clinical visit, measured or not.
    for wide_table in (protein_wide, peptide_wide):
        merged = merged.merge(wide_table, on=['visit_id'], how='left')

    return merged

In [6]:
# Build the merged per-visit table: selected clinical columns plus one column per protein and per peptide.
df = prepare_dataset(train_clinical, train_proteins, train_peptides)   
df

Unnamed: 0,visit_id,updrs_1,updrs_2,updrs_3,updrs_4,visit_month,on_medication,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55_0,10.0,6.0,15.0,,0,0,11254.3,732430.0,39585.8,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55_3,10.0,7.0,25.0,,3,0,,,,...,,,,,,,,,,
2,55_6,8.0,10.0,34.0,,6,0,13163.6,630465.0,35220.8,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
3,55_9,8.0,9.0,30.0,0.0,9,1,,,,...,,,,,,,,,,
4,55_12,10.0,10.0,41.0,0.0,12,1,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043_48,7.0,6.0,13.0,0.0,48,0,10589.6,902434.0,44890.8,...,233567.0,14478.3,3185530.0,48793.0,501159.0,133992.0,170146.0,359045.0,45780.0,17370.6
2611,65043_54,4.0,8.0,11.0,1.0,54,0,,,,...,,,,,,,,,,
2612,65043_60,6.0,6.0,16.0,1.0,60,0,,,,...,,,,,,,,,,
2613,65043_72,3.0,9.0,14.0,1.0,72,0,,,,...,,,,,,,,,,


In [7]:
def smape(A, F):
    """Symmetric mean absolute percentage error, in percent (0 = perfect, 200 = worst).

    Parameters
    ----------
    A : array-like
        Actual values.
    F : array-like
        Forecast values, compared with `A` position by position.

    Returns
    -------
    float
        100 / n * sum(2 * |F - A| / (|A| + |F|)).

    Positions where both the actual and the forecast value are 0 contribute
    an error of 0. Without this the term is 0/0 = NaN and the whole score
    becomes NaN. Empty input raises ZeroDivisionError.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)

    numerator = 2 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)

    # Divide only where the denominator is non-zero; the remaining slots keep the 0 from `out`.
    ratios = np.divide(numerator, denominator,
                       out=np.zeros_like(denominator), where=denominator != 0)

    return 100 / len(actual) * np.sum(ratios)

In [8]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler

In [9]:
def feature_engineering(dataset, min_non_null=610):
    """Drop sparse rows, zero-fill missing feature values and min-max scale every feature.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose first five columns are left untouched (visit_id and the
        four updrs labels in the frame built by prepare_dataset) and whose
        remaining columns are treated as features.
    min_non_null : int, default 610
        Rows with fewer non-null cells than this are dropped
        (610 is about half of the protein/peptide columns).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the same columns. Every
        feature column has NaN replaced by 0 and is then scaled to [0, 1] by
        MinMaxScaler. Nulls in the first five columns are left as they are.

    NOTE(review): the scaler is fitted on all remaining rows, i.e. before any
    train/validation split is made downstream.
    """
    kept = dataset.dropna(thresh=min_non_null)

    id_and_labels = kept.iloc[:, :5]          # nulls in updrs are left for now
    features = kept.iloc[:, 5:].fillna(0)     # plain reassignment instead of chained in-place fillna

    # Nothing to scale (no rows survived or there are no feature columns).
    if features.empty:
        return kept.copy()

    # MinMaxScaler works column by column, so one fit over all features gives
    # the same result as fitting a separate scaler per column.
    scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(features),
        index=kept.index,
        columns=features.columns,
    )

    return pd.concat([id_and_labels, scaled], axis=1)

### first difference: features were normalized

In [10]:
# Apply the sparse-row filter, zero-fill and min-max scaling defined in feature_engineering.
df_eng = feature_engineering(df)
df_eng

Unnamed: 0,visit_id,updrs_1,updrs_2,updrs_3,updrs_4,visit_month,on_medication,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55_0,10.0,6.0,15.0,,0.000000,0.0,0.526842,0.385009,0.597500,...,0.490702,0.596025,0.275001,0.149952,0.145724,0.521437,0.627691,0.461090,0.431722,0.212771
2,55_6,8.0,10.0,34.0,,0.055556,0.0,0.616221,0.326652,0.531615,...,0.417328,0.476999,0.297320,0.159057,0.129209,0.411536,0.547289,0.482796,0.373510,0.295682
4,55_12,10.0,10.0,41.0,0.0,0.111111,1.0,0.714247,0.432313,0.628670,...,0.565382,0.645951,0.395088,0.163127,0.178637,0.544449,0.687913,0.476851,0.510400,0.311923
8,55_36,17.0,18.0,51.0,0.0,0.333333,1.0,0.633411,0.397258,0.649771,...,0.451994,0.671491,0.191957,0.127567,0.170442,0.511251,0.770861,0.525741,0.492377,0.199565
15,942_6,8.0,2.0,21.0,,0.055556,0.0,0.525176,0.194476,0.310645,...,0.552068,0.231286,0.000000,0.080762,0.120699,0.318063,0.301494,0.604482,0.447732,0.223849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,64674_84,11.0,15.0,45.0,4.0,0.777778,0.0,0.000000,0.074842,0.375955,...,0.496471,0.138616,0.353738,0.056569,0.084228,0.195806,0.242507,0.704325,0.358818,0.313465
2600,65043_0,2.0,6.0,16.0,,0.000000,0.0,0.630677,0.496912,0.643924,...,0.627803,0.661954,0.181492,0.072167,0.133070,0.620803,0.596267,0.354934,0.451625,0.155894
2604,65043_12,4.0,7.0,14.0,0.0,0.111111,0.0,0.661690,0.529361,0.437581,...,0.562125,0.603646,0.179103,0.062292,0.136369,0.635433,0.610115,0.348304,0.423131,0.271679
2606,65043_24,4.0,8.0,,0.0,0.222222,0.0,0.686248,0.573641,0.700962,...,0.612842,0.662301,0.212151,0.070966,0.150041,0.588536,0.729900,0.409235,0.607172,0.296913


In [11]:
def divide_dataset(df):   # drop nulls of y in divided datasets
    """Split the frame into four per-label frames, one for each updrs score.

    For every label (updrs_1 .. updrs_4) the rows where that label is null
    are removed, and `visit_id` plus the other three labels are dropped, so
    the label ends up as the first column followed by the feature columns.

    Returns the four frames as a tuple in label order; `df` is not modified.
    """
    labels = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

    per_label = []
    for label in labels:
        # Everything that is neither this label nor a feature.
        unused = ['visit_id'] + [other for other in labels if other != label]
        per_label.append(df.dropna(subset=[label]).drop(columns=unused))

    return tuple(per_label)

# Training and prediction

In [12]:
def get_training_subset(df):
    """Split a per-label frame into train and validation parts (80 % / 20 %).

    The first column of `df` is taken as the target, every other column as a
    feature. The split uses a fixed random_state of 42.

    Returns the tuple (X_train, X_val, y_train, y_val).
    """
    target = df.iloc[:, 0]
    predictors = df.iloc[:, 1:]

    # train_test_split yields [X_train, X_val, y_train, y_val] in that order.
    return tuple(train_test_split(predictors, target, test_size = 0.2, random_state=42))

In [13]:
def get_training_subsets(df1, df2, df3, df4):
    """Return the train/validation split of each of the four per-label frames.

    The result is a list with one (X_train, X_val, y_train, y_val) tuple per
    input frame, in argument order.
    """
    return [get_training_subset(frame) for frame in (df1, df2, df3, df4)]

In [14]:
def predict_label(Xt, Xv, yt, yv):
    """Fit a LightGBM regressor on the training split and score it on the validation split.

    Parameters
    ----------
    Xt, yt : training features and target.
    Xv, yv : validation features and target; also used as the evaluation set
        for early stopping (10 rounds, 'mse' metric).

    Returns
    -------
    The sMAPE (see `smape`) between `yv` and the model's predictions for `Xv`.

    NOTE(review): the same validation data drives early stopping and the
    reported score, so the score may be optimistic.
    """
    lgbm_model = lgb.LGBMRegressor(metric = 'mse', early_stopping_round = 10, random_state = 42, verbose= -100)
    # `fit(verbose=...)` was removed in LightGBM 4.0; a log_evaluation callback with
    # period=0 silences per-iteration logging on both 3.3 and 4.x.
    lgbm_model.fit(Xt, yt, eval_set = [(Xv, yv)], callbacks = [lgb.log_evaluation(period = 0)])
    yp = lgbm_model.predict(Xv)
    return smape(yv, yp)

In [15]:
def predict_labels(data):
    """Train and score one model per label and print the individual and combined sMAPE.

    `data` holds, for each of the four labels, a tuple
    (X_train, X_val, y_train, y_val). The combined score is the average of
    the four sMAPE values weighted by the size of each label's training
    target. Nothing is returned; results are printed.
    """
    # Unpack all four splits first so malformed input fails before anything is printed.
    splits = [(Xt, Xv, yt, yv) for Xt, Xv, yt, yv in (data[pos] for pos in range(4))]

    print("\nPREDICTION OF LABELS")
    print("=======================================")

    scores = [predict_label(Xt, Xv, yt, yv) for Xt, Xv, yt, yv in splits]

    for position, score in enumerate(scores):
        print(f"sMAPE for label {position}: {score}")

    # Weight every label by the number of its training rows.
    train_sizes = np.array([len(yt) for _, _, yt, _ in splits])
    shares = train_sizes / sum(train_sizes)
    weighted_score = sum(shares * np.array(scores))

    print(f"\nsMAPE weighted average of labels: {weighted_score:.4f}")

In [16]:
df1, df2, df3, df4 = divide_dataset(df_eng)         # one frame per updrs label: label column first, then the features
subsets = get_training_subsets(df1, df2, df3, df4)  # train and validation split for each label
predict_labels(subsets)                             # baseline scores on the normalized features


PREDICTION OF LABELS
sMAPE for label 0: 60.92453954800605
sMAPE for label 1: 96.86446697209053
sMAPE for label 2: 83.48040705329346
sMAPE for label 3: 149.0455723512696

sMAPE weighted average of labels: 90.7826


### note: sMAPE a little bit worse than without normalization (90.78 vs 90.52)

# Reduction of dimensions

In [17]:
from sklearn.manifold import LocallyLinearEmbedding

In [18]:
def reduce_features(dataset, components):
    """Replace the feature columns by a LocallyLinearEmbedding of the given size.

    The first five columns of `dataset` are kept unchanged (with a fresh
    0..n-1 index); every remaining column is fed to the embedding. The
    embedded columns are named feature_1 .. feature_<components>.

    Returns a new DataFrame: the five kept columns followed by the embedding.
    """
    id_and_labels = dataset.iloc[:, :5].reset_index(drop=True)

    lle = LocallyLinearEmbedding(n_components=components)
    # The embedding is a plain array, so it lines up positionally with the reset index above.
    embedded_values = lle.fit_transform(dataset.iloc[:, 5:].reset_index(drop=True))

    embedded_names = [f"feature_{number}" for number in range(1, components + 1)]
    embedded = pd.DataFrame(embedded_values, columns=embedded_names)

    return pd.concat([id_and_labels, embedded], axis=1)

### this time the reduction is based on an sklearn model (LocallyLinearEmbedding), not on the ordering of correlations

In [19]:
# Embed the normalized features into 200 components with LocallyLinearEmbedding.
df_red = reduce_features(df_eng, 200)
df_red

Unnamed: 0,visit_id,updrs_1,updrs_2,updrs_3,updrs_4,feature_1,feature_2,feature_3,feature_4,feature_5,...,feature_191,feature_192,feature_193,feature_194,feature_195,feature_196,feature_197,feature_198,feature_199,feature_200
0,55_0,10.0,6.0,15.0,,-0.029483,0.007609,-0.005253,-0.036143,0.021358,...,-0.006273,-0.014014,0.013008,0.014814,0.001044,-0.004016,-0.005724,0.000227,-0.000623,-0.008016
1,55_6,8.0,10.0,34.0,,-0.029423,0.008061,-0.005163,-0.037161,0.023864,...,-0.003533,-0.001917,0.009694,0.000641,-0.011341,0.002867,0.005759,-0.006772,0.015998,0.018127
2,55_12,10.0,10.0,41.0,0.0,-0.029492,0.007535,-0.005271,-0.035975,0.020797,...,-0.004869,-0.013856,0.010170,0.015864,0.004366,-0.005416,-0.007294,0.003828,-0.006994,-0.013018
3,55_36,17.0,18.0,51.0,0.0,-0.029467,0.007738,-0.005226,-0.036462,0.020147,...,-0.006478,-0.016727,0.016060,0.019224,0.004332,-0.005924,-0.010024,0.003234,-0.006795,-0.015854
4,942_6,8.0,2.0,21.0,,-0.029298,0.008994,-0.004982,-0.010041,-0.013215,...,-0.025436,-0.024216,-0.014347,-0.009203,-0.018215,0.025046,-0.009078,0.006711,-0.005357,-0.005679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062,64674_84,11.0,15.0,45.0,4.0,-0.029298,0.008995,-0.004982,0.064592,0.011071,...,-0.015384,-0.047683,-0.030049,-0.012209,-0.010666,0.085271,-0.008025,0.035007,0.026386,0.016849
1063,65043_0,2.0,6.0,16.0,,-0.029282,0.006466,-0.006706,-0.032185,0.009685,...,0.015835,0.023505,-0.013964,-0.011779,0.008268,0.010167,0.026834,0.004998,0.009954,0.013929
1064,65043_12,4.0,7.0,14.0,0.0,-0.029638,0.006221,-0.005633,-0.032786,0.015199,...,0.029906,0.038240,-0.069229,-0.034061,0.007724,0.009131,0.026117,0.012401,-0.018738,0.012964
1065,65043_24,4.0,8.0,,0.0,-0.030090,0.003989,-0.005537,-0.032729,0.014524,...,0.027700,0.035792,-0.066375,-0.034907,0.004785,0.007392,0.022921,0.012363,-0.020963,0.015626


In [20]:
df1, df2, df3, df4 = divide_dataset(df_red)         # one frame per updrs label, now built from the reduced features
subsets = get_training_subsets(df1, df2, df3, df4)  # train and validation split for each label
predict_labels(subsets)                             # scores after dimensionality reduction


PREDICTION OF LABELS
sMAPE for label 0: 57.526158878297615
sMAPE for label 1: 93.38772006842035
sMAPE for label 2: 78.00320495725616
sMAPE for label 3: 143.52478056834542

sMAPE weighted average of labels: 86.4567


### with the number of components set to 200, sMAPE improved noticeably (from 90.78 to 86.46)

# Clustering

In [21]:
from sklearn.cluster import KMeans

In [22]:
def predict_clustered_label(df, clust_nr, verbose = True):
    """Cluster the rows with KMeans and score a separate model inside every cluster.

    Parameters
    ----------
    df : frame whose first column is the label and whose other columns are features.
    clust_nr : number of KMeans clusters.
    verbose : when True, also print one line per cluster.

    KMeans is fitted on the feature columns of the whole frame. Every cluster
    with at least 8 rows is split by get_training_subset and scored with
    predict_label; smaller clusters are recorded with 0 entries and a score
    of 0, so they do not contribute to the average.

    Returns the sMAPE averaged over clusters, weighted by cluster size.
    """
    kmeans = KMeans(n_clusters = clust_nr, init='k-means++', n_init='auto', random_state=42)
    assignments = kmeans.fit_predict(df.iloc[:,1:])

    # Cluster ids come from clustering the full frame, not from a prediction on held-out rows.
    clustered = df.assign(cluster=assignments)

    cluster_ids = np.unique(assignments.tolist())

    sizes = []
    scores = []

    for cluster_id in cluster_ids:
        members = clustered.loc[clustered['cluster'] == cluster_id].drop(columns=['cluster'])

        if len(members) < 8:
            # Too small to split and train on.
            sizes.append(0)
            scores.append(0)
            continue

        X_train, X_val, y_train, y_val = get_training_subset(members)
        scores.append(predict_label(X_train, X_val, y_train, y_val))
        sizes.append(len(members))

    shares = np.array(sizes) / sum(sizes)
    weighted_score = sum(shares * np.array(scores))

    if verbose:
        for cluster_id, score, size in zip(cluster_ids, scores, sizes):
            print(f"CLUSTER {cluster_id}:sMAPE: {score:.4f}, entries: {size}")

    print(f"sMAPE weighted average in clusters: {weighted_score:.4f}, entries: {sum(sizes)}")

    return weighted_score

In [23]:
def predict_clustered_labels(df1, df2, df3, df4, clust_nr_list):
    """Run predict_clustered_label for each of the four per-label frames and print a summary.

    `clust_nr_list[i]` is the number of clusters used for the i-th frame.
    The final figure is the average of the four per-label scores weighted by
    the number of rows in each frame. Nothing is returned.
    """
    frames = tuple(frame.copy() for frame in (df1, df2, df3, df4))

    print("\nPREDICTION OF CLUSTERED LABELS")
    print("=================================================")

    scores = []
    for position, frame in enumerate(frames):
        print(f"\nLABEL: {frame.columns[0]}")
        scores.append(predict_clustered_label(frame, clust_nr_list[position]))
        print("------------------------------------------------")

    row_counts = np.array([len(frame) for frame in frames])
    shares = row_counts / sum(row_counts)
    weighted_score = sum(shares * np.array(scores))

    print(f"\nsMAPE weighted average of labels: {weighted_score:.4f}")

In [24]:
predict_clustered_labels(df1, df2, df3, df4, [3, 3, 3, 3])     # arbitrary cluster count (3 per label) as a first try


PREDICTION OF CLUSTERED LABELS

LABEL: updrs_1
CLUSTER 0:sMAPE: 0.0000, entries: 0
CLUSTER 1:sMAPE: 59.8836, entries: 1043
CLUSTER 2:sMAPE: 120.7051, entries: 21
sMAPE weighted average in clusters: 61.0841, entries: 1064
------------------------------------------------

LABEL: updrs_2
CLUSTER 0:sMAPE: 0.0000, entries: 0
CLUSTER 1:sMAPE: 86.5001, entries: 1043
CLUSTER 2:sMAPE: 178.1818, entries: 21
sMAPE weighted average in clusters: 88.3096, entries: 1064
------------------------------------------------

LABEL: updrs_3
CLUSTER 0:sMAPE: 0.0000, entries: 0
CLUSTER 1:sMAPE: 72.6375, entries: 1045
CLUSTER 2:sMAPE: 32.1952, entries: 8
sMAPE weighted average in clusters: 72.3302, entries: 1053
------------------------------------------------

LABEL: updrs_4
CLUSTER 0:sMAPE: 144.4871, entries: 555
CLUSTER 1:sMAPE: 0.0000, entries: 0
CLUSTER 2:sMAPE: 0.0000, entries: 0
sMAPE weighted average in clusters: 144.4871, entries: 555
------------------------------------------------

sMAPE weighted a

### an arbitrarily chosen number of clusters (3) improved sMAPE a bit (from 86.46 to 84.58)

In [25]:
def search_clusters_for_label(dataset, max_clust):
    """Find the cluster count (1 .. max_clust) with the lowest sMAPE for one label.

    A count of 1 means "no clustering": the frame is split and scored
    directly with predict_label. Counts from 2 to `max_clust` are scored with
    predict_clustered_label.

    Returns (lowest sMAPE, cluster count that produced it). The count stays 0
    if no score is below 200.
    """
    frame = dataset.copy()
    best_score = 200
    best_count = 0

    # Baseline without clustering.
    baseline = predict_label(*get_training_subset(frame))

    print(f"\nsMAPE before clustering: {baseline}")
    if baseline < best_score:
        best_score, best_count = baseline, 1

    for candidate_count in np.arange(2, max_clust+1):
        print(f"number of clusters: {candidate_count}")
        candidate_score = predict_clustered_label(frame, candidate_count, verbose = False)

        if candidate_score < best_score:
            best_score, best_count = candidate_score, candidate_count

    return best_score, best_count

In [26]:
def search_clusters_for_labels(df1, df2, df3, df4, max_clust):
    """Search the best cluster count for each of the four labels and print a summary.

    Every frame is passed to search_clusters_for_label with the same
    `max_clust`. The summary lists the best score and cluster count per
    label, followed by the average of the best scores weighted by the number
    of rows in each frame. Nothing is returned.
    """
    frames = (df1, df2, df3, df4)
    labels = [frame.columns[0] for frame in frames]

    best_per_label = []
    for position, (label, frame) in enumerate(zip(labels, frames)):
        if position > 0:
            # Separator between consecutive searches.
            print("---------------------------------------")
        print(f"\nSearching clusters for label {label}")
        best_per_label.append(search_clusters_for_label(frame, max_clust))

    print("\n\nSUMMARY")
    print("==================================================")
    for label, (best_score, best_count) in zip(labels, best_per_label):
        print(f"\nLABEL {label}")
        print(f"Min sMAPE: {best_score} for clusters number: {best_count}")

    row_counts = np.array([len(frame) for frame in frames])
    shares = row_counts / sum(row_counts)
    weighted_score = sum(shares * np.array([best_score for best_score, _ in best_per_label]))

    print(f"\nsMAPE weighted average of labels: {weighted_score:.4f}")

In [27]:
# For every label, compare no clustering with 2..10 clusters and report the best count.
search_clusters_for_labels(df1, df2, df3, df4, 10)


Searching clusters for label updrs_1

sMAPE before clustering: 57.526158878297615
number of clusters: 2
sMAPE weighted average in clusters: 55.4739, entries: 1064
number of clusters: 3
sMAPE weighted average in clusters: 61.0841, entries: 1064
number of clusters: 4
sMAPE weighted average in clusters: 62.1306, entries: 1064
number of clusters: 5
sMAPE weighted average in clusters: 57.9692, entries: 1064
number of clusters: 6
sMAPE weighted average in clusters: 56.2567, entries: 1057
number of clusters: 7
sMAPE weighted average in clusters: 57.6901, entries: 1051
number of clusters: 8
sMAPE weighted average in clusters: 59.5130, entries: 1058
number of clusters: 9
sMAPE weighted average in clusters: 65.5340, entries: 1058
number of clusters: 10
sMAPE weighted average in clusters: 68.4570, entries: 1058
---------------------------------------

Searching clusters for label updrs_2

sMAPE before clustering: 93.38772006842035
number of clusters: 2
sMAPE weighted average in clusters: 94.9638

### finding optimal number of clusters for each label (2, 10, 9, 8) caused further improvement of sMAPE (from 84.58 to 80.13)

# Prediction of assignment to clusters (calculated for training dataset) for validation data

In [28]:
def get_cluster_model(Xt, Xv, yt, yv):
    """Fit and return a LightGBM classifier that predicts the cluster number from the features.

    Parameters
    ----------
    Xt, yt : training features and their cluster numbers.
    Xv, yv : held-out features and cluster numbers, passed as the evaluation set.

    Returns
    -------
    The fitted `lgb.LGBMClassifier`.
    """
    lgbm_model = lgb.LGBMClassifier(random_state = 42, verbose= -100)
    # `fit(verbose=...)` was removed in LightGBM 4.0; a log_evaluation callback with
    # period=0 silences per-iteration logging on both 3.3 and 4.x.
    lgbm_model.fit(Xt, yt, eval_set = [(Xv, yv)], callbacks = [lgb.log_evaluation(period = 0)])
    return lgbm_model

In [29]:
def predict_clustered_label_2(df, clust_nr, verbose = True):
    """Score per-cluster models where validation rows are assigned to clusters by a classifier.

    Parameters
    ----------
    df : frame whose first column is the label and whose other columns are features.
    clust_nr : number of KMeans clusters.
    verbose : when True, also print one line per cluster.

    Steps: split `df` into train/validation; fit KMeans on the training
    features only; train a classifier (get_cluster_model) on the training
    features to reproduce the KMeans cluster numbers; predict a cluster for
    every validation row; then, per cluster, fit predict_label on the
    training rows of that cluster and score it on the validation rows that
    were predicted into it. Clusters with fewer than 2 predicted validation
    rows are recorded with 0 entries and a score of 0.

    Returns the average of the per-cluster sMAPE values weighted by the
    recorded entries.

    NOTE(review): the entries used as weights are the number of *training*
    rows in each cluster, while the scores are computed on validation rows —
    confirm that this weighting is intended.
    """
    
    # Clusters are calculated for X_train only;
    # the cluster of each row in X_val has to be predicted.
    
    X_train, X_val, y_train, y_val = get_training_subset(df)
    
    model_clust_km = KMeans(n_clusters = clust_nr, init='k-means++', n_init='auto', random_state=42)
    clust_calc = model_clust_km.fit_predict(X_train)   
    df_train = X_train.copy()
    df_train.insert(loc=0, column='cluster', value=clust_calc)   # the cluster number is the new label, so it must be the first column before splitting

    # Inner split of the training rows, used only to fit the cluster classifier.
    X_train_cl, X_val_cl, y_train_cl, y_val_cl = get_training_subset(df_train)
    clust_model = get_cluster_model(X_train_cl, X_val_cl, y_train_cl, y_val_cl)
    clust_pred = clust_model.predict(X_val)
    df_val = X_val.copy()
    df_val.insert(loc=0, column='cluster', value=clust_pred)
    
    # Put the real target in front: column order is now y, cluster, features.
    df_train.insert(loc=0, column='y', value = y_train)
    df_val.insert(loc=0, column='y', value = y_val)
                  
    cl_numbers = clust_calc.tolist()
    clusters = np.unique(cl_numbers)
    
    cl_entries = []
    smape_score = []
    
    for cluster in clusters:
    
        # After dropping 'cluster', column 0 is y and the rest are features.
        df_cl_train = df_train[df_train['cluster'] == cluster].drop(columns=['cluster'])
        df_cl_val = df_val[df_val['cluster'] == cluster].drop(columns=['cluster'])
        
        if len(df_cl_val)>=2:
            smape = predict_label(df_cl_train.iloc[:,1:], df_cl_val.iloc[:,1:], df_cl_train.iloc[:,0], df_cl_val.iloc[:,0])
        
            # Number of training rows KMeans put into this cluster.
            entries = cl_numbers.count(cluster)
            cl_entries.append(entries)
            smape_score.append(smape)
        
        else:
            # Not enough predicted validation rows: excluded from the weighted average.
            cl_entries.append(0)
            smape_score.append(0)
    
    weights = np.array(cl_entries) / sum(cl_entries)
    smape_score_wav = sum(weights * np.array(smape_score))
    
    if verbose:
        for ind, cluster in enumerate(clusters):
            print(f"CLUSTER {cluster}:sMAPE: {smape_score[ind]:.4f}, entries: {cl_entries[ind]}")
    
    print(f"sMAPE weighted average in clusters: {smape_score_wav:.4f}, entries: {sum(cl_entries)}")
    
    return smape_score_wav

In [30]:
def predict_clustered_labels_2(df1, df2, df3, df4, clust_nr_list):
    """Run predict_clustered_label_2 for each of the four per-label frames and print a summary.

    `clust_nr_list[i]` is the number of clusters used for the i-th frame.
    The final figure is the average of the four per-label scores weighted by
    the number of rows in each frame. Nothing is returned.
    """
    frames = tuple(frame.copy() for frame in (df1, df2, df3, df4))

    print("\nPREDICTION OF CLUSTERED LABELS")
    print("=================================================")

    scores = []
    for position, frame in enumerate(frames):
        print(f"\nLABEL: {frame.columns[0]}")
        scores.append(predict_clustered_label_2(frame, clust_nr_list[position]))
        print("------------------------------------------------")

    row_counts = np.array([len(frame) for frame in frames])
    shares = row_counts / sum(row_counts)
    weighted_score = sum(shares * np.array(scores))

    print(f"\nsMAPE weighted average of labels: {weighted_score:.4f}")

In [31]:
# Same arbitrary 3 clusters per label, now with the cluster of each validation row predicted by a classifier.
predict_clustered_labels_2(df1, df2, df3, df4, [3, 3, 3, 3])


PREDICTION OF CLUSTERED LABELS

LABEL: updrs_1
CLUSTER 0:sMAPE: 0.0000, entries: 0
CLUSTER 1:sMAPE: 8.4087, entries: 5
CLUSTER 2:sMAPE: 56.1417, entries: 844
sMAPE weighted average in clusters: 55.8606, entries: 849
------------------------------------------------

LABEL: updrs_2
CLUSTER 0:sMAPE: 0.0000, entries: 0
CLUSTER 1:sMAPE: 0.0000, entries: 5
CLUSTER 2:sMAPE: 93.8219, entries: 844
sMAPE weighted average in clusters: 93.2694, entries: 849
------------------------------------------------

LABEL: updrs_3
CLUSTER 0:sMAPE: 0.0000, entries: 0
CLUSTER 1:sMAPE: 0.0000, entries: 0
CLUSTER 2:sMAPE: 77.1141, entries: 823
sMAPE weighted average in clusters: 77.1141, entries: 823
------------------------------------------------

LABEL: updrs_4
CLUSTER 0:sMAPE: 0.0000, entries: 0
CLUSTER 1:sMAPE: 0.0000, entries: 0
CLUSTER 2:sMAPE: 145.7780, entries: 448
sMAPE weighted average in clusters: 145.7780, entries: 448
------------------------------------------------

sMAPE weighted average of lab

In [33]:
# Cluster counts taken from the search above.
# NOTE(review): the search output reports 2 clusters as best for updrs_1, but 3 is passed here — confirm which is intended.
predict_clustered_labels_2(df1, df2, df3, df4, [3, 10, 9, 8])


PREDICTION OF CLUSTERED LABELS

LABEL: updrs_1
CLUSTER 0:sMAPE: 0.0000, entries: 0
CLUSTER 1:sMAPE: 8.4087, entries: 5
CLUSTER 2:sMAPE: 56.1417, entries: 844
sMAPE weighted average in clusters: 55.8606, entries: 849
------------------------------------------------

LABEL: updrs_2
CLUSTER 0:sMAPE: 0.0000, entries: 0
CLUSTER 1:sMAPE: 0.0000, entries: 0
CLUSTER 2:sMAPE: 70.9179, entries: 13
CLUSTER 3:sMAPE: 101.7297, entries: 11
CLUSTER 4:sMAPE: 0.0000, entries: 0
CLUSTER 5:sMAPE: 0.0000, entries: 0
CLUSTER 6:sMAPE: 0.0000, entries: 0
CLUSTER 7:sMAPE: 0.0000, entries: 0
CLUSTER 8:sMAPE: 0.0000, entries: 0
CLUSTER 9:sMAPE: 93.6238, entries: 775
sMAPE weighted average in clusters: 93.3659, entries: 799
------------------------------------------------

LABEL: updrs_3
CLUSTER 0:sMAPE: 0.0000, entries: 0
CLUSTER 1:sMAPE: 0.0000, entries: 0
CLUSTER 2:sMAPE: 0.0000, entries: 0
CLUSTER 3:sMAPE: 0.0000, entries: 0
CLUSTER 4:sMAPE: 77.6884, entries: 773
CLUSTER 5:sMAPE: 0.0000, entries: 0
CLUSTER 

# Conclusion:
### * dimensionality reduction using model LocallyLinearEmbedding gave better results before clusterization than in project 1
### * improvement of sMAPE due to clusterization was smaller than in project 1, however final result was a little bit better
### * predicting cluster membership for the validation data (instead of re-calculating the clusters on the merged features) introduced a prediction error that affected the final sMAPE; it removed the positive impact of clusterization on the final score.