In [1]:
import mat73
import pandas as pd
import numpy as np
import scipy.io
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

### Data loading and preprocessing

In [3]:
import os

DATA_DIR = ''
if 'google.colab' not in str(get_ipython()) and "anuja" in os.environ.get('USER'):
    DATA_DIR = 'data/'
    

In [4]:
foof = pd.read_csv(DATA_DIR+"foof2features.csv")
foof = foof.rename(columns={"C1": "IDs" ,"C2": "Intercept", "C3": "Slope"})
foof

Unnamed: 0,IDs,Intercept,Slope
0,NDARAA075AMK,0.986272,1.825774
1,NDARAA112DMH,1.486650,1.888544
2,NDARAA117NEJ,1.593155,2.095749
3,NDARAA947ZG5,0.703331,1.724831
4,NDARAA948VFH,0.918020,1.749441
...,...,...,...
2037,NDARZN277NR6,1.351549,1.996940
2038,NDARZN578YDP,1.380795,2.036327
2039,NDARZN610GTY,0.339229,1.050644
2040,NDARZN677EYE,0.781225,1.470061


In [5]:
data = mat73.loadmat(DATA_DIR+'x_source.mat')  
df2 = pd.DataFrame(data['x'].reshape((data['x'].shape[0], -1)))

# sparsing
df2 = np.array(df2).reshape(data['x'].shape) 
df2_sparsed = np.concatenate([np.expand_dims(df2[:,:,i:i+10].mean(axis = 2), axis = 2) for i in range(0, data['x'].shape[2]-10, 10)], axis = 2)
df2 = pd.DataFrame(df2_sparsed.reshape((df2_sparsed.shape[0], -1)))

#scaling
norm = MinMaxScaler().fit(df2)
df2 = norm.transform(df2)
df2 = pd.DataFrame(df2.reshape((df2.shape[0], -1)))

df2['IDs'] = foof['IDs']
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2643,2644,2645,2646,2647,2648,2649,2650,2651,IDs
0,3.497571e-01,4.585819e-01,2.938905e-01,3.770603e-01,0.327764,0.552059,0.462017,0.695207,0.642190,0.584085,...,0.337358,0.439658,0.437051,0.364673,0.405482,0.326733,0.428504,0.250803,0.179322,NDARAA075AMK
1,7.464767e-01,9.048637e-01,5.054438e-01,7.021925e-01,0.498947,0.342338,0.283478,0.221679,0.178758,0.178338,...,0.430404,0.296341,0.321952,0.131375,0.579138,0.478879,0.304577,0.360859,0.448476,NDARAA112DMH
2,2.204578e-01,3.562272e-01,3.895027e-01,4.181956e-01,0.495129,0.436858,0.392300,0.446757,0.528172,0.409405,...,0.404292,0.339923,0.505282,0.340129,0.203517,0.174711,0.341549,0.693619,0.545169,NDARAA117NEJ
3,6.346560e-02,1.204557e-01,1.650503e-01,7.631559e-01,0.523793,0.399243,0.267432,0.301397,0.283535,0.363612,...,0.108138,0.049740,0.033945,0.304668,0.111156,0.008791,0.005297,0.266759,0.339304,NDARAA947ZG5
4,2.758219e-01,2.255777e-01,4.828928e-01,4.704279e-01,0.330692,0.310594,0.284202,0.298111,0.245370,0.250187,...,0.164094,0.161737,0.123381,0.079693,0.069366,0.043857,0.029212,0.021894,0.013197,NDARAA948VFH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,3.394601e-01,4.781692e-01,5.423877e-01,4.137127e-01,0.379793,0.413474,0.538537,0.457002,0.286496,0.222562,...,0.268739,0.328251,0.327389,0.292491,0.367808,0.346005,0.348895,0.399141,0.367943,NDARZN148PMN
2037,1.072605e-01,2.045245e-01,2.610531e-01,3.236518e-01,0.437417,0.432447,0.412152,0.445771,0.374603,0.386189,...,0.451919,0.240030,0.125819,0.083812,0.289439,0.378008,0.264043,0.502426,0.008403,NDARZN277NR6
2038,2.801818e-01,3.838379e-01,3.575040e-01,3.295642e-01,0.339077,0.263007,0.204378,0.308062,0.413562,0.319891,...,0.386920,0.543971,0.590438,0.544979,0.679570,0.616411,0.589517,0.634712,0.549726,NDARZN578YDP
2039,3.260273e-15,3.373362e-14,2.304057e-13,1.498324e-12,0.000009,0.142998,0.689837,0.800874,0.499946,0.248208,...,0.213887,0.230102,0.189709,0.132429,0.124579,0.085125,0.061278,0.049636,0.032335,NDARZN610GTY


In [6]:
beh = pd.read_csv(DATA_DIR+"behaviorals.csv")
print('Before:'+str(beh.shape))

most_common_disorders = ['Attention-Deficit/Hyperactivity Disorder', 'Anxiety Disorders', 'Specific Learning Disorder',
                         'Autism Spectrum Disorder', 'Disruptive', 'Communication Disorder',
                         'Depressive Disorders',  'No Diagnosis Given', 'Other Disorders']

category_columns = ['DX_' + str(i).zfill(2) + '_Cat' for i in range(1, 11)] +\
                   ['DX_' + str(i).zfill(2) + '_Sub' for i in range(1, 11)]

# removing patients with incomplete eval
initial_size = beh.shape[0]
beh = beh[beh.DX_01 != 'No Diagnosis Given: Incomplete Eval']
beh = beh.reset_index(drop=True)
new_size = beh.shape[0]

print('After:'+str(beh.shape))
print('Removing', initial_size - new_size,
      'patients as their evaluations was incomplete.')

Before:(3076, 177)
After:(2939, 177)
Removing 137 patients as their evaluations was incomplete.


In [7]:
no_diagnosis_given = 'No Diagnosis Given'

diagnoses_to_ids = {disorder: i for i, disorder in enumerate(most_common_disorders)}
diagnoses_to_ids

{'Attention-Deficit/Hyperactivity Disorder': 0,
 'Anxiety Disorders': 1,
 'Specific Learning Disorder': 2,
 'Autism Spectrum Disorder': 3,
 'Disruptive': 4,
 'Communication Disorder': 5,
 'Depressive Disorders': 6,
 'No Diagnosis Given': 7,
 'Other Disorders': 8}

In [8]:
def get_disorder(data, row, index):
    disorder = data.iloc[row][category_columns[index]]

    if disorder == 'Neurodevelopmental Disorders':
        disorder = data.iloc[row][category_columns[index + 10]]

    return disorder

order_of_disorders = []
for k in range(beh.shape[0]):
    i = 0
    disorder = get_disorder(beh, k, i)
    disorders_patient = []
    while not pd.isnull(disorder):
        if disorder in diagnoses_to_ids:
            if diagnoses_to_ids[disorder] not in disorders_patient:
                disorders_patient.append(diagnoses_to_ids[disorder])
        else:
            if diagnoses_to_ids['Other Disorders'] not in disorders_patient:
                disorders_patient.append(diagnoses_to_ids['Other Disorders'])
        i += 1
        if i == len(category_columns):
            break
        disorder = get_disorder(beh, k, i)

        
    order_of_disorders.append(disorders_patient)

In [9]:
other_disorders = []
no_diagnosis_given = []
for i in order_of_disorders:
    if 7 in i:
        no_diagnosis_given.append(1)
        i.remove(7)
    else:
        no_diagnosis_given.append(0)
    if 8 in i:
        other_disorders.append(1)
        i.remove(8)
    else:
        other_disorders.append(0)

In [10]:
max_len_order = np.max([len(x) for x in order_of_disorders])

# pad with a new token denoting the pad token
pad_token = len(most_common_disorders)
bod_token = len(most_common_disorders) + 1
eod_token = len(most_common_disorders) + 2

order_of_disorders = [[bod_token] + x + [eod_token] + [pad_token] * (max_len_order - len(x)) for x in order_of_disorders]

order_of_disorders = np.array(order_of_disorders)

classes = np.zeros((len(most_common_disorders),
                    beh.shape[0]), dtype=np.int32)

df_disorders = beh[category_columns]

for i, disorder in enumerate(most_common_disorders):
    mask = df_disorders.select_dtypes(include=[object]). \
        applymap(lambda x: disorder in x if pd.notnull(x) else False)

    disorder_df = df_disorders[mask.any(axis=1)]

    np.add.at(classes[i], disorder_df.index.values, 1)

behaviour_data_columns = beh.columns.values.astype(np.str)

columns_to_drop = behaviour_data_columns[
    np.flatnonzero(np.core.defchararray.find(behaviour_data_columns, 'DX') != -1)]

behaviour_data = beh.drop(columns=columns_to_drop)

for disorder, classification in zip(most_common_disorders, classes):
    behaviour_data[disorder] = classification

behaviour_data['order_diagnoses'] = list(order_of_disorders)

In [11]:
common_disorders = ['Attention-Deficit/Hyperactivity Disorder', 'Anxiety Disorders', 'Specific Learning Disorder',
                         'Autism Spectrum Disorder', 'Disruptive', 'Communication Disorder',
                         'Depressive Disorders']

labels=behaviour_data[["IDs"]+list(common_disorders)]
labels["Other Disorders"] = other_disorders
labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels["Other Disorders"] = other_disorders


Unnamed: 0,IDs,Attention-Deficit/Hyperactivity Disorder,Anxiety Disorders,Specific Learning Disorder,Autism Spectrum Disorder,Disruptive,Communication Disorder,Depressive Disorders,Other Disorders
0,NDARAA075AMK,0,0,0,0,0,0,0,0
1,NDARAA112DMH,1,0,0,0,1,0,0,1
2,NDARAA117NEJ,1,0,0,0,1,0,0,1
3,NDARAA306NT2,1,1,1,0,0,1,0,1
4,NDARAA504CRN,1,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
2934,NDARZZ007YMP,0,0,0,1,0,0,0,1
2935,NDARZZ740MLM,1,0,0,0,0,0,0,0
2936,NDARZZ810LVF,0,0,0,1,0,1,0,1
2937,NDARZZ830JM7,0,0,0,1,0,0,0,1


In [12]:
df = pd.merge(df2, foof, on='IDs', how='inner')
df = pd.merge(df2, labels, on='IDs', how='inner')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2651,IDs,Attention-Deficit/Hyperactivity Disorder,Anxiety Disorders,Specific Learning Disorder,Autism Spectrum Disorder,Disruptive,Communication Disorder,Depressive Disorders,Other Disorders
0,3.497571e-01,4.585819e-01,2.938905e-01,3.770603e-01,0.327764,0.552059,0.462017,0.695207,0.642190,0.584085,...,0.179322,NDARAA075AMK,0,0,0,0,0,0,0,0
1,7.464767e-01,9.048637e-01,5.054438e-01,7.021925e-01,0.498947,0.342338,0.283478,0.221679,0.178758,0.178338,...,0.448476,NDARAA112DMH,1,0,0,0,1,0,0,1
2,2.204578e-01,3.562272e-01,3.895027e-01,4.181956e-01,0.495129,0.436858,0.392300,0.446757,0.528172,0.409405,...,0.545169,NDARAA117NEJ,1,0,0,0,1,0,0,1
3,6.346560e-02,1.204557e-01,1.650503e-01,7.631559e-01,0.523793,0.399243,0.267432,0.301397,0.283535,0.363612,...,0.339304,NDARAA947ZG5,1,0,1,1,0,0,0,1
4,2.758219e-01,2.255777e-01,4.828928e-01,4.704279e-01,0.330692,0.310594,0.284202,0.298111,0.245370,0.250187,...,0.013197,NDARAA948VFH,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1915,2.199443e-01,3.397615e-01,3.707636e-01,5.212714e-01,0.838115,0.603350,0.675639,0.628488,0.471081,0.321416,...,0.333760,NDARZM903TNL,0,0,0,0,0,0,0,0
1916,3.394601e-01,4.781692e-01,5.423877e-01,4.137127e-01,0.379793,0.413474,0.538537,0.457002,0.286496,0.222562,...,0.367943,NDARZN148PMN,0,1,0,0,0,0,0,0
1917,1.072605e-01,2.045245e-01,2.610531e-01,3.236518e-01,0.437417,0.432447,0.412152,0.445771,0.374603,0.386189,...,0.008403,NDARZN277NR6,1,1,0,0,0,0,1,0
1918,3.260273e-15,3.373362e-14,2.304057e-13,1.498324e-12,0.000009,0.142998,0.689837,0.800874,0.499946,0.248208,...,0.032335,NDARZN610GTY,0,0,0,0,0,0,0,1


### Data Split

In [13]:
disorders_list = ['Attention-Deficit/Hyperactivity Disorder', 'Anxiety Disorders', 'Specific Learning Disorder',
                         'Autism Spectrum Disorder', 'Disruptive', 'Communication Disorder',
                         'Depressive Disorders', 'Other Disorders']
x = df[df.columns.difference(['IDs']+disorders_list)]
y = df[disorders_list]

# summarize dataset shape
print(x.shape, y.shape)

(1920, 2652) (1920, 8)


## FEATURE EXTRACTION WITH PCA

In [14]:
# dimensionality reduction
from sklearn.decomposition import PCA

pca = PCA(.95) # 95% variance retained
pca.fit(x)

# transform data
x_pca = pca.transform(x)
x_pca.shape



x_pca = x

In [15]:
train_features, test_features, train_labels, test_labels = train_test_split(x, y, test_size=0.25, shuffle=True)

In [16]:
#scaling features

# data normalization with sklearn
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler().fit(x)

# transform training data
x_norm = norm.transform(x)
x_norm = x


In [17]:
train_labels.sum(axis=0)

Attention-Deficit/Hyperactivity Disorder    866
Anxiety Disorders                           478
Specific Learning Disorder                  326
Autism Spectrum Disorder                    224
Disruptive                                  227
Communication Disorder                      206
Depressive Disorders                        149
Other Disorders                             562
dtype: int64

In [18]:
test_labels.sum(axis=0)

Attention-Deficit/Hyperactivity Disorder    299
Anxiety Disorders                           161
Specific Learning Disorder                  113
Autism Spectrum Disorder                     67
Disruptive                                   70
Communication Disorder                       66
Depressive Disorders                         45
Other Disorders                             194
dtype: int64

## FEATURE EXTRACTION WITH AUTOENCODER

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.utils import plot_model, to_categorical

n_inputs = train_features.shape[1]

visible = Input(shape=(n_inputs,))
e = Dense(n_inputs*2)(visible)
e = BatchNormalization()(e)
e = LeakyReLU()(e)
e = Dense(n_inputs)(e)
e = BatchNormalization()(e)
e = LeakyReLU()(e)
n_bottleneck = round(float(n_inputs) / 2.0)
bottleneck = Dense(n_bottleneck)(e)

d = Dense(n_inputs)(bottleneck)
d = BatchNormalization()(d)
d = LeakyReLU()(d)

d = Dense(n_inputs*2)(d)
d = BatchNormalization()(d)
d = LeakyReLU()(d)

output = Dense(8, activation='linear')(d)

model = Model(inputs=visible, outputs=output)
model.compile(optimizer='adam', loss='mse')
plot_model(model, 'autoencoder_compress.png', show_shapes=True)

history = model.fit(train_features, train_labels, epochs=50, batch_size=16, verbose=2)
encoder = Model(inputs=visible, outputs=bottleneck)

plot_model(encoder, 'encoder_compress.png', show_shapes=True)
encoder.save('autoencoder.h5')

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.
Epoch 1/50


2021-11-25 22:52:42.514731: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10


90/90 - 2s - loss: 8.0477
Epoch 2/50
90/90 - 2s - loss: 0.2293
Epoch 3/50
90/90 - 2s - loss: 0.2331
Epoch 4/50
90/90 - 2s - loss: 0.2468
Epoch 5/50
90/90 - 2s - loss: 0.2478
Epoch 6/50
90/90 - 2s - loss: 0.2730
Epoch 7/50
90/90 - 2s - loss: 0.2857
Epoch 8/50
90/90 - 2s - loss: 0.2685
Epoch 9/50
90/90 - 2s - loss: 0.2675
Epoch 10/50
90/90 - 2s - loss: 0.2953
Epoch 11/50
90/90 - 2s - loss: 0.2988
Epoch 12/50
90/90 - 2s - loss: 0.2637
Epoch 13/50
90/90 - 2s - loss: 0.2797
Epoch 14/50
90/90 - 2s - loss: 0.2756
Epoch 15/50
90/90 - 2s - loss: 0.3044
Epoch 16/50
90/90 - 2s - loss: 0.3169
Epoch 17/50
90/90 - 2s - loss: 0.3140
Epoch 18/50
90/90 - 2s - loss: 0.3291
Epoch 19/50
90/90 - 2s - loss: 0.3381
Epoch 20/50
90/90 - 2s - loss: 0.2861
Epoch 21/50
90/90 - 2s - loss: 0.2923
Epoch 22/50
90/90 - 2s - loss: 0.3024
Epoch 23/50
90/90 - 2s - loss: 0.2685
Epoch 24/50
90/90 - 2s - loss: 0.2852
Epoch 25/50
90/90 - 2s - loss: 0.3235
Epoch 26/50
90/90 - 2s - loss: 0.3431
Epoch 27/50
90/90 - 2s - loss: 0

In [20]:
# encode the data
encoder= load_model('autoencoder.h5', compile=False)

train_features = encoder.predict(train_features)
test_features = encoder.predict(test_features)

### Metrics

In [19]:
from sklearn.metrics import hamming_loss, accuracy_score
import sklearn.metrics as skm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

def brier_multi(targets, probs):
    return np.mean(np.sum((probs - targets)**2, axis=1))

def evaluate(y_test, y_pred_prob, brier=True):
    y_pred = y_pred_prob.round()
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Hamming Loss:", hamming_loss(y_test, y_pred))
    if brier:
        print("Brier Score:", brier_multi(y_test, y_pred_prob))
    print("Classification Report:\n", skm.classification_report(y_test,y_pred, zero_division=1))
    print("Confusion matrix:\n", skm.multilabel_confusion_matrix(y_test, y_pred))

## Models

### Multi Output Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC as svm
from sklearn.multioutput import MultiOutputClassifier

forest = RandomForestClassifier(random_state=1)
lg = LogisticRegression()
svm = svm()
models = [lg, forest, svm]

for model in models:

    multi_output_model = MultiOutputClassifier(model, n_jobs=-1)
    multi_output_model.fit(train_features, train_labels)
    predicted_labels = multi_output_model.predict(test_features)
    print(str(model)+':')
    evaluate(test_labels, predicted_labels, brier=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression():
Accuracy: 0.052083333333333336
Hamming Loss: 0.3026041666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.64      0.63       299
           1       0.33      0.27      0.30       161
           2       0.23      0.16      0.19       113
           3       0.07      0.03      0.04        67
           4       0.20      0.13      0.16        70
           5       0.04      0.02      0.02        66
           6       0.14      0.04      0.07        45
           7       0.38      0.34      0.36       194

   micro avg       0.41      0.33      0.36      1015
   macro avg       0.25      0.20      0.22      1015
weighted avg       0.36      0.33      0.34      1015
 samples avg       0.51      0.42      0.32      1015

Confusion matrix:
 [[[ 64 117]
  [109 190]]

 [[229  90]
  [117  44]]

 [[305  62]
  [ 95  18]]

 [[385  28]
  [ 65   2]]

 [[375  35]
  [ 61   9]]

 [[387  27]
  [ 65   1]]

 [[423  12

### MLP

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def get_mlp(n_inputs, n_outputs):
    model = Sequential()
    model.add(Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

n_inputs, n_outputs = train_features.shape[1],train_labels.shape[1]
mlp = get_mlp(n_inputs, n_outputs)
mlp.fit(train_features, train_labels, verbose=1, epochs=100)

2021-11-25 23:12:39.157673: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2021-11-25 23:12:39.164137: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-25 23:12:39.164430: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:00:06.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-11-25 23:12:39.164568: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2021-11-25 23:12:39.166014: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2021-11-25 23:12:39.167650: I tensorflow/stream_executor/platform/default/d

Epoch 1/100
 1/45 [..............................] - ETA: 0s - loss: 0.6745

2021-11-25 23:12:39.987598: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<tensorflow.python.keras.callbacks.History at 0x7f04e6b747f0>

In [22]:
predicted_labels_mlp = mlp.predict(test_features)
evaluate(test_labels, predicted_labels_mlp)

Accuracy: 0.10416666666666667
Hamming Loss: 0.2677083333333333
Brier Score: 1.5939197152667515
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.76      0.68       299
           1       0.40      0.20      0.27       161
           2       0.15      0.04      0.06       113
           3       0.10      0.01      0.03        67
           4       0.15      0.04      0.07        70
           5       0.00      0.00      0.00        66
           6       0.00      0.00      0.00        45
           7       0.38      0.22      0.28       194

   micro avg       0.49      0.31      0.38      1015
   macro avg       0.22      0.16      0.17      1015
weighted avg       0.35      0.31      0.31      1015
 samples avg       0.59      0.40      0.35      1015

Confusion matrix:
 [[[ 41 140]
  [ 73 226]]

 [[270  49]
  [128  33]]

 [[344  23]
  [109   4]]

 [[404   9]
  [ 66   1]]

 [[393  17]
  [ 67   3]]

 [[407   7]
  [ 66   0]]

 

### Binary Relevance
ignores the possible correlations between class labels

In [23]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

classifier = BinaryRelevance(GaussianNB())
classifier.fit(test_features, test_labels)

BinaryRelevance(classifier=GaussianNB(), require_dense=[True, True])

In [24]:
predicted_labels_br = classifier.predict_proba(test_features)
evaluate(test_labels, predicted_labels_br.todense())

Accuracy: 0.19375
Hamming Loss: 0.17890625
Brier Score: 1.3800143446430682
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.77      0.81       299
           1       0.54      0.67      0.60       161
           2       0.59      0.88      0.71       113
           3       0.59      0.73      0.65        67
           4       0.35      0.81      0.49        70
           5       0.55      0.91      0.68        66
           6       0.52      0.84      0.64        45
           7       0.77      0.79      0.78       194

   micro avg       0.63      0.78      0.70      1015
   macro avg       0.60      0.80      0.67      1015
weighted avg       0.67      0.78      0.71      1015
 samples avg       0.63      0.82      0.65      1015

Confusion matrix:
 [[[143  38]
  [ 68 231]]

 [[228  91]
  [ 53 108]]

 [[299  68]
  [ 14  99]]

 [[379  34]
  [ 18  49]]

 [[305 105]
  [ 13  57]]

 [[364  50]
  [  6  60]]

 [[400  35]
  [  7  3

### Classfier Chains

In [25]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

classifier = ClassifierChain(LogisticRegression())
classifier.fit(train_features, train_labels)
# we should optimise this a little

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

ClassifierChain(classifier=LogisticRegression(), require_dense=[True, True])

In [26]:
predicted_labels_cc = classifier.predict_proba(test_features)
evaluate(test_labels, predicted_labels_cc.todense())

Accuracy: 0.06041666666666667
Hamming Loss: 0.30390625
Brier Score: 1.8725155156759457
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.64      0.63       299
           1       0.31      0.26      0.28       161
           2       0.21      0.15      0.18       113
           3       0.10      0.04      0.06        67
           4       0.16      0.11      0.13        70
           5       0.04      0.02      0.02        66
           6       0.19      0.07      0.10        45
           7       0.38      0.29      0.33       194

   micro avg       0.40      0.32      0.35      1015
   macro avg       0.25      0.20      0.22      1015
weighted avg       0.36      0.32      0.33      1015
 samples avg       0.52      0.41      0.31      1015

Confusion matrix:
 [[[ 64 117]
  [109 190]]

 [[225  94]
  [119  42]]

 [[304  63]
  [ 96  17]]

 [[385  28]
  [ 64   3]]

 [[369  41]
  [ 62   8]]

 [[389  25]
  [ 65   1]]

 [[422  1

### Label Powerset
takes correlations into account!

In [27]:
from skmultilearn.problem_transform import LabelPowerset

classifier = LabelPowerset(LogisticRegression())
classifier.fit(train_features, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LabelPowerset(classifier=LogisticRegression(), require_dense=[True, True])

In [28]:
predicted_labels_lp = classifier.predict_proba(test_features)
evaluate(test_labels, predicted_labels_lp.todense())

Accuracy: 0.11458333333333333
Hamming Loss: 0.2526041666666667
Brier Score: 1.4573849779656916
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.74      0.68       299
           1       0.28      0.08      0.12       161
           2       0.36      0.04      0.06       113
           3       0.20      0.01      0.03        67
           4       0.25      0.03      0.05        70
           5       1.00      0.00      0.00        66
           6       1.00      0.00      0.00        45
           7       0.42      0.19      0.26       194

   micro avg       0.54      0.27      0.36      1015
   macro avg       0.52      0.14      0.15      1015
weighted avg       0.49      0.27      0.28      1015
 samples avg       0.67      0.36      0.34      1015

Confusion matrix:
 [[[ 51 130]
  [ 79 220]]

 [[285  34]
  [148  13]]

 [[360   7]
  [109   4]]

 [[409   4]
  [ 66   1]]

 [[404   6]
  [ 68   2]]

 [[414   0]
  [ 66   0]]

 

### Multi Label KNN

In [29]:
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

mlknn = MLkNN(k=10)

x_train = lil_matrix(train_features).toarray()
y_train = lil_matrix(train_labels).toarray()
x_test = lil_matrix(test_features).toarray()

mlknn.fit(x_train, y_train)



MLkNN()

In [30]:
predicted_labels_mlknn = mlknn.predict_proba(x_test)
evaluate(test_labels, predicted_labels_mlknn.todense())

Accuracy: 0.08333333333333333
Hamming Loss: 0.2643229166666667
Brier Score: 1.8596905609591032
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00       299
           1       1.00      0.00      0.00       161
           2       1.00      0.00      0.00       113
           3       1.00      0.00      0.00        67
           4       1.00      0.00      0.00        70
           5       1.00      0.00      0.00        66
           6       1.00      0.00      0.00        45
           7       1.00      0.00      0.00       194

   micro avg       1.00      0.00      0.00      1015
   macro avg       1.00      0.00      0.00      1015
weighted avg       1.00      0.00      0.00      1015
 samples avg       1.00      0.08      0.08      1015

Confusion matrix:
 [[[181   0]
  [299   0]]

 [[319   0]
  [161   0]]

 [[367   0]
  [113   0]]

 [[413   0]
  [ 67   0]]

 [[410   0]
  [ 70   0]]

 [[414   0]
  [ 66   0]]

 