# Getting started with the [Psychiatric Symptom Severity](https://health.aiaudit.org/web/challenges/challenge-page/338/overview) challenge!

This notebook shows how to get started with the multi-target regression challenge and train a simple model, using the features extracted at either the sensor level (105 electrodes) or the source level (68 parcels). You choose which of the two feature sets to use with the `data_level` switch below.

In [1]:
import os
import pandas as pd
import numpy as np

## Data (down)loading and preprocessing

Download the needed data from here: https://osf.io/2vw6j/. Detailed information about the data is available in the README folder.

**Load the data**

In [2]:
# MODIFY PATHS HERE ACCORDING TO YOUR ENVIRONMENT
# Both folders are looked up relative to the notebook's working directory.
PATH_TO_LABELS = "../Labels"      # folder holding train_labels_regression.csv
PATH_TO_FEATURES = "../Features"  # folder holding the 'Sensor level' / 'Source level' feature CSVs

In [3]:
# CHANGE THIS TO USE SENSOR OR SOURCE LEVEL DATA FOR TRAINING THE MODEL
# Supported values:
#   "sensor" -> features extracted on sensor level
#   "source" -> features extracted on source level
data_level = "sensor"

In [4]:
# Train/test feature files (relative to PATH_TO_FEATURES) for each supported data level
feature_files = {
    "sensor": ('Sensor level/train_features_sensor.csv',
               'Sensor level/test_features_sensor_regression.csv'),
    "source": ('Source level/train_features_source.csv',
               'Source level/test_features_source_regression.csv'),
}

# Fail early with a readable message: without this check an unsupported value
# would load nothing and only surface further down as a NameError (or, when the
# cell is re-run, silently reuse the frames left over from an earlier run).
if data_level not in feature_files:
    raise ValueError(
        "data_level must be 'sensor' or 'source', got {!r}".format(data_level)
    )

train_file, test_file = feature_files[data_level]
train_features = pd.read_csv(os.path.join(PATH_TO_FEATURES, train_file))
test_features = pd.read_csv(os.path.join(PATH_TO_FEATURES, test_file))

train_labels = pd.read_csv(os.path.join(PATH_TO_LABELS, 'train_labels_regression.csv'))

# make sure the features and labels are arranged correctly based on the IDs
train_features = train_features.sort_values('IDs').reset_index(drop=True)
train_labels = train_labels.sort_values('IDs').reset_index(drop=True)

# features and labels are later joined row by row, so row i of both tables
# must describe the same subject
assert len(train_features) == len(train_labels), \
    "train features and train labels have a different number of rows"
assert (train_features['IDs'] == train_labels['IDs']).all(), \
    "train features and train labels do not contain the same IDs"

In [5]:
# preview the ID-sorted training features (pandas abbreviates the middle rows/columns of large frames)
train_features

Unnamed: 0,IDs,Sex,Age,Electrode1_1_0_Hz,Electrode1_1_5_Hz,Electrode1_2_0_Hz,Electrode1_2_5_Hz,Electrode1_3_0_Hz,Electrode1_3_5_Hz,Electrode1_4_0_Hz,...,Electrode101_Intercept,Electrode101_Slope,Electrode102_Intercept,Electrode102_Slope,Electrode103_Intercept,Electrode103_Slope,Electrode104_Intercept,Electrode104_Slope,Electrode105_Intercept,Electrode105_Slope
0,1,0,13.460871,7.749357e-14,1.668887e-12,3.069289e-11,5.040632e-10,8.788607e-09,1.974149e-07,0.000005,...,1.051370,1.559212,0.947819,1.544160,1.139210,1.695536,1.277533,1.780115,0.934379,1.664599
1,2,0,11.809947,5.597417e-03,1.260221e-01,5.197147e-02,4.075601e-04,5.760053e-05,2.014237e-04,0.000646,...,1.209245,1.640646,1.025607,1.615676,1.107237,1.648802,1.334797,1.793597,1.133355,1.709400
2,4,1,11.916381,1.243993e-01,1.448890e-01,1.647281e-01,1.828163e-01,1.980511e-01,2.094376e-01,0.216196,...,0.495141,1.127287,0.465041,1.131478,0.702211,1.219153,0.455600,1.266118,0.090001,0.871724
3,5,0,11.180355,1.350305e-04,2.961527e-04,6.245381e-04,1.266374e-03,2.469020e-03,4.628570e-03,0.008343,...,1.252630,1.562021,1.009493,1.495463,0.949125,1.541345,1.001913,1.623235,0.999692,1.757107
4,7,0,7.992927,8.047736e-02,1.374599e-01,2.109479e-01,2.908519e-01,3.603012e-01,4.010115e-01,0.401001,...,0.796033,1.368490,0.787145,1.336666,0.839337,1.409471,1.157136,1.504866,1.140402,1.813248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,2035,0,7.401551,2.458093e-04,7.834133e-04,2.259016e-03,5.893637e-03,1.391180e-02,2.971124e-02,0.057412,...,1.653040,2.033321,1.569902,1.907417,1.332606,1.800836,1.369386,1.862163,1.554822,2.176153
1436,2038,0,9.134610,3.925983e-04,2.202295e-03,9.428120e-03,3.080928e-02,7.689150e-02,1.467948e-01,0.215469,...,1.465955,1.759502,1.302440,1.802886,1.454564,1.908255,1.634953,1.959518,1.569862,2.104574
1437,2039,0,13.438512,1.326211e-03,2.024567e-03,3.042056e-03,4.499016e-03,6.549127e-03,9.383496e-03,0.013233,...,1.195508,1.812074,1.036099,1.737918,1.019589,1.705953,1.201961,1.791054,1.269082,1.987588
1438,2041,1,19.062057,4.803868e-11,3.315671e-10,2.180006e-09,1.335163e-08,7.536768e-08,3.901754e-07,0.000002,...,0.926275,1.556731,0.841937,1.439254,0.865929,1.421369,0.913359,1.522121,0.636227,1.503443


In [6]:
# preview the ID-sorted training labels; missing targets show up as NaN
train_labels

Unnamed: 0,IDs,SRS_SCI_T,SRS_RRB_T,SWAN_IN_Avg,SWAN_HY_Avg,SCARED_P_GD,WISC_WMI_Sum,WISC_VCI_Sum
0,1,39.0,43.0,-1.555555,-2.333333,0.0,14.0,18.0
1,2,,,,,,,
2,4,60.0,60.0,2.000000,0.000000,5.0,8.0,13.0
3,5,55.0,66.0,0.333333,-0.444444,6.0,17.0,27.0
4,7,58.0,48.0,1.222222,1.222222,0.0,17.0,7.0
...,...,...,...,...,...,...,...,...
1435,2035,87.0,80.0,2.444444,1.111111,,18.0,20.0
1436,2038,,,,,,,
1437,2039,,,,,,,
1438,2041,,,-1.111111,-2.777777,3.0,,


**As you can see, there are several NaN values, so we need to handle them. One simple way is to drop every entry (row) that contains a missing value, which is what we do below.**

In [7]:
# remember which columns belong to the labels table (this includes 'IDs')
labels_list = train_labels.columns

# dealing with NaNs: put features and labels side by side (rows are matched on
# the index) and keep only the rows in which no value is missing
train_set = pd.concat([train_features, train_labels], axis=1).dropna()

# features: everything that is not a labels-table column
# ('IDs' exists in both tables, so both copies are removed here)
train_features = train_set.drop(columns=labels_list).copy()

# labels: the labels-table columns without the identifier
train_labels = train_set[labels_list].drop(columns=['IDs'])

In [8]:
# converting to numpy for ease of usage
train_features, test_features, train_labels = (
    np.array(table) for table in (train_features, test_features, train_labels)
)

# storing the age and gender separately
# train: the first two columns are the demographics (Sex, Age), everything
# after them is the actual feature matrix
train_age_gender, train_features = np.hsplit(train_features, [2])

# test: one extra leading column is skipped first (presumably the IDs, which
# are never dropped from the test table in this notebook; confirm against the
# CSV), then the same demographics/features split is applied
test_age_gender, test_features = np.hsplit(test_features, [1, 3])[1:]

**Scaling features with min-max normalisation**

In [9]:
# data normalization with sklearn
from sklearn.preprocessing import MinMaxScaler

# learn the scaling from the training data only and rescale that data in the same step
norm = MinMaxScaler()
train_features = norm.fit_transform(train_features)

# the test data is rescaled with the scaler fitted on the training data (no re-fitting)
test_features = norm.transform(test_features)

print(train_features.shape, test_features.shape)

(795, 8505) (275, 8505)


**Dimensionality reduction using PCA**

In [10]:
# dimensionality reduction
from sklearn.decomposition import PCA

# keep as many principal components as are needed to retain 95% of the
# variance of the (scaled) training features
pca = PCA(n_components=0.95).fit(train_features)

# project both sets onto the components learned from the training data
train_features, test_features = (
    pca.transform(train_features),
    pca.transform(test_features),
)
print(train_features.shape, test_features.shape)

(795, 198) (275, 198)


In [11]:
# combine age and gender in the features set: the two demographic columns are
# appended to the right of the PCA components.
# NOTE: this cell overwrites its own inputs, so run it only once per kernel
# session; running it again would append the demographic columns a second time.
train_features = np.hstack([train_features, train_age_gender])
test_features = np.hstack([test_features, test_age_gender])

## Training the model

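Training a simple Random Forest regressor on the training set. Because there are several targets, the forest is wrapped in a `MultiOutputRegressor`, which fits one independent Random Forest per target.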

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# base learner; the fixed random_state keeps the trained forests reproducible
base_forest = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=7)

# the wrapper fits one copy of the base learner per target column
regressor = MultiOutputRegressor(base_forest)
regressor.fit(train_features, train_labels)

MultiOutputRegressor(estimator=RandomForestRegressor(max_depth=30,
                                                     random_state=7))

## Predicting labels

In [13]:
# run the fitted model on the test features
predicted_labels = regressor.predict(test_features)

# check if the labels shape is correct and submission ready!
expected_shape = (275, 7)  # (test samples, predicted targets)
assert predicted_labels.shape == expected_shape

In [14]:
# inspect the predictions: one row per test sample, one column per target
predicted_labels

array([[56.74      , 53.76      ,  0.55666642, ...,  3.9       ,
        19.29      , 20.94      ],
       [56.01      , 52.41      ,  0.89888858, ...,  3.91      ,
        18.67      , 21.22      ],
       [61.04      , 58.72      ,  0.76555528, ...,  5.68      ,
        19.7       , 21.56      ],
       ...,
       [59.51      , 57.5       ,  0.8266664 , ...,  5.17      ,
        17.51      , 19.84      ],
       [57.78      , 55.74      ,  0.83888859, ...,  5.43      ,
        19.11      , 19.89      ],
       [61.08      , 58.49      ,  0.81888862, ...,  4.26      ,
        19.38      , 21.44      ]])

**Save the labels and submit them on the platform for evaluation :D**

In [15]:
# write the prediction matrix as comma-separated text into the current working directory
np.savetxt(
    fname='predicted_labels_regression.csv',
    X=predicted_labels,
    delimiter=',',
)