In [1]:
import scipy.io
import mat73
import pandas as pd
import numpy as np
from numpy import array
from numpy.random import uniform
from numpy import hstack
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.svm import SVC as svm
from sklearn.linear_model import LogisticRegression as lg
from sklearn.metrics import confusion_matrix,accuracy_score,balanced_accuracy_score,f1_score
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.exceptions import ConvergenceWarning
from warnings import simplefilter,filterwarnings

# ignore all future warnings1
simplefilter(action='ignore', category=FutureWarning)



In [2]:
import os

DATA_DIR = ''
if 'google.colab' not in str(get_ipython()):
    if "anuja" in os.environ.get('USER'):
        DATA_DIR = 'data/'
    elif 'ubuntu' in os.environ.get('USER'):
        DATA_DIR = '/home/ubuntu/Martyna/repo/AI4Health/DATAfoof/datafoofed'
    

## Data Loading and PCA

In [3]:
path = os.path.join(DATA_DIR, 'split_data_csv')

#loading
xtrain = pd.read_csv(os.path.join(path, 'train_features_source.csv')).drop('IDs', axis = 1)
xtest = pd.read_csv(os.path.join(path, 'test_features_source_regression.csv')).drop('IDs', axis = 1)
ytrain = pd.read_csv(os.path.join(path, 'train_labels_regression.csv')).drop('IDs', axis = 1)
ytest = pd.read_csv(os.path.join(path, 'test_labels_regression.csv')).drop('IDs', axis = 1)


In [4]:
# dealing with NaNs
train_set = pd.concat([xtrain, ytrain], axis=1)
train_set = train_set.dropna()

labels_list = ['SRS_SCI_T', 'SRS_RRB_T', 'SWAN_IN_Avg', 'SWAN_HY_Avg', 'SCARED_P_GD','WISC_WMI_Sum', 'WISC_VCI_Sum']

xtrain = pd.DataFrame.copy(train_set)
xtrain = xtrain.drop(labels_list, axis=1)

ytrain = train_set[labels_list]


In [5]:
xtrain = np.array(xtrain)
xtest = np.array(xtest)
ytrain = np.array(ytrain) 
ytest = np.array(ytest) 

In [6]:
age_gender = True
if age_gender:
    train_age_gender = xtrain[:,-2:]
    test_age_gender = xtest[:,-2:]
    xtrain = xtrain[:,:-2]
    xtest = xtest[:,:-2]

In [7]:
# scaling x
norm = preprocessing.MinMaxScaler().fit(xtrain)

# transform training data
xtrain = norm.transform(xtrain)
xtest = norm.transform(xtest)
print(xtrain.shape, xtest.shape)

print('Applying PCA...')
pca = PCA(.95) # 95% variance retained
pca.fit(xtrain)

# transform data
xtrain = pca.transform(xtrain)
xtest = pca.transform(xtest)
print(xtrain.shape, xtest.shape)

(821, 5508) (249, 5508)
Applying PCA...
(821, 365) (249, 365)


In [8]:
if age_gender:
    xtrain = np.concatenate([xtrain, train_age_gender], axis = 1)
    xtest = np.concatenate([xtest, test_age_gender], axis = 1)

In [9]:
# scaling y
min_max_scaler = preprocessing.MinMaxScaler().fit(ytrain)
ytrain = min_max_scaler.transform(ytrain)
ytest = min_max_scaler.transform(ytest)
print(ytrain.shape, ytest.shape)

(821, 7) (249, 7)


In [10]:
ytrain.mean(axis=0)

array([0.41207651, 0.33093042, 0.61970492, 0.54637522, 0.25707132,
       0.4584729 , 0.49502042])

## MODELLING PART

### dummy regressors (to obtain the random baseline):

In [11]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
from sklearn.dummy import DummyRegressor


lm_dummy_mean = DummyRegressor(strategy = 'mean').fit(xtrain, ytrain)
lm_dummy_median = DummyRegressor(strategy = 'median').fit(xtrain, ytrain)
ypred_dummy_mean = lm_dummy_mean.predict(xtest)
ypred_dummy_median = lm_dummy_median.predict(xtest)


print("Mean squared error (dummy): {:.2f}".format(mean_squared_error(ytest,
																	ypred_dummy_mean)))

print("Median absolute error (dummy): {:.2f}".format(median_absolute_error(ytest,
																	ypred_dummy_median)))

print("r2_score (dummy mean): {:.2f}".format(r2_score(ytest, ypred_dummy_mean)))
print("r2_score (dummy median): {:.2f}".format(r2_score(ytest, ypred_dummy_median)))


Mean squared error (dummy): 0.04
Median absolute error (dummy): 0.14
r2_score (dummy mean): -0.00
r2_score (dummy median): -0.02


In [12]:
print(r2_score(ytest, ypred_dummy_mean, multioutput='raw_values'))
print(r2_score(ytest, ypred_dummy_median, multioutput='raw_values'))


[-3.10213413e-05 -6.48839750e-04 -6.85648162e-04 -7.68523259e-03
 -6.64823365e-04 -4.30433157e-03 -1.92504557e-04]
[-0.00958698 -0.05027377 -0.00584059 -0.00165957 -0.03306712 -0.01620028
 -0.00175876]


In [13]:
# save predictions to csv file
ypred = np.array(ypred_dummy_mean)
np.savetxt('predicted_labels_reg_dummy_mean.csv', min_max_scaler.inverse_transform(ypred), delimiter=',')

## MOR regressor with base SVR regressor

In [14]:
n = 100

In [15]:
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.utils import resample

# Create the SVR regressor
svr = SVR(epsilon=0.2)
#Create the Multioutput Regressor
model = MultiOutputRegressor(svr)
# Train the regressor
#model = model.fit(xtrain, ytrain)
# Generate predictions for testing data
#ypred = model.predict(xtest)


mse = []
r = []
mae = []
rm = []
for i in range(n):
# Generate predictions for testing data
    # Resample
	resampled_xtest, resampled_ytest = resample(xtest, ytest, replace=True, n_samples=len(ytest), random_state=7+i)
	model = model.fit(xtrain, ytrain)
	ypred = model.predict(resampled_xtest)
	mse.append(mean_squared_error(resampled_ytest,ypred))
	r.append(model.score(resampled_xtest, resampled_ytest))
	mae.append(mean_absolute_error(resampled_ytest,ypred))
	rm.append(r2_score(resampled_ytest, ypred, multioutput='raw_values'))

print("Mean squared error (SVR): {:.2f} [{:.2f}, {:.2f}]".format(np.mean(mse), np.percentile(mse, 5), np.percentile(mse, 95)))
print("Mean abs error (SVR) {:.2f} [{:.2f}, {:.2f}]".format(np.mean(mae), np.percentile(mae, 5), np.percentile(mae, 95)))
print("R2 score (SVR): {:.2f} [{:.2f}, {:.2f}]".format(np.mean(r), np.percentile(r, 5), np.percentile(r, 95)))




Mean squared error (SVR): 0.05 [0.05, 0.05]
Mean abs error (SVR) 0.18 [0.17, 0.18]
R2 score (SVR): -0.10 [-0.12, -0.07]


In [16]:
rma = np.array(rm)
r=rma[:,0]


for i in range(7):
    r = rma[:,i]
    print("R2 score {:.3f} [{:.3f}, {:.3f}]".format(np.mean(r), np.percentile(r, 5), np.percentile(r, 95)))


R2 score -0.060 [-0.098, -0.017]
R2 score -0.107 [-0.170, -0.057]
R2 score -0.059 [-0.105, 0.002]
R2 score -0.101 [-0.209, -0.020]
R2 score -0.128 [-0.231, -0.065]
R2 score -0.089 [-0.155, -0.012]
R2 score -0.130 [-0.197, -0.068]


In [17]:
print(model.score(resampled_xtest, resampled_ytest))
print(r2_score(resampled_ytest, ypred, multioutput='raw_values'))

#np.mean(r2_score(resampled_ytest, ypred, multioutput='raw_values'))

-0.12360692923868911
[-0.10198171 -0.15346056 -0.07655882 -0.11168344 -0.23702304 -0.0791811
 -0.10535984]


In [18]:
# save predictions to csv file
ypred = np.array(ypred)

np.savetxt('predicted_labels_reg_MOR_SVR.csv', min_max_scaler.inverse_transform(ypred), delimiter=',')

In [19]:
%%capture

import sys, os

predicted_labels_shuffled = np.copy(ytest)
np.random.shuffle(predicted_labels_shuffled)

print("r2_score (labels_shuffled): {:.2f}".format(r2_score(ytest, predicted_labels_shuffled)))


r = []
for i in range(n):
# Generate predictions for testing data
    # Resample
	resampled_xtest, resampled_ytest = resample(xtest, ytest, replace=True, n_samples=len(ytest), random_state=7+i)
	ypred = model.predict(resampled_xtest)
	mse.append(mean_squared_error(resampled_ytest,predicted_labels_shuffled))
	r.append(model.score(resampled_xtest, resampled_ytest))
	mae.append(mean_absolute_error(resampled_ytest,predicted_labels_shuffled))


print("R2 score (labels_shuffled): {:.2f} [{:.2f}, {:.2f}]".format(np.mean(r), np.percentile(r, 5), np.percentile(r, 95)))
print(r2_score(resampled_ytest, ypred, multioutput='raw_values'))

In [20]:
# save predictions to csv file
ypred = np.array(ypred)
np.savetxt('predicted_labels_reg_shuffled.csv', min_max_scaler.inverse_transform(ypred), delimiter=',')

## MOR RandomForestRegressor

In [21]:
max_depth = 30
model = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=7)
)

mse = []
r = []
mae = []
for i in range(n):
# Generate predictions for testing data
    # Resample
	resampled_xtest, resampled_ytest = resample(xtest, ytest, replace=True, n_samples=len(ytest), random_state=7+i)
	model = model.fit(xtrain, ytrain)
	ypred = model.predict(resampled_xtest)
	mse.append(mean_squared_error(resampled_ytest,ypred))
	r.append(model.score(resampled_xtest, resampled_ytest))
	mae.append(mean_absolute_error(resampled_ytest,ypred))
	rm.append(r2_score(resampled_ytest, ypred, multioutput='raw_values'))

print("Mean squared error (RanFor): {:.2f} [{:.2f}, {:.2f}]".format(np.mean(mse), np.percentile(mse, 5), np.percentile(mse, 95)))
print("Mean abs error (RanFor) {:.2f} [{:.2f}, {:.2f}]".format(np.mean(mae), np.percentile(mae, 5), np.percentile(mae, 95)))
print("R2 score (RanFor): {:.2f} [{:.2f}, {:.2f}]".format(np.mean(r), np.percentile(r, 5), np.percentile(r, 95)))

Mean squared error (RanFor): 0.05 [0.04, 0.05]
Mean abs error (RanFor) 0.17 [0.16, 0.18]
R2 score (RanFor): -0.04 [-0.05, -0.03]


In [22]:
rma = np.array(rm)
r=rma[:,0]


for i in range(7):
    r = rma[:,i]
    print("R2 score {:.3f} [{:.3f}, {:.3f}]".format(np.mean(r), np.percentile(r, 5), np.percentile(r, 95)))

R2 score -0.047 [-0.094, -0.017]
R2 score -0.083 [-0.155, -0.030]
R2 score -0.032 [-0.099, 0.021]
R2 score -0.067 [-0.168, -0.011]
R2 score -0.106 [-0.225, -0.056]
R2 score -0.051 [-0.142, 0.006]
R2 score -0.082 [-0.187, -0.012]


In [23]:
print(r2_score(resampled_ytest, ypred, multioutput='raw_values'))


[-0.03082105 -0.06677911 -0.00432298 -0.03796293 -0.06280773  0.00578996
 -0.03430864]


In [24]:
# save predictions to csv file
ypred = np.array(ypred)
np.savetxt('predicted_labels_reg_MOR_randfor.csv', min_max_scaler.inverse_transform(ypred), delimiter=',')

## CHAINS

In [25]:
# chained multioutput regressor
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.multioutput import RegressorChain
from sklearn.ensemble import RandomForestRegressor
# define dataset
# define base model
model = RandomForestRegressor()
# define the chained multioutput wrapper model
model = RegressorChain(model)
# Train the regressor

mse = []
r = []
mae = []
for i in range(n):
# Generate predictions for testing data
    # Resample
	resampled_xtest, resampled_ytest = resample(xtest, ytest, replace=True, n_samples=len(ytest), random_state=7+i)
	model = model.fit(xtrain, ytrain)
	ypred = model.predict(resampled_xtest)
	mse.append(mean_squared_error(resampled_ytest,ypred))
	r.append(model.score(resampled_xtest, resampled_ytest))
	mae.append(mean_absolute_error(resampled_ytest,ypred))
	rm.append(r2_score(resampled_ytest, ypred, multioutput='raw_values'))
	
print("Mean squared error (RanFor): {:.2f} [{:.2f}, {:.2f}]".format(np.mean(mse), np.percentile(mse, 5), np.percentile(mse, 95)))
print("Mean abs error (RanFor) {:.2f} [{:.2f}, {:.2f}]".format(np.mean(mae), np.percentile(mae, 5), np.percentile(mae, 95)))
print("R2 score (RanFor): {:.2f} [{:.2f}, {:.2f}]".format(np.mean(r), np.percentile(r, 5), np.percentile(r, 95)))

Mean squared error (RanFor): 0.05 [0.04, 0.05]
Mean abs error (RanFor) 0.17 [0.16, 0.18]
R2 score (RanFor): -0.05 [-0.07, -0.03]


In [26]:
rma = np.array(rm)
r=rma[:,0]


for i in range(7):
    r = rma[:,i]
    print("R2 score {:.3f} [{:.3f}, {:.3f}]".format(np.mean(r), np.percentile(r, 5), np.percentile(r, 95)))

R2 score -0.045 [-0.089, -0.013]
R2 score -0.084 [-0.142, -0.032]
R2 score -0.039 [-0.105, 0.027]
R2 score -0.055 [-0.143, -0.007]
R2 score -0.082 [-0.187, -0.000]
R2 score -0.040 [-0.129, 0.006]
R2 score -0.086 [-0.181, -0.007]


In [27]:
print(r2_score(resampled_ytest, ypred, multioutput='raw_values'))

[-0.03451686 -0.08905578 -0.07795857 -0.00683827 -0.01454762 -0.01993593
 -0.01685514]


In [28]:
# save predictions to csv file
ypred = np.array(ypred)
np.savetxt('predicted_labels_reg_chain_randfor.csv', min_max_scaler.inverse_transform(ypred), delimiter=',')

In [29]:
# chained multioutput regressor
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.multioutput import RegressorChain
from sklearn.svm import SVR
# define dataset
# define base model
model = SVR()
# define the chained multioutput wrapper model
model = RegressorChain(model)

mse = []
r = []
mae = []
for i in range(n):
# Generate predictions for testing data
    # Resample
	resampled_xtest, resampled_ytest = resample(xtest, ytest, replace=True, n_samples=len(ytest), random_state=7+i)
	# Train the regressor
	model = model.fit(xtrain, ytrain)
	ypred = model.predict(resampled_xtest)
	mse.append(mean_squared_error(resampled_ytest,ypred))
	r.append(model.score(resampled_xtest, resampled_ytest))
	mae.append(mean_absolute_error(resampled_ytest,ypred))
	rm.append(r2_score(resampled_ytest, ypred, multioutput='raw_values'))

print("Mean squared error (SVR): {:.2f} [{:.2f}, {:.2f}]".format(np.mean(mse), np.percentile(mse, 5), np.percentile(mse, 95)))
print("Mean abs error (SVR) {:.2f} [{:.2f}, {:.2f}]".format(np.mean(mae), np.percentile(mae, 5), np.percentile(mae, 95)))
print("R2 score (SVR): {:.2f} [{:.2f}, {:.2f}]".format(np.mean(r), np.percentile(r, 5), np.percentile(r, 95)))

Mean squared error (SVR): 0.05 [0.05, 0.05]
Mean abs error (SVR) 0.18 [0.17, 0.19]
R2 score (SVR): -0.13 [-0.15, -0.10]


In [30]:
rma = np.array(rm)
r=rma[:,0]


for i in range(7):
    r = rma[:,i]
    print("R2 score {:.3f} [{:.3f}, {:.3f}]".format(np.mean(r), np.percentile(r, 5), np.percentile(r, 95)))

R2 score -0.056 [-0.104, -0.013]
R2 score -0.093 [-0.183, -0.034]
R2 score -0.062 [-0.169, 0.021]
R2 score -0.087 [-0.247, -0.007]
R2 score -0.095 [-0.214, -0.004]
R2 score -0.058 [-0.151, 0.006]
R2 score -0.103 [-0.204, -0.013]


In [31]:
print(r2_score(resampled_ytest, ypred, multioutput='raw_values'))


[-0.14353194 -0.20043965 -0.14241918 -0.16603182 -0.21332745 -0.08745421
 -0.12189979]


In [32]:
# save predictions to csv file
ypred = np.array(ypred)
np.savetxt('predicted_labels_reg_chain_svr.csv', min_max_scaler.inverse_transform(ypred), delimiter=',')