### MLP Regressor on sampled Kilo-Sky data

In [1]:
import pandas as pd

In [2]:
import numpy as np

#### Environment

In [3]:
import os
# Scratch directory used as the working root; a KeyError is raised if SCRATCH is unset.
workdir = os.environ['SCRATCH']
# Export the BOSS_* settings in one call (presumably read by the SDSS/BOSS
# data-access tooling used elsewhere -- confirm).
os.environ.update({
    "BOSS_DATA_URL": 'http://dr12.sdss3.org',
    "BOSS_LOCAL_ROOT": os.path.join(workdir, 'sdss'),
    "BOSS_SAS_PATH": '/sas/dr12/boss',
    "BOSS_REDUX_VERSION": 'v5_7_0',
})

In [4]:
os.chdir(workdir)

In [5]:
os.getcwd()

'/scratch/kunjias'

In [6]:
workdir

'/scratch/kunjias'

In [7]:
! ls

home			kilo_sky_r5_features.hdf5  sdss
kilo_sky_features.hdf5	qso.dat			   sdss.building


In [8]:
kilo_sky_r5_table = pd.read_hdf('kilo_sky_r5_features.hdf5')

In [9]:
kilo_sky_r5_table.head()

Unnamed: 0,exposure_index,PLATE,MJD,XFOCAL,YFOCAL,FIBER,RA,DEC,OBJTYPE,AIRMASS_0,...,inverse_variance4119,inverse_variance4120,inverse_variance4121,inverse_variance4122,inverse_variance4123,inverse_variance4124,inverse_variance4125,inverse_variance4126,inverse_variance4127,inverse_variance4128
0,0,3844,55321,-93.327362,140.945984,566,180.40334,0.647407,SKY,0.0,...,4611232.0,3.3382400000000004e-33,0.0,0.0,0.0,-2.216037e-35,3.064008e-06,0.0,0.0,-0.029867
1,1,3844,55321,-93.327362,140.945984,566,180.40334,0.647407,SKY,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,3844,55321,-93.327362,140.945984,566,180.40334,0.647407,SKY,0.0,...,4611232.0,3.3382400000000004e-33,0.0,0.0,0.0,-2.216037e-35,3.064008e-06,0.0,0.0,-0.029867
3,3,3844,55321,-93.327362,140.945984,566,180.40334,0.647407,SKY,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,5399,55956,-119.386322,150.76239,640,185.17342,12.159917,SKY,0.0,...,7.174648e-43,-3.249857e+18,28537474.0,-1.079683e-15,12.765937,-1.816097e-37,9.403955e-38,0.0,1.3563159999999999e-19,0.0


In [10]:
kilo_sky_r5_table.shape

(5593, 20654)

In [11]:
list(kilo_sky_r5_table.columns.values)[8:13]

['OBJTYPE', 'AIRMASS_0', 'AIRMASS_1', 'AIRMASS_2', 'AIRMASS_3']

Drop the OBJTYPE column and the four AIRMASS columns (AIRMASS_0 to AIRMASS_3), i.e. column positions 8–12.

In [12]:
# Drop OBJTYPE and AIRMASS_0..AIRMASS_3 (the columns at positions 8-12 listed in
# the previous cell) by name rather than by position. A positional in-place drop
# removes five *more* columns every time the cell is re-run; dropping by label
# with errors='ignore' makes the cell safe to execute repeatedly.
kilo_sky_r5_table = kilo_sky_r5_table.drop(
    ['OBJTYPE', 'AIRMASS_0', 'AIRMASS_1', 'AIRMASS_2', 'AIRMASS_3'],
    axis=1,
    errors='ignore',
)

In [13]:
kilo_sky_r5_table.shape

(5593, 20649)

In [14]:
len(list(kilo_sky_r5_table.filter(regex='total_flux')))

4128

In [15]:
kilo_sky_r5_table.filter(regex='total_flux').shape

(5593, 4128)

In [16]:
type(kilo_sky_r5_table.filter(regex='total_flux'))

pandas.core.frame.DataFrame

In [17]:
# Feature matrix: every column except those whose name matches 'total_flux'.
X_r5 = kilo_sky_r5_table.drop(
    list(kilo_sky_r5_table.filter(regex='total_flux')),
    axis=1,
)

In [18]:
X_r5.shape

(5593, 16521)

In [19]:
# Target matrix: only the columns whose name matches 'total_flux'.
y = kilo_sky_r5_table.filter(regex='total_flux')

In [20]:
y.shape

(5593, 4128)

In [21]:
from sklearn.model_selection import train_test_split

Intermediate / test split: hold out 20% of the rows as the test set.

In [22]:
X_intermediate, X_test, y_intermediate, y_test = train_test_split(X_r5,y,shuffle=True,test_size=0.2,random_state=5)

In [23]:
X_intermediate.shape

(4474, 16521)

In [24]:
X_test.shape

(1119, 16521)

Train / validation split: divide the intermediate set equally into training and validation sets.

In [25]:
# Cut the intermediate set in half, in order (shuffle=False). The rows were
# already shuffled by the intermediate/test split above.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_intermediate,
    y_intermediate,
    test_size=0.5,
    shuffle=False,
    random_state=5,
)

In [26]:
X_train.shape

(2237, 16521)

In [27]:
X_validation.shape

(2237, 16521)

In [42]:
4*4128+9

16521

Print proportions of datasets.

In [28]:
# Report each split's share of the full dataset. The '%' format type multiplies
# the fraction by 100, so a 0.4 share is shown as "40%" rather than "0.4%".
print('train: {:.0%} | validation: {:.0%} | test {:.0%}'.format(
    y_train.shape[0] / y.shape[0],
    y_validation.shape[0] / y.shape[0],
    y_test.shape[0] / y.shape[0]))

train: 0.4% | validation: 0.4% | test 0.2%


In [29]:
X_train_filled = X_train.fillna(value=0)

In [30]:
X_test_filled = X_test.fillna(value=0)

In [31]:
y_train_filled = y_train.fillna(value=0)

In [32]:
y_test_filled = y_test.fillna(value=0)

In [33]:
X_validation_filled= X_validation.fillna(value=0)

In [34]:
y_validation_filled = y_validation.fillna(value=0)

In [35]:
from sklearn.preprocessing import StandardScaler

In [36]:
scaler = StandardScaler()

In [37]:
scaler.fit(X_train_filled)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [38]:
X_train_filled = scaler.transform(X_train_filled)

In [43]:
X_validation_filled = scaler.transform(X_validation_filled)

In [40]:
X_test_filled = scaler.transform(X_test_filled)

Training the Model

In [41]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

In [47]:
# Sweep the L2 penalty (alpha) for a 3x100 MLP with the default activation and
# report the mean squared error on the train, validation and test sets.
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]
print('-' * 76)

for alpha in alphas:
    # fit() returns the estimator itself, so construction and fitting can be chained
    mlp = MLPRegressor(
        hidden_layer_sizes=(100, 100, 100),
        alpha=alpha,
        max_iter=500,
        random_state=5,
    ).fit(X_train_filled, y_train_filled)

    # MSE per split, evaluated in train -> validation -> test order
    new_train_error, new_validation_error, new_test_error = (
        mean_squared_error(targets, mlp.predict(features))
        for features, targets in (
            (X_train_filled, y_train_filled),
            (X_validation_filled, y_validation_filled),
            (X_test_filled, y_test_filled),
        )
    )

    # one report line per alpha, errors rounded to 3 decimals
    print('alpha: {:7} | train error: {:5} | val error: {:6} | test error: {}'.format(
        alpha,
        *(round(err, 3) for err in (new_train_error, new_validation_error, new_test_error))))

----------------------------------------------------------------------------
alpha:  0.0001 | train error: 1676.38 | val error: 7.095244977836777e+61 | test error: 2.5560264441424774e+48
alpha:   0.001 | train error: 1689.952 | val error: 6.610465758634658e+61 | test error: 2.6591589780001722e+48
alpha:    0.01 | train error: 1703.614 | val error: 9.041314339403207e+61 | test error: 2.3985321741152075e+48
alpha:     0.1 | train error: 1708.666 | val error: 7.979858480528871e+61 | test error: 2.568135116303518e+48
alpha:       1 | train error: 1765.872 | val error: 6.43840085498211e+61 | test error: 2.4604191564120335e+48
alpha:      10 | train error: 1669.985 | val error: 7.948636060834854e+61 | test error: 1.837568015432352e+48


In [None]:
# Same alpha sweep as before, but with the logistic activation function.
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]
print('-' * 76)

for alpha in alphas:
    # build and fit a 3x100 logistic MLP with a fixed seed
    mlp = MLPRegressor(
        hidden_layer_sizes=(100, 100, 100),
        activation='logistic',
        alpha=alpha,
        max_iter=500,
        random_state=5,
    )
    mlp.fit(X_train_filled, y_train_filled)

    # mean squared error on each split
    new_train_error = mean_squared_error(y_train_filled, mlp.predict(X_train_filled))
    new_validation_error = mean_squared_error(y_validation_filled, mlp.predict(X_validation_filled))
    new_test_error = mean_squared_error(y_test_filled, mlp.predict(X_test_filled))

    # report line: alpha followed by the three errors rounded to 3 decimals
    print('alpha: {:7} | train error: {:5} | val error: {:6} | test error: {}'.format(
        alpha,
        *map(lambda err: round(err, 3),
             (new_train_error, new_validation_error, new_test_error))))

----------------------------------------------------------------------------




alpha:  0.0001 | train error: 3303.899 | val error: 9191.024 | test error: 3779.058
alpha:   0.001 | train error: 3301.081 | val error: 9177.256 | test error: 3796.991


In [48]:
a = (100,)

In [43]:
mlp = MLPRegressor(hidden_layer_sizes=(100,100))

In [44]:
mlp.fit(X_train_filled,y_train_filled)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [33]:
# NOTE(review): X_train_filled80 / y_train_filled80 are not defined in any visible
# cell of this notebook -- presumably leftovers from an earlier session. Unless they
# are defined outside this view, this cell fails with NameError on a fresh kernel.
mlp.fit(X_train_filled80,y_train_filled80)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

Helper functions for calculating the training and validation error (mean squared error).

In [None]:
def calc_train_error(X_train, y_train, model):
    '''Return the in-sample mean squared error (MSE) of an already-fit model.

    X_train -- features the model was trained on
    y_train -- true targets for X_train
    model   -- fitted estimator exposing predict()

    The value returned is the MSE, not its square root.
    '''
    predictions = model.predict(X_train)
    return mean_squared_error(y_train, predictions)

In [None]:
def calc_validation_error(X_test, y_test, model):
    '''Return the out-of-sample mean squared error (MSE) of an already-fit model.

    X_test -- held-out features
    y_test -- true targets for X_test
    model  -- fitted estimator exposing predict()

    The value returned is the MSE, not its square root.
    '''
    predictions = model.predict(X_test)
    return mean_squared_error(y_test, predictions)

In [None]:
def calc_metrics(X_train, y_train, X_test, y_test, model):
    '''Fit the model, then return its in-sample and out-of-sample errors.

    The model is fitted in place on (X_train, y_train). The result is the tuple
    (train_error, validation_error) as produced by calc_train_error and
    calc_validation_error, i.e. mean squared errors rather than RMSE.
    '''
    model.fit(X_train, y_train)
    train_error = calc_train_error(X_train, y_train, model)
    validation_error = calc_validation_error(X_test, y_test, model)
    return train_error, validation_error

In [36]:
predictions = mlp.predict(X_test_filled)

In [35]:
# NOTE(review): X_test_filled20 is not defined in any visible cell of this notebook
# (presumably a leftover from an earlier session); confirm or switch to X_test_filled.
predictions20 = mlp.predict(X_test_filled20)

In [36]:
from sklearn.metrics import classification_report,confusion_matrix

In [37]:
# NOTE(review): confusion_matrix is a classification metric. The targets here are
# continuous multi-output values, so this call raises the ValueError recorded in the
# cell output below. A regression metric such as mean_squared_error (imported earlier
# in the notebook) is the appropriate replacement.
print(confusion_matrix(y_test_filled, predictions20))

ValueError: continuous-multioutput is not supported

In [45]:
from sklearn.model_selection import KFold

In [46]:
# Toy 4-row dataset used only to illustrate how KFold partitions row indices.
X = np.array([[1,2],[3,4],[1,2],[3,4]])
# NOTE(review): this rebinds `y`, which earlier in the notebook held the
# total_flux target frame; cells re-run after this point will see the toy array.
y = np.array([1,2,3,4])
# Two folds, default settings (see the printed repr in a later cell).
kf = KFold(n_splits=2)
# Number of splitting iterations; shown as the cell output.
kf.get_n_splits(X)

2

In [48]:
print(kf)

KFold(n_splits=2, random_state=None, shuffle=False)


In [51]:
X

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [52]:
# Show the row indices KFold assigns to each fold. The per-fold slices use
# fold-local names so the real X_train / X_test / y_train / y_test splits built
# earlier in the notebook are not overwritten by this toy demonstration.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index,"TEST:", test_index)
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

TRAIN: [2 3] TEST: [0 1]
TRAIN: [0 1] TEST: [2 3]
