Load the data.

In [153]:
import pandas as pd
url='https://bd29ee0e-54ab-4daa-9671-d153865d1620.usrfiles.com/ugd/bd29ee_0e922a809bfe468b8552ddbd1e7753b2.csv'
df=pd.read_csv(url,header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,winter,small_,medium,8.0,9.8,60.8,6.238,578.0,105.0,170.0,50.0,0.0,0.0,0.0,0.0,34.2,8.3,0.0
1,spring,small_,medium,8.35,8.0,57.75,1.288,370.0,428.75,558.75,1.3,1.4,7.6,4.8,1.9,6.7,0.0,2.1
2,autumn,small_,medium,8.1,11.4,40.02,5.33,346.66699,125.667,187.05701,15.6,3.3,53.6,1.9,0.0,0.0,0.0,9.7
3,spring,small_,medium,8.07,4.8,77.364,2.302,98.182,61.182,138.7,1.4,3.1,41.0,18.9,0.0,1.4,0.0,1.4
4,autumn,small_,medium,8.06,9.0,55.35,10.416,233.7,58.222,97.58,10.5,9.2,2.9,7.5,0.0,7.5,4.1,1.0


In [154]:
df.shape

(200, 18)

The original dataset was very messy. Columns 3 to 10 did not only contain missing values that were marked with "XXXXXXX", but also contained entries that looked like "2.822008777.59961". To make it worse, columns 3 to 10 in the original dataset were of data type "object". So, the "mean()" method did not work on these columns. My original plan with columns 3 to 10 was that I was going to replace the cells with "XXXXXXX" with 0.0, then convert these columns into float, so that I could calcualte the means of these columns and replace the 0.0 with the mean of the corresponding column. However, as I applied the "dataframe.astype(float)" to these columns, it kept returning the same error message that Python could not covert string to float. Further probe revealed that these coulmns did not only contain "XXXXXXX", but also entries that looked like "2.822008777.59961". There were quite a few of them scatter around in columns 3 to 8. Finding them one by one would take a long time. 

On the other hand, MS Excel could recognize columns as numbers and calcualte the mean of each column, automatically skipping the cells with "XXXXXXX" and entries that are like "2.822008777.59961". I could nest the "ISNUMBER" function inside the "IF" function and easily take care of all those problematic cells. The "ISNUMBER" function tests whether a cell is a number. If it is, then keep that number; if it is not (i.e, XXXXXXX or 2.822008777.59961), the replace it with with the mean of the column. Very fast and convenient.

For the reasons above, I have cleaned the dataset before loading it here.

Verify that there is no empty cells.

In [155]:
df.isna().any()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
dtype: bool

Label-encode the first 3 columns.

In [156]:
from sklearn.preprocessing import LabelEncoder
df_object=df.iloc[:,0:3]
df_object=df_object.apply(LabelEncoder().fit_transform)
df_object.head()

Unnamed: 0,0,1,2
0,3,2,2
1,1,2,2
2,0,2,2
3,1,2,2
4,0,2,2


In [157]:
df_s=df.iloc[:,3:18]
df=pd.concat([df_object,df_s],axis=1)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,3,2,2,8.00,9.8,60.800000,6.238000,578.000000,105.000000,170.000000,50.000000,0.0,0.0,0.0,0.0,34.2,8.3,0.0
1,1,2,2,8.35,8.0,57.750000,1.288000,370.000000,428.750000,558.750000,1.300000,1.4,7.6,4.8,1.9,6.7,0.0,2.1
2,0,2,2,8.10,11.4,40.020000,5.330000,346.666990,125.667000,187.057010,15.600000,3.3,53.6,1.9,0.0,0.0,0.0,9.7
3,1,2,2,8.07,4.8,77.364000,2.302000,98.182000,61.182000,138.700000,1.400000,3.1,41.0,18.9,0.0,1.4,0.0,1.4
4,0,2,2,8.06,9.0,55.350000,10.416000,233.700000,58.222000,97.580000,10.500000,9.2,2.9,7.5,0.0,7.5,4.1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,2,8.40,8.4,17.375000,3.833000,83.750000,53.625000,79.750000,2.338000,12.7,21.7,5.6,0.0,1.0,0.0,0.0
196,1,0,2,8.30,10.6,14.320000,3.200000,125.333000,35.333000,75.904000,4.667000,18.0,7.0,1.7,0.0,4.8,10.3,1.0
197,0,0,2,8.20,7.0,139.989000,2.978000,60.110000,78.333000,140.220000,31.738000,0.0,15.9,2.4,1.0,0.0,0.0,0.0
198,3,0,2,8.00,7.6,43.636279,3.011558,154.447197,83.325566,111.550601,13.540729,0.0,12.5,3.7,1.0,0.0,0.0,4.9


Standardize the dataset.

In [158]:
from sklearn import preprocessing
scaler=preprocessing.StandardScaler()
df=pd.DataFrame(scaler.fit_transform(df))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.212446,1.159382,1.09984,-0.019710,0.287463,3.770155e-01,1.499615e+00,2.370772e+00,1.870033e-01,5.761715e-01,1.843970e+00,-0.791883,-0.643226,-0.632861,-0.471679,2.907351,0.326073,-0.495313
1,-0.577142,1.159382,1.09984,0.568222,-0.470990,3.100197e-01,-8.010907e-01,1.206524e+00,2.980263e+00,4.408318e+00,-6.190889e-01,-0.726139,0.061730,0.009434,0.003880,0.108886,-0.546820,-0.080248
2,-1.471936,1.159382,1.09984,0.148270,0.961644,-7.943460e-02,1.077586e+00,1.075921e+00,3.653146e-01,7.443129e-01,1.041500e-01,-0.636915,4.328569,-0.378619,-0.471679,-0.572922,-0.546820,1.421892
3,-0.577142,1.159382,1.09984,0.097876,-1.819352,7.408577e-01,-3.297947e-01,-3.149358e-01,-1.910509e-01,2.676283e-01,-6.140313e-01,-0.646307,3.159826,1.896175,-0.471679,-0.430455,-0.546820,-0.218603
4,-1.471936,1.159382,1.09984,0.081078,-0.049627,2.573017e-01,3.441503e+00,4.436054e-01,-2.165893e-01,-1.377168e-01,-1.537884e-01,-0.359853,-0.374230,0.370725,-0.471679,0.190296,-0.115632,-0.297663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,-1.471936,-1.505864,1.09984,0.652213,-0.302445,-5.768510e-01,3.817972e-01,-3.957167e-01,-2.562514e-01,-3.134780e-01,-5.665909e-01,-0.195494,1.369609,0.116483,-0.471679,-0.471160,-0.546820,-0.495313
196,-0.577142,-1.505864,1.09984,0.484232,0.624554,-6.439566e-01,8.758575e-02,-1.629623e-01,-4.140716e-01,-3.513904e-01,-4.487990e-01,0.053393,0.006076,-0.405382,-0.471679,-0.084463,0.536409,-0.297663
197,-1.471936,-1.505864,1.09984,0.316251,-0.892353,2.116468e+00,-1.559739e-02,-5.280380e-01,-4.307500e-02,2.826118e-01,9.203479e-01,-0.791883,0.831616,-0.311714,-0.221385,-0.572922,-0.546820,-0.495313
198,1.212446,-1.505864,1.09984,-0.019710,-0.639535,-4.393168e-11,-2.091550e-11,-1.063496e-10,3.796239e-11,3.647317e-10,-1.618436e-10,-0.791883,0.516241,-0.137759,-0.221385,-0.572922,-0.546820,0.473172


This dataset is from the COIL 1999 challenge. I found it on UCI's KDD Archive. According to the descriptions of the dataset on the website, columns 0 to 10 are predictor variables that predictor variables that represent the seaon, river size and chemical concentration. Columns 11 to 17 are target variables that represent the algae population distribution.

Separate the predictor variables and target variables. The task is a multi-output regression.

In [159]:
X=df.iloc[:,3:11]
X

Unnamed: 0,3,4,5,6,7,8,9,10
0,-0.019710,0.287463,3.770155e-01,1.499615e+00,2.370772e+00,1.870033e-01,5.761715e-01,1.843970e+00
1,0.568222,-0.470990,3.100197e-01,-8.010907e-01,1.206524e+00,2.980263e+00,4.408318e+00,-6.190889e-01
2,0.148270,0.961644,-7.943460e-02,1.077586e+00,1.075921e+00,3.653146e-01,7.443129e-01,1.041500e-01
3,0.097876,-1.819352,7.408577e-01,-3.297947e-01,-3.149358e-01,-1.910509e-01,2.676283e-01,-6.140313e-01
4,0.081078,-0.049627,2.573017e-01,3.441503e+00,4.436054e-01,-2.165893e-01,-1.377168e-01,-1.537884e-01
...,...,...,...,...,...,...,...,...
195,0.652213,-0.302445,-5.768510e-01,3.817972e-01,-3.957167e-01,-2.562514e-01,-3.134780e-01,-5.665909e-01
196,0.484232,0.624554,-6.439566e-01,8.758575e-02,-1.629623e-01,-4.140716e-01,-3.513904e-01,-4.487990e-01
197,0.316251,-0.892353,2.116468e+00,-1.559739e-02,-5.280380e-01,-4.307500e-02,2.826118e-01,9.203479e-01
198,-0.019710,-0.639535,-4.393168e-11,-2.091550e-11,-1.063496e-10,3.796239e-11,3.647317e-10,-1.618436e-10


In [160]:
y=df.iloc[:,11:18]
y

Unnamed: 0,11,12,13,14,15,16,17
0,-0.791883,-0.643226,-0.632861,-0.471679,2.907351,0.326073,-0.495313
1,-0.726139,0.061730,0.009434,0.003880,0.108886,-0.546820,-0.080248
2,-0.636915,4.328569,-0.378619,-0.471679,-0.572922,-0.546820,1.421892
3,-0.646307,3.159826,1.896175,-0.471679,-0.430455,-0.546820,-0.218603
4,-0.359853,-0.374230,0.370725,-0.471679,0.190296,-0.115632,-0.297663
...,...,...,...,...,...,...,...
195,-0.195494,1.369609,0.116483,-0.471679,-0.471160,-0.546820,-0.495313
196,0.053393,0.006076,-0.405382,-0.471679,-0.084463,0.536409,-0.297663
197,-0.791883,0.831616,-0.311714,-0.221385,-0.572922,-0.546820,-0.495313
198,-0.791883,0.516241,-0.137759,-0.221385,-0.572922,-0.546820,0.473172


In [161]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
model=MultiOutputRegressor(GradientBoostingRegressor(loss='ls',learning_rate=0.001,n_estimators=500,max_features='sqrt',max_depth=50,random_state=0)).fit(X,y)
model

MultiOutputRegressor(estimator=GradientBoostingRegressor(learning_rate=0.001,
                                                         max_depth=50,
                                                         max_features='sqrt',
                                                         n_estimators=500,
                                                         random_state=0))

In [162]:
from sklearn.metrics import mean_squared_error as mse
import numpy as p
y_pred=pd.DataFrame(model.predict(X))
n_classes=y.shape[1]
for i in range(n_classes):
    print("The root mean square error for the ",i,"th target is ,", np.sqrt(mse(y.iloc[:,i],y_pred.iloc[:,i]))  )  

The root mean square error for the  0 th target is , 0.606378944861185
The root mean square error for the  1 th target is , 0.606378944861185
The root mean square error for the  2 th target is , 0.6063789448611852
The root mean square error for the  3 th target is , 0.6063789448611849
The root mean square error for the  4 th target is , 0.6063789448611849
The root mean square error for the  5 th target is , 0.6063789448611849
The root mean square error for the  6 th target is , 0.6063789448611849


In [163]:
from sklearn.metrics import r2_score as r2
for i in range(n_classes):
    print("The r-squared for the ",i,"th target is ,", r2(y.iloc[:,i],y_pred.iloc[:,i], multioutput='variance_weighted'))

The r-squared for the  0 th target is , 0.6323045752290359
The r-squared for the  1 th target is , 0.6323045752290359
The r-squared for the  2 th target is , 0.6323045752290357
The r-squared for the  3 th target is , 0.6323045752290359
The r-squared for the  4 th target is , 0.632304575229036
The r-squared for the  5 th target is , 0.6323045752290359
The r-squared for the  6 th target is , 0.6323045752290359


In [165]:
for i in range(n_classes):
    print("correlation coefficient for the ",i,"th target is ,", np.corrcoef(y.iloc[:,i],y_pred.iloc[:,i]))

correlation coefficient for the  0 th target is , [[1. 1.]
 [1. 1.]]
correlation coefficient for the  1 th target is , [[1. 1.]
 [1. 1.]]
correlation coefficient for the  2 th target is , [[1. 1.]
 [1. 1.]]
correlation coefficient for the  3 th target is , [[1. 1.]
 [1. 1.]]
correlation coefficient for the  4 th target is , [[1. 1.]
 [1. 1.]]
correlation coefficient for the  5 th target is , [[1. 1.]
 [1. 1.]]
correlation coefficient for the  6 th target is , [[1. 1.]
 [1. 1.]]


Load the test data.

In [166]:
url_t='https://bd29ee0e-54ab-4daa-9671-d153865d1620.usrfiles.com/ugd/bd29ee_fe6dd697d4b74c3f88fda9626f9c435c.csv'
df_t=pd.read_csv(url_t,header=None)
df_t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,summer,small_,medium,7.95,5.7,57.3330,2.460,273.33301,295.66699,380.00000,10.912868,1.2,36.5,1.9,0.0,1.2,0.0,28.0
1,winter,small_,medium,7.98,8.8,59.3330,7.392,286.66699,33.33300,138.00000,7.100000,1.2,0.0,0.0,0.0,23.2,46.4,0.0
2,summer,small_,medium,8.00,7.2,80.0000,1.957,174.28600,47.85700,113.71400,4.500000,7.0,23.0,6.5,1.4,21.2,0.0,2.1
3,spring,small_,high__,8.35,8.4,68.0000,3.026,458.00000,45.20000,111.80000,3.200000,1.4,38.2,2.4,0.0,4.8,1.0,1.2
4,spring,small_,medium,8.10,13.2,19.0000,0.000,130.00000,6.00000,40.00000,2.000000,3.9,55.4,8.4,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,summer,medium,high__,8.12,10.2,7.6130,0.699,33.56000,28.03400,49.65800,2.200000,18.1,1.7,2.0,0.0,1.7,5.9,0.0
136,winter,large_,low___,8.43,10.8,35.6420,6.225,134.00000,103.50000,108.80637,45.375000,1.1,3.9,2.1,0.0,3.9,4.6,2.3
137,winter,large_,low___,8.70,11.7,21.4656,3.765,91.45000,38.00000,83.00000,17.000000,0.0,4.7,0.0,0.0,2.6,2.6,0.0
138,summer,large_,low___,8.10,8.2,26.5400,2.805,42.75000,48.50000,88.12500,13.980000,0.0,12.0,1.7,0.0,2.7,0.0,0.0


As with the train data, I partially cleaned the test data using the method described above before loading it here.

In [167]:
df_t.isna().any()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15     True
16     True
17     True
dtype: bool

Columns 15, 16, 17 have missing values. 

In [168]:
df_m=pd.DataFrame(df_t.iloc[:,15:18]).rename(columns={0:"15",1:"16",2:"17"})
p_m =pd.DataFrame(df_m.isnull().sum() * 100 / len(df_t)).rename(columns={0:"Percentage"})
p_m

Unnamed: 0,Percentage
15,0.714286
16,0.714286
17,5.0


The empty cells consist of very small proportions on this columns. We are going to fill them with the mean of the corresponding column.

In [169]:
df_m=df_m.fillna(df_m.mean())
df_m

Unnamed: 0,15,16,17
0,1.2,0.0,28.0
1,23.2,46.4,0.0
2,21.2,0.0,2.1
3,4.8,1.0,1.2
4,0.0,0.0,0.0
...,...,...,...
135,1.7,5.9,0.0
136,3.9,4.6,2.3
137,2.6,2.6,0.0
138,2.7,0.0,0.0


Verify that the empty cells have been filled.

In [170]:
df_m.isna().any()

15    False
16    False
17    False
dtype: bool

Merge the columns back to the test data.

In [171]:
df_t=df_t.iloc[:,0:15]
df_t=pd.concat([df_t,df_m],axis=1)
df_t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,summer,small_,medium,7.95,5.7,57.3330,2.460,273.33301,295.66699,380.00000,10.912868,1.2,36.5,1.9,0.0,1.2,0.0,28.0
1,winter,small_,medium,7.98,8.8,59.3330,7.392,286.66699,33.33300,138.00000,7.100000,1.2,0.0,0.0,0.0,23.2,46.4,0.0
2,summer,small_,medium,8.00,7.2,80.0000,1.957,174.28600,47.85700,113.71400,4.500000,7.0,23.0,6.5,1.4,21.2,0.0,2.1
3,spring,small_,high__,8.35,8.4,68.0000,3.026,458.00000,45.20000,111.80000,3.200000,1.4,38.2,2.4,0.0,4.8,1.0,1.2
4,spring,small_,medium,8.10,13.2,19.0000,0.000,130.00000,6.00000,40.00000,2.000000,3.9,55.4,8.4,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,summer,medium,high__,8.12,10.2,7.6130,0.699,33.56000,28.03400,49.65800,2.200000,18.1,1.7,2.0,0.0,1.7,5.9,0.0
136,winter,large_,low___,8.43,10.8,35.6420,6.225,134.00000,103.50000,108.80637,45.375000,1.1,3.9,2.1,0.0,3.9,4.6,2.3
137,winter,large_,low___,8.70,11.7,21.4656,3.765,91.45000,38.00000,83.00000,17.000000,0.0,4.7,0.0,0.0,2.6,2.6,0.0
138,summer,large_,low___,8.10,8.2,26.5400,2.805,42.75000,48.50000,88.12500,13.980000,0.0,12.0,1.7,0.0,2.7,0.0,0.0


In [172]:
df_t_object=df_t.iloc[:,0:3]
df_t_object=df_t_object.apply(LabelEncoder().fit_transform)
df_t_object

Unnamed: 0,0,1,2
0,2,2,2
1,3,2,2
2,2,2,2
3,1,2,0
4,1,2,2
...,...,...,...
135,2,1,0
136,3,0,1
137,3,0,1
138,2,0,1


In [173]:
df_s=df_t.iloc[:,3:18]
df_t=pd.concat([df_t_object,df_s],axis=1)
df_t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,2,2,2,7.95,5.7,57.3330,2.460,273.33301,295.66699,380.00000,10.912868,1.2,36.5,1.9,0.0,1.2,0.0,28.0
1,3,2,2,7.98,8.8,59.3330,7.392,286.66699,33.33300,138.00000,7.100000,1.2,0.0,0.0,0.0,23.2,46.4,0.0
2,2,2,2,8.00,7.2,80.0000,1.957,174.28600,47.85700,113.71400,4.500000,7.0,23.0,6.5,1.4,21.2,0.0,2.1
3,1,2,0,8.35,8.4,68.0000,3.026,458.00000,45.20000,111.80000,3.200000,1.4,38.2,2.4,0.0,4.8,1.0,1.2
4,1,2,2,8.10,13.2,19.0000,0.000,130.00000,6.00000,40.00000,2.000000,3.9,55.4,8.4,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2,1,0,8.12,10.2,7.6130,0.699,33.56000,28.03400,49.65800,2.200000,18.1,1.7,2.0,0.0,1.7,5.9,0.0
136,3,0,1,8.43,10.8,35.6420,6.225,134.00000,103.50000,108.80637,45.375000,1.1,3.9,2.1,0.0,3.9,4.6,2.3
137,3,0,1,8.70,11.7,21.4656,3.765,91.45000,38.00000,83.00000,17.000000,0.0,4.7,0.0,0.0,2.6,2.6,0.0
138,2,0,1,8.10,8.2,26.5400,2.805,42.75000,48.50000,88.12500,13.980000,0.0,12.0,1.7,0.0,2.7,0.0,0.0


Standardize the test data.

In [174]:
df_t=pd.DataFrame(scaler.fit_transform(df_t))
df_t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.538337,1.159999,1.111270,-0.048755,-1.598301,0.409818,-0.186692,0.512627,2.400913,2.775882e+00,2.087892e-10,-0.743580,2.903632,-0.287667,-0.612966,-0.522395,-0.532720,5.846406
1,1.446376,1.159999,1.111270,0.006193,-0.187414,0.459788,1.903523,0.582077,-0.357395,2.988200e-01,-2.932948e-01,-0.743580,-0.645599,-0.630804,-0.612966,1.772181,2.950837,-0.383961
2,0.538337,1.159999,1.111270,0.042825,-0.915614,0.976157,-0.399866,-0.003258,-0.204683,5.023348e-02,-4.932928e-01,-0.459926,1.590903,0.543084,0.025405,1.563583,-0.532720,0.083317
3,-0.369701,1.159999,-1.095507,0.683886,-0.369464,0.676335,0.053183,1.474462,-0.232620,3.064218e-02,-5.932919e-01,-0.733799,3.068939,-0.197368,-0.612966,-0.146919,-0.457644,-0.116945
4,-0.369701,1.159999,1.111270,0.225985,1.815136,-0.547939,-1.229256,-0.233921,-0.644788,-7.042878e-01,-6.855987e-01,-0.611534,4.741453,0.886221,-0.612966,-0.647553,-0.532720,-0.383961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,0.538337,-0.108750,-1.095507,0.262617,0.449761,-0.832445,-0.933015,-0.736227,-0.413112,-6.054305e-01,-6.702142e-01,0.082931,-0.480292,-0.269607,-0.612966,-0.470245,-0.089768,-0.383961
136,1.446376,-1.377499,0.007881,0.830414,0.722836,-0.132136,1.408940,-0.213087,0.380375,-4.752334e-10,2.650908e+00,-0.748471,-0.266366,-0.251547,-0.612966,-0.240788,-0.187368,0.127820
137,1.446376,-1.377499,0.007881,1.324946,1.132448,-0.486335,0.366376,-0.434708,-0.308324,-2.641487e-01,4.682364e-01,-0.802267,-0.188575,-0.630804,-0.612966,-0.376376,-0.337521,-0.383961
138,0.538337,-1.377499,0.007881,0.225985,-0.460489,-0.359551,-0.040478,-0.688361,-0.197922,-2.116902e-01,2.359310e-01,-0.802267,0.521272,-0.323787,-0.612966,-0.365947,-0.532720,-0.383961


Separate the predictor variables and target variables.

In [175]:
X_t=df_t.iloc[:,3:11]
X_t

Unnamed: 0,3,4,5,6,7,8,9,10
0,-0.048755,-1.598301,0.409818,-0.186692,0.512627,2.400913,2.775882e+00,2.087892e-10
1,0.006193,-0.187414,0.459788,1.903523,0.582077,-0.357395,2.988200e-01,-2.932948e-01
2,0.042825,-0.915614,0.976157,-0.399866,-0.003258,-0.204683,5.023348e-02,-4.932928e-01
3,0.683886,-0.369464,0.676335,0.053183,1.474462,-0.232620,3.064218e-02,-5.932919e-01
4,0.225985,1.815136,-0.547939,-1.229256,-0.233921,-0.644788,-7.042878e-01,-6.855987e-01
...,...,...,...,...,...,...,...,...
135,0.262617,0.449761,-0.832445,-0.933015,-0.736227,-0.413112,-6.054305e-01,-6.702142e-01
136,0.830414,0.722836,-0.132136,1.408940,-0.213087,0.380375,-4.752334e-10,2.650908e+00
137,1.324946,1.132448,-0.486335,0.366376,-0.434708,-0.308324,-2.641487e-01,4.682364e-01
138,0.225985,-0.460489,-0.359551,-0.040478,-0.688361,-0.197922,-2.116902e-01,2.359310e-01


In [176]:
y_t=df_t.iloc[:,11:18]
y_t

Unnamed: 0,11,12,13,14,15,16,17
0,-0.743580,2.903632,-0.287667,-0.612966,-0.522395,-0.532720,5.846406
1,-0.743580,-0.645599,-0.630804,-0.612966,1.772181,2.950837,-0.383961
2,-0.459926,1.590903,0.543084,0.025405,1.563583,-0.532720,0.083317
3,-0.733799,3.068939,-0.197368,-0.612966,-0.146919,-0.457644,-0.116945
4,-0.611534,4.741453,0.886221,-0.612966,-0.647553,-0.532720,-0.383961
...,...,...,...,...,...,...,...
135,0.082931,-0.480292,-0.269607,-0.612966,-0.470245,-0.089768,-0.383961
136,-0.748471,-0.266366,-0.251547,-0.612966,-0.240788,-0.187368,0.127820
137,-0.802267,-0.188575,-0.630804,-0.612966,-0.376376,-0.337521,-0.383961
138,-0.802267,0.521272,-0.323787,-0.612966,-0.365947,-0.532720,-0.383961


In [177]:
yhat_t=pd.DataFrame(model.predict(X_t))
yhat_t

Unnamed: 0,0,1,2,3,4,5,6
0,-0.217679,0.110370,0.184864,0.074479,-0.108980,-0.121381,0.052131
1,-0.071996,-0.083593,0.003087,-0.023524,0.214401,0.500007,-0.118857
2,-0.219370,-0.023607,0.377684,0.095027,0.021951,-0.092802,-0.077645
3,-0.183240,-0.016333,0.076906,0.116097,0.022715,0.172478,-0.018931
4,0.252290,-0.015283,0.098950,0.122950,-0.031237,-0.105622,-0.075961
...,...,...,...,...,...,...,...
135,0.379188,-0.176183,0.155435,-0.110680,0.089804,-0.151754,-0.118663
136,-0.232364,0.394457,-0.083329,-0.105028,-0.101312,0.124933,0.021929
137,-0.271678,0.210631,-0.200962,-0.182250,0.245031,-0.046605,-0.185638
138,-0.188371,0.115404,-0.078392,-0.058069,-0.119651,-0.178648,-0.038094


In [178]:
import numpy as np
n_classes=y.shape[1]
for i in range(n_classes):
    print("The root mean square error for the ",i,"th target is ,", np.sqrt(mse(y_t.iloc[:,i],yhat_t.iloc[:,i]))  )  

The root mean square error for the  0 th target is , 0.8107640242731087
The root mean square error for the  1 th target is , 0.9564975697472958
The root mean square error for the  2 th target is , 0.9488966309473309
The root mean square error for the  3 th target is , 0.94934541580956
The root mean square error for the  4 th target is , 0.9157113714139733
The root mean square error for the  5 th target is , 0.9224968020274101
The root mean square error for the  6 th target is , 0.9678014629595917


In [180]:
for i in range(n_classes):
    print("The correlation coefficient for the ",i,"th target is ,",np.corrcoef(y_t.iloc[:,i],yhat_t.iloc[:,i]))

The correlation coefficient for the  0 th target is , [[1.         0.70041651]
 [0.70041651 1.        ]]
The correlation coefficient for the  1 th target is , [[1.         0.30121144]
 [0.30121144 1.        ]]
The correlation coefficient for the  2 th target is , [[1.         0.35046058]
 [0.35046058 1.        ]]
The correlation coefficient for the  3 th target is , [[1.         0.33689899]
 [0.33689899 1.        ]]
The correlation coefficient for the  4 th target is , [[1.         0.54201447]
 [0.54201447 1.        ]]
The correlation coefficient for the  5 th target is , [[1.         0.46163839]
 [0.46163839 1.        ]]
The correlation coefficient for the  6 th target is , [[1.         0.27928086]
 [0.27928086 1.        ]]
