# Multiple Linear Regression

## Importing the libraries

In [1]:
import h2o4gpu
from h2o4gpu import DAAL_SUPPORTED
from h2o4gpu.util.metrics import mse
from h2o4gpu.util.metrics import mae
from h2o4gpu.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import matplotlib
import matplotlib.pyplot as plt

import numpy as np
import numpy.ma as ma
from numpy import *

import pandas as pd

import sys

%reload_ext memory_profiler

print("all imports successful!")

all imports successful!


### Checking Conda Environment

In [2]:
print(sys.executable)

/home/chakravarty.s/miniconda3/envs/h2o/bin/python3


In [3]:
print(sys.version)

3.6.11 | packaged by conda-forge | (default, Aug  5 2020, 20:09:42) 
[GCC 7.5.0]


## Testing  h2oai /h2o4gpu 

In [4]:
X = np.array([[1.,1.], [1.,4.], [1.,0.]])
model = h2o4gpu.KMeans(n_clusters=2,random_state=1234).fit(X)
model.cluster_centers_

array([[1. , 0.5],
       [1. , 4. ]])

## Importing the dataset

In [5]:
pandas_df = pd.read_csv('/home/chakravarty.s/datasets/aug.csv')
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 983991 entries, 0 to 983990
Data columns (total 83 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   Local Job Id                                      983991 non-null  int64  
 1   System Username (Deidentified)                    983991 non-null  object 
 2   Shared                                            983991 non-null  int64  
 3   Cores                                             983991 non-null  int64  
 4   Nodes                                             983991 non-null  int64  
 5   Total Cores Available                             983991 non-null  int64  
 6   Cpu Time                                          983991 non-null  int64  
 7   Node Time                                         983991 non-null  int64  
 8   Requested Nodes                                   983991 non-null  int64  
 9   Requ

In [7]:
pandas_df.head()

Unnamed: 0,Local Job Id,System Username (Deidentified),Shared,Cores,Nodes,Total Cores Available,Cpu Time,Node Time,Requested Nodes,Requested Wall Time,...,Net Ib0 Tx Packets,Net Mic0 Rx Packets,Net Mic0 Tx Packets,Net Mic1 Rx Packets,Net Mic1 Tx Packets,Net Eth0 Rx Cov,Net Eth0 Tx Cov,Net Ib0 Rx Cov,Net Ib0 Tx Cov,Parallel filesystem lustre bytes transmitted cov
0,585768,5daba4beef28e18c7f008f25124110c91aba5abb,0,640,32,640,61068160,3053408,32,172800.0,...,83106550000.0,59397.799397,455651.767202,59784.295585,442156.387134,,,0.06895,0.066064,3.549426
1,585769,5daba4beef28e18c7f008f25124110c91aba5abb,0,640,32,640,58293120,2914656,32,172800.0,...,81320980000.0,59634.924307,433592.103155,60048.889133,432517.120973,,,0.068929,0.066047,3.550658
2,585773,5daba4beef28e18c7f008f25124110c91aba5abb,0,640,32,640,54361600,2718080,32,259200.0,...,2391560000.0,59079.48844,407031.340753,59009.779747,407049.667957,,,0.064332,0.061032,4.442432
3,585774,5daba4beef28e18c7f008f25124110c91aba5abb,0,640,32,640,46678400,2333920,32,259200.0,...,2262686000.0,30563.448861,332175.233069,30858.081742,332155.461502,,,0.064349,0.061046,4.446053
4,585775,5daba4beef28e18c7f008f25124110c91aba5abb,0,640,32,640,37953920,1897696,32,259200.0,...,2282364000.0,34536.345662,275712.819236,34745.770934,275407.397537,,,0.064339,0.061073,4.457481


In [8]:
X_numpy_float_ndarray = pandas_df.iloc[1:, [2,3,4,5,6,7,8,11,12,14]].values.astype(float)
print(X_numpy_float_ndarray[0:2])
print(type(X_numpy_float_ndarray))

[[0.00000000e+00 6.40000000e+02 3.20000000e+01 6.40000000e+02
  5.82931200e+07 2.91465600e+06 3.20000000e+01 3.61330000e+05
  9.10830000e+04 1.59624213e+09]
 [0.00000000e+00 6.40000000e+02 3.20000000e+01 6.40000000e+02
  5.43616000e+07 2.71808000e+06 3.20000000e+01 3.66774000e+05
  8.49400000e+04 1.59624214e+09]]
<class 'numpy.ndarray'>


In [9]:
y_float_df = pandas_df.iloc[1:, [9]].astype(float)
print(y_float_df)
print(type(y_float_df))

        Requested Wall Time
1                  172800.0
2                  259200.0
3                  259200.0
4                  259200.0
5                  259200.0
...                     ...
983986                600.0
983987                600.0
983988              86400.0
983989              86400.0
983990                600.0

[983990 rows x 1 columns]
<class 'pandas.core.frame.DataFrame'>


## Preprocessing 

In [10]:
print(f"y[target] has NaNs? {y_float_df.isnull().values.any()}")
print(f"Number of NaNs in y[target] >> {y_float_df.isnull().values.sum()}")

y[target] has NaNs? True
Number of NaNs in y[target] >> 6


### Replace nan's with mean

In [11]:
y_float_df_no_nan = y_float_df.fillna(y_float_df.mean())

In [12]:
print(f"y[target] has NaNs? {y_float_df_no_nan.isnull().values.any()}")
print(f"Number of NaNs in y[target] >> {y_float_df_no_nan.isnull().values.sum()}")

y[target] has NaNs? False
Number of NaNs in y[target] >> 0


In [13]:
y_float_no_nan_ndarray = y_float_df_no_nan.to_numpy()
print(y_float_no_nan_ndarray)
print(type(y_float_no_nan_ndarray))

[[172800.]
 [259200.]
 [259200.]
 ...
 [ 86400.]
 [ 86400.]
 [   600.]]
<class 'numpy.ndarray'>


## Splitting the dataset into the Training set and Test set

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_numpy_float_ndarray, y_float_no_nan_ndarray, train_size = 0.9)
print(X_train[0:3])

[[1.00000000e+00 1.00000000e+00 1.00000000e+00 2.40000000e+01
  1.71700000e+03 1.71700000e+03 0.00000000e+00 3.20000000e+01
  1.71700000e+03 1.59788132e+09]
 [1.00000000e+00 4.00000000e+00 1.00000000e+00 2.40000000e+01
  6.52000000e+02 1.63000000e+02 0.00000000e+00 1.48000000e+02
  1.63000000e+02 1.59624397e+09]
 [0.00000000e+00 2.80000000e+01 1.00000000e+00 0.00000000e+00
  1.40280000e+04 5.01000000e+02 0.00000000e+00 5.33100000e+03
  5.01000000e+02 1.59752524e+09]]


In [15]:
print(y_train[0:3])

[[129600.]
 [ 14400.]
 [ 21600.]]


## Training the Multiple Linear Regression model on the Training set [sklearn]

In [16]:
import multiprocessing

cores = multiprocessing.cpu_count()
print(f"Number of CPU Cores Detected: {cores}")

Number of CPU Cores Detected: 28


In [17]:
from sklearn import linear_model

regr1 = linear_model.LinearRegression(n_jobs = 1)
regrn = linear_model.LinearRegression(n_jobs = cores/2)
regrall = linear_model.LinearRegression(n_jobs = -1)

%timeit regr1.fit(X_train, y_train)
%memit regr1.fit(X_train, y_train)
print("\n")
%timeit regrn.fit(X_train, y_train)
%memit regrn.fit(X_train, y_train)
print("\n")
%timeit regrall.fit(X_train, y_train)
%memit regrall.fit(X_train, y_train)

267 ms ± 8.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 1579.24 MiB, increment: 0.05 MiB


304 ms ± 2.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 1579.30 MiB, increment: 0.00 MiB


263 ms ± 3.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 1579.30 MiB, increment: 0.00 MiB


In [None]:
from h2o4gpu.solvers.linear_regression import LinearRegression

model = LinearRegression()
%timeit model.fit(X_train, y_train)
%memit model.fit(X_train, y_train)

In [34]:
if DAAL_SUPPORTED:
    print("DAAL supported by this Intel CPU™")
    lin_solver_daal = h2o4gpu.LinearRegression(fit_intercept = True, verbose = True, backend = 'daal', n_jobs = 1, n_gpus = 1)
    %timeit lin_solver_daal.fit(X_train, y_train)
    %memit lin_solver_daal.fit(X_train, y_train)
else:
    print("DAAL not supported by this Intel CPU™")
    print("\nTrying 1 CPU...\n")
    lin_solver_sklearn_1_cpu = h2o4gpu.LinearRegression(fit_intercept = True, verbose = True, backend = 'sklearn', n_jobs = 1)
    %timeit lin_solver_sklearn_1_cpu.fit(X_train, y_train)
    %memit lin_solver_sklearn_1_cpu.fit(X_train, y_train)
    print("\n Trying 2 CPUs...\n")
    lin_solver_sklearn_2_cpus = h2o4gpu.LinearRegression(fit_intercept = True, verbose = True, backend = 'sklearn', n_jobs = 2)
    %timeit lin_solver_sklearn_2_cpus.fit(X_train, y_train)
    %memit lin_solver_sklearn_2_cpus.fit(X_train, y_train)
    print("\n Trying 3 CPUs...\n")
    lin_solver_sklearn_3_cpus = h2o4gpu.LinearRegression(fit_intercept = True, verbose = True, backend = 'sklearn', n_jobs = 3)
    %timeit lin_solver_sklearn_3_cpus.fit(X_train, y_train)
    %memit lin_solver_sklearn_3_cpus.fit(X_train, y_train)
    print("\n Trying 4 CPUs...\n")
    lin_solver_sklearn_4_cpus = h2o4gpu.LinearRegression(fit_intercept = True, verbose = True, backend = 'sklearn', n_jobs = 4)
    %timeit lin_solver_sklearn_4_cpus.fit(X_train, y_train)
    %memit lin_solver_sklearn_4_cpus.fit(X_train, y_train)
    print("\n Trying 8 CPUs...\n")
    lin_solver_sklearn_8_cpus = h2o4gpu.LinearRegression(fit_intercept = True, verbose = True, backend = 'sklearn', n_jobs = 8)
    %timeit lin_solver_sklearn_8_cpus.fit(X_train, y_train)
    %memit lin_solver_sklearn_8_cpus.fit(X_train, y_train)
    print("\n Trying 12 CPUs...\n")
    lin_solver_sklearn_12_cpus = h2o4gpu.LinearRegression(fit_intercept = True, verbose = True, backend = 'sklearn', n_jobs = 12)
    %timeit lin_solver_sklearn_12_cpus.fit(X_train, y_train)
    %memit lin_solver_sklearn_12_cpus.fit(X_train, y_train)

DAAL not supported by this Intel CPU™

Trying 1 CPU...

Running sklearn Linear Regression
186 ms ± 2.95 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
peak memory: 1994.23 MiB, increment: 135.14 MiB

 Trying 2 CPUs...

Running sklearn Linear Regression
187 ms ± 3.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
peak memory: 1994.23 MiB, increment: 135.13 MiB

 Trying 3 CPUs...

Running sklearn Linear Regression
185 ms ± 733 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
peak memory: 1994.23 MiB, increment: 135.13 MiB

 Trying 4 CPUs...

Running sklearn Linear Regression
185 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
peak memory: 1994.23 MiB, increment: 135.13 MiB

 Trying 8 CPUs...

Running sklearn Linear Regression
187 ms ± 1.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
peak memory: 1994.23 MiB, increment: 135.13 MiB

 Trying 12 CPUs...

Running sklearn Linear Regression
192 ms ± 7.01 ms per loop (mean ± std. dev. of 7 run

## Predicting the Test set results

In [17]:
y_pred = linear_regressor.predict(X_test)
np.set_printoptions(precision = 2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[123116.13   2400.  ]
 [191892.25 172800.  ]
 [ 65312.42  14400.  ]
 ...
 [ 57670.69 172800.  ]
 [ 54941.66   7200.  ]
 [ 71545.7   21600.  ]]


In [21]:
y_scores_np_array = cp.asnumpy(y_pred)
print(f"y-scores: {y_scores_np_array}")
print(f"Number of y-scores: {len(y_scores_np_array)}")
print(f"Datatype: {type(y_scores_np_array)}")
print(f"3rd y-score: {y_scores_np_array[2]}")
print(f"3rd y-score datatype: {type(y_scores_np_array[2])}")

y-scores: [123116.13 191892.25  65312.42 ...  57670.69  54941.66  71545.7 ]
Number of y-scores: 98399
Datatype: <class 'numpy.ndarray'>
3rd y-score: 65312.42316958308
3rd y-score datatype: <class 'numpy.float64'>


In [22]:
y_true_np_array = cp.asnumpy(y_test)
print(f"y-trues:\n {y_true_np_array}")
print(f"Number of y-trues: {len(y_true_np_array)}")
print(f"Datatype: {type(y_true_np_array)}")
print(f"3rd y-true: {y_true_np_array[2]}")
print(f"3rd y-true datatype: {type(y_true_np_array[2])}")

y-trues:
 [[  2400.]
 [172800.]
 [ 14400.]
 ...
 [172800.]
 [  7200.]
 [ 21600.]]
Number of y-trues: 98399
Datatype: <class 'numpy.ndarray'>
3rd y-true: [14400.]
3rd y-true datatype: <class 'numpy.ndarray'>


In [38]:
print(f"R2 Score [cuml.metrics]: {mean_squared_error(y_true_np_array, y_scores_np_array)}")
print(f"Mean Absolute Error [sklearn.metrics]: {mean_squared_error(y_true_np_array, y_scores_np_array)}")
print(f"Mean Squared Error [sklearn.metrics]: {mean_squared_error(y_true_np_array, y_scores_np_array)}")

R2 Score [cuml.metrics]: 7599787876.979726
Mean Absolute Error [sklearn.metrics]: 7599787876.979726
Mean Squared Error [sklearn.metrics]: 7599787876.979726
