# Multiple Linear Regression

## Importing the libraries

In [1]:
import cudf
import cupy as cp
from cuml.cluster import KMeans
from cuml.preprocessing.model_selection import train_test_split
from cuml.linear_model import LinearRegression
from cuml.metrics.regression import r2_score

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import numpy as np
import numpy.ma as ma
from numpy import *

import pandas as pd

import matplotlib.pyplot as plt

import sys

%reload_ext memory_profiler

### Checking Conda Environment

In [2]:
print(sys.executable)

/home/eos/miniconda2/envs/rapids.ai-cuml/bin/python3


In [3]:
print(sys.version)

3.8.5 | packaged by conda-forge | (default, Sep 16 2020, 18:01:20) 
[GCC 7.5.0]


## Testing rapids/cuml

In [4]:
def np2cudf(df):
    '''converts numpy dataframe to cuda dataframe'''
    df = pd.DataFrame({'fea%d'%i:df[:,i] for i in range(df.shape[1])})
    pdf = cudf.DataFrame()
    
    for c,column in enumerate(df):
        pdf[str(c)] = df[column]
        return pdf

kmeans_float = KMeans(n_clusters=2).fit(np2cudf(np.array([[1.,1.], [1.,4.], [1.,0.]])))
print(kmeans_float.cluster_centers_)

0    1.0
1    1.0
dtype: float64


## Importing the dataset

In [5]:
cuda_df = cudf.read_csv('/home/eos/Downloads/aug.csv')
cuda_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 983991 entries, 0 to 983990
Data columns (total 83 columns):
 #   Column                                            Non-Null Count   Dtype
---  ------                                            --------------   -----
 0   Local Job Id                                      983991 non-null  int64
 1   System Username (Deidentified)                    983991 non-null  object
 2   Shared                                            983991 non-null  int64
 3   Cores                                             983991 non-null  int64
 4   Nodes                                             983991 non-null  int64
 5   Total Cores Available                             983991 non-null  int64
 6   Cpu Time                                          983991 non-null  int64
 7   Node Time                                         983991 non-null  int64
 8   Requested Nodes                                   983991 non-null  int64
 9   Requested Wall Time    

In [6]:
cuda_df.tail()

Unnamed: 0,Local Job Id,System Username (Deidentified),Shared,Cores,Nodes,Total Cores Available,Cpu Time,Node Time,Requested Nodes,Requested Wall Time,...,Net Ib0 Tx Packets,Net Mic0 Rx Packets,Net Mic0 Tx Packets,Net Mic1 Rx Packets,Net Mic1 Tx Packets,Net Eth0 Rx Cov,Net Eth0 Tx Cov,Net Ib0 Rx Cov,Net Ib0 Tx Cov,Parallel filesystem lustre bytes transmitted cov
983986,35595636,81fa471af542d305ac76c24e3979beb60b84371a,0,7,0,0,0,0,0,600.0,...,,,,,,,,,,
983987,35595671,81fa471af542d305ac76c24e3979beb60b84371a,0,7,1,0,4291,613,0,600.0,...,,,,,,,,,,
983988,35595809,16cbafae067d7af77ec005a16e6d8fc03860c08d,0,6,0,0,0,0,0,86400.0,...,,,,,,,,,,
983989,35595821,16cbafae067d7af77ec005a16e6d8fc03860c08d,0,6,1,0,24,4,0,86400.0,...,,,,,,,,,,
983990,35595829,81fa471af542d305ac76c24e3979beb60b84371a,0,7,1,0,4396,628,0,600.0,...,,,,,,,,,,


In [7]:
X_cupy_ndarray = cuda_df.iloc[1:, [2,3,4,5,6,7,8,11,12,14]].values
X_cupy_float_ndarray = X_cupy_ndarray.astype(float)
X_cupy_float_ndarray[0]
print(X_cupy_float_ndarray)
print(type(X_cupy_float_ndarray))

[[0.00000000e+00 6.40000000e+02 3.20000000e+01 ... 3.61330000e+05
  9.10830000e+04 1.59624213e+09]
 [0.00000000e+00 6.40000000e+02 3.20000000e+01 ... 3.66774000e+05
  8.49400000e+04 1.59624214e+09]
 [0.00000000e+00 6.40000000e+02 3.20000000e+01 ... 3.78775000e+05
  7.29350000e+04 1.59624213e+09]
 ...
 [0.00000000e+00 6.00000000e+00 0.00000000e+00 ... 1.99000000e+02
  0.00000000e+00 1.59857146e+09]
 [0.00000000e+00 6.00000000e+00 1.00000000e+00 ... 9.50000000e+01
  4.00000000e+00 1.59857139e+09]
 [0.00000000e+00 7.00000000e+00 1.00000000e+00 ... 4.00000000e+00
  6.28000000e+02 1.59857198e+09]]
<class 'cupy.core.core.ndarray'>


In [9]:
y_cudf = cuda_df.iloc[1:, [9]]
y_float_cudf = y_cudf.astype(float)
print(y_float_cudf)
print(type(y_float_cudf))

        Requested Wall Time
1                  172800.0
2                  259200.0
3                  259200.0
4                  259200.0
5                  259200.0
...                     ...
983986                600.0
983987                600.0
983988              86400.0
983989              86400.0
983990                600.0

[983990 rows x 1 columns]
<class 'cudf.core.dataframe.DataFrame'>


## Preprocessing 

In [10]:
print(f"y[target] has NaNs? {y_float_cudf.isnull().values.any()}")
print(f"Number of NaNs in y[target] >> {y_float_cudf.isnull().values.sum()}")

y[target] has NaNs? True
Number of NaNs in y[target] >> 6


### Replace nan's with mean

In [11]:
y_float_cudf_no_nan = y_float_cudf.fillna(y_float_cudf.mean())

In [12]:
print(f"y[target] has NaNs? {y_float_cudf_no_nan.isnull().values.any()}")
print(f"Number of NaNs in y[target] >> {y_float_cudf_no_nan.isnull().values.sum()}")

y[target] has NaNs? False
Number of NaNs in y[target] >> 0


In [13]:
y_float_no_nan_cupy_ndarry = cp.array(y_float_cudf_no_nan.to_gpu_matrix())
print(y_float_no_nan_cupy_ndarry)
print(type(y_float_no_nan_cupy_ndarry))

[[172800.]
 [259200.]
 [259200.]
 ...
 [ 86400.]
 [ 86400.]
 [   600.]]
<class 'cupy.core.core.ndarray'>


## Splitting the dataset into the Training set and Test set

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_cupy_float_ndarray, y_float_no_nan_cupy_ndarry, train_size = 0.9)
print(X_train[0:3])

[[0.00000000e+00 8.00000000e+00 1.00000000e+00 2.40000000e+01
  1.15312000e+05 1.44140000e+04 0.00000000e+00 0.00000000e+00
  1.44140000e+04 1.59677454e+09]
 [0.00000000e+00 4.80000000e+01 2.00000000e+00 4.80000000e+01
  2.53440000e+04 1.05600000e+03 0.00000000e+00 1.20000000e+01
  5.28000000e+02 1.59729218e+09]
 [0.00000000e+00 5.44000000e+02 2.00000000e+00 5.44000000e+02
  8.70400000e+03 3.20000000e+01 0.00000000e+00 4.10000000e+01
  1.60000000e+01 1.59716857e+09]]


In [15]:
print(y_train[0:3])

[[ 14400.]
 [360000.]
 [  3600.]]


## Training the Multiple Linear Regression model on the Training set

In [16]:
linear_regressor = LinearRegression(fit_intercept = True, normalize = False, algorithm = "eig")
%timeit linear_regressor.fit(X_train, y_train)
%memit linear_regressor.fit(X_train, y_train)

50.2 ms ± 15.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 2903.94 MiB, increment: 1.14 MiB


## Predicting the Test set results

In [17]:
y_pred = linear_regressor.predict(X_test)
np.set_printoptions(precision = 2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[123116.13   2400.  ]
 [191892.25 172800.  ]
 [ 65312.42  14400.  ]
 ...
 [ 57670.69 172800.  ]
 [ 54941.66   7200.  ]
 [ 71545.7   21600.  ]]


In [21]:
y_scores_np_array = cp.asnumpy(y_pred)
print(f"y-scores: {y_scores_np_array}")
print(f"Number of y-scores: {len(y_scores_np_array)}")
print(f"Datatype: {type(y_scores_np_array)}")
print(f"3rd y-score: {y_scores_np_array[2]}")
print(f"3rd y-score datatype: {type(y_scores_np_array[2])}")

y-scores: [123116.13 191892.25  65312.42 ...  57670.69  54941.66  71545.7 ]
Number of y-scores: 98399
Datatype: <class 'numpy.ndarray'>
3rd y-score: 65312.42316958308
3rd y-score datatype: <class 'numpy.float64'>


In [22]:
y_true_np_array = cp.asnumpy(y_test)
print(f"y-trues:\n {y_true_np_array}")
print(f"Number of y-trues: {len(y_true_np_array)}")
print(f"Datatype: {type(y_true_np_array)}")
print(f"3rd y-true: {y_true_np_array[2]}")
print(f"3rd y-true datatype: {type(y_true_np_array[2])}")

y-trues:
 [[  2400.]
 [172800.]
 [ 14400.]
 ...
 [172800.]
 [  7200.]
 [ 21600.]]
Number of y-trues: 98399
Datatype: <class 'numpy.ndarray'>
3rd y-true: [14400.]
3rd y-true datatype: <class 'numpy.ndarray'>


In [38]:
print(f"R2 Score [cuml.metrics]: {mean_squared_error(y_true_np_array, y_scores_np_array)}")
print(f"Mean Absolute Error [sklearn.metrics]: {mean_squared_error(y_true_np_array, y_scores_np_array)}")
print(f"Mean Squared Error [sklearn.metrics]: {mean_squared_error(y_true_np_array, y_scores_np_array)}")

R2 Score [cuml.metrics]: 7599787876.979726
Mean Absolute Error [sklearn.metrics]: 7599787876.979726
Mean Squared Error [sklearn.metrics]: 7599787876.979726
