# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy.ma as ma
from numpy import *
from daal4py import daalinit, daalfini, kmeans_init
%load_ext memory_profiler

## Testing daal4py

In [2]:
# Smoke test for the daal4py installation: run K-Means++ centroid
# initialisation on a tiny 3x2 matrix in distributed mode.
# Note: more centroids (10) are requested than there are rows (3).
X = np.array([[1., 1.], [1., 4.], [1., 0.]])
daalinit()
try:
    result = kmeans_init(10, method="plusPlusDense", distributed=True).compute(X)
finally:
    # Always call daalfini(), even if compute() raises, so the engine
    # started by daalinit() is not left running.
    daalfini()
print(result.centroids)

[[1. 0.]
 [1. 4.]
 [1. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]


## Importing the dataset

In [3]:
# Load the raw JSON dataset into a DataFrame.
json_path = '/home/s.chakravarty/dataset/jun_jul.json'
dataset = pd.read_json(json_path)

In [5]:
# Preview the first six rows; row 0 carries the column titles.
dataset.head(6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,73,74,75,76,77,78,79,80,81,82
0,Local Job Id,System Username (Deidentified),Shared,Cores,Nodes,Total Cores Available,Cpu Time,Node Time,Requested Nodes,Requested Wall Time,...,Net Ib0 Tx Packets,Net Mic0 Rx Packets,Net Mic0 Tx Packets,Net Mic1 Rx Packets,Net Mic1 Tx Packets,Net Eth0 Rx Cov,Net Eth0 Tx Cov,Net Ib0 Rx Cov,Net Ib0 Tx Cov,Parallel filesystem lustre bytes transmitted cov
1,563972,de39bdff0287b1df2046dd547f548b6ca49fce7e,0,8,1,8,2037440,254680,1,259200,...,906405393.4619594,2167.0031521339597,1040471.5134752289,,,,,0,0,0
2,563973,de39bdff0287b1df2046dd547f548b6ca49fce7e,0,8,1,8,2037440,254680,1,259200,...,906407273.1810366,2187.994318070176,1040498.2979645049,,,,,0,0,0
3,563986,de39bdff0287b1df2046dd547f548b6ca49fce7e,0,8,1,8,1998016,249752,1,259200,...,153256382.47338018,4341.010295438663,1015351.408077252,,,,,0,0,0
4,563988,de39bdff0287b1df2046dd547f548b6ca49fce7e,0,8,1,8,1997696,249712,1,259200,...,26235245.528879013,1981.0000399351834,1012809.0204173211,,,,,0,0,0
5,564043,d2ee7fd8efea767364c09cc1b758ee4a30e9db29,0,20,1,20,5184620,259231,1,259200,...,5739978.871336916,76541.91827502515,1105871.8192437731,76587.91822591031,1105916.8191957262,,,0,0,0


In [6]:
# Feature matrix: every row after the first (row 0 of the frame holds the
# column titles, as the preview shows) and ten columns selected by position.
X = dataset.iloc[1:, [2, 3, 4, 5, 6, 7, 8, 11, 12, 14]].values
# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so cast with the builtin instead.
X_float = X.astype(float)
X_float[0]

array([0.00000000e+00, 8.00000000e+00, 1.00000000e+00, 8.00000000e+00,
       2.03744000e+06, 2.54680000e+05, 1.00000000e+00, 2.20000000e+01,
       2.54680000e+05, 1.59102992e+09])

In [7]:
# Target vector: column 9 ("Requested Wall Time" in the title row), skipping
# row 0 because it holds the column titles rather than data.
y = dataset.iloc[1:, 9].values
# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so cast with the builtin instead.
y_float = y.astype(float)
y_float[0]

259200.0

## Preprocessing 

### Compute the mean of the target column while ignoring NaNs (`np.nanmean` is convenient)

In [8]:
# Mean of the target values, with NaN entries left out of the average.
col_mean = np.nanmean(a=y_float, axis=0)
col_mean

65344.54648279556

In [9]:
# Replace missing target values with 0, modifying y_float in place.
# NOTE(review): `col_mean` is computed in the previous cell but is not used
# here -- confirm whether mean imputation was intended instead of 0.
# The qualified `np.isnan` is used instead of the bare `isnan` so this cell
# does not depend on the `from numpy import *` wildcard import.
where_are_NaNs = np.isnan(y_float)
y_float[where_are_NaNs] = 0

In [10]:
# Sanity check: a NaN anywhere in the array propagates into the sum, so a
# non-NaN total means no NaN is left in y_float.
array_sum = y_float.sum()
array_has_nan = np.isnan(array_sum)
print(array_has_nan)

False


## Splitting the dataset into the Training set and Test set

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
(X_train, X_test, y_train, y_test) = train_test_split(
    X_float,
    y_float,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [12]:
# Linear regression model with scikit-learn's default settings.
regressor = LinearRegression()
# Benchmark the fit: %timeit reports timing over repeated runs and %memit
# (loaded from memory_profiler) reports peak memory. Each run calls fit()
# on the same `regressor` object, so it ends up fitted on the training set.
%timeit regressor.fit(X_train, y_train)
%memit regressor.fit(X_train, y_train)

556 ms ± 1.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
peak memory: 9253.96 MiB, increment: 0.07 MiB


## Predicting the Test set results

In [13]:
# Predict on the held-out rows, then print predictions (left column) next to
# the true targets (right column), shown with two decimals.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
comparison = np.column_stack((y_pred, y_test))
print(comparison)

[[ 57702.81  21600.  ]
 [ 39707.08   7200.  ]
 [251800.45 172800.  ]
 ...
 [ 44870.55  86400.  ]
 [ 82462.76 172800.  ]
 [ 93261.41 172800.  ]]
