# Linear Regression

We will try to implement the linear regression method in one day, it will help us to juge the simplicity to implement this and we'll compare it with the other method and with the results obtained.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import sklearn.linear_model as sk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Load the dataset
# ts_data = pd.read_csv('data/dataHistorian.csv', sep=';')
ts_data = pd.read_csv('data/it-data-4metrics.csv', sep=',')

ts_data.head()

Unnamed: 0,metric_id,timestamp,value,metric_name,warn,crit,min,max
0,091c334c-a90a-4d8f-ba75-2c936220cd64,1575157723,13.375,cpu_prct_used,85.0,95.0,,
1,091c334c-a90a-4d8f-ba75-2c936220cd64,1575157423,13.5,cpu_prct_used,85.0,95.0,,
2,091c334c-a90a-4d8f-ba75-2c936220cd64,1575157123,13.375,cpu_prct_used,85.0,95.0,,
3,091c334c-a90a-4d8f-ba75-2c936220cd64,1575156823,13.5,cpu_prct_used,85.0,95.0,,
4,091c334c-a90a-4d8f-ba75-2c936220cd64,1575156523,13.75,cpu_prct_used,85.0,95.0,,


In [3]:
# Keep only the useful data
indexNames = ts_data[ ts_data['value'] <= 1.0 ].index
# Delete these row indexes from dataFrame
ts_data.drop(indexNames , inplace=True)
# Delete the useless columns
for column in ts_data.columns[3:]:
    ts_data.drop(column,1, inplace=True)
    
ts_data.head()

Unnamed: 0,metric_id,timestamp,value
0,091c334c-a90a-4d8f-ba75-2c936220cd64,1575157723,13.375
1,091c334c-a90a-4d8f-ba75-2c936220cd64,1575157423,13.5
2,091c334c-a90a-4d8f-ba75-2c936220cd64,1575157123,13.375
3,091c334c-a90a-4d8f-ba75-2c936220cd64,1575156823,13.5
4,091c334c-a90a-4d8f-ba75-2c936220cd64,1575156523,13.75


In [4]:
# Split the data into dummies
ts_data = pd.get_dummies(ts_data, prefix=['metric_id'], columns=['metric_id'])

ts_data.head()

Unnamed: 0,timestamp,value,metric_id_00f32458-39cf-4361-9abb-34247262b192,metric_id_01a8ad90-cba9-4f31-9c5e-6fa7c44428c3,metric_id_020bbd33-bfbd-4318-abce-f90ebd996a6f,metric_id_03660638-a475-409a-aa05-b6df0c998c94,metric_id_049d35fa-af41-4479-ba7b-55873bfc377e,metric_id_05398140-2e37-4635-ad89-b48dc87fb26e,metric_id_05caa75f-8c3b-4c86-a6a9-4bd227060f38,metric_id_061db7cd-67be-4efc-887c-87fd6f4b2f4a,...,metric_id_ee9b7711-cc03-4335-b3bc-a6fd644ed113,metric_id_eff3da11-6f06-48a4-8187-35fdcded15ec,metric_id_f23c1133-f91e-4a22-8edd-4e74c7df9fbe,metric_id_f2c46030-4a07-4693-86d8-4ff9c8dda8c8,metric_id_f48e4f1d-9056-4ccb-b392-907b0c226f61,metric_id_f5efa804-8b41-40c5-8101-34464c02fe7a,metric_id_f60944d3-4b66-4cdd-8e51-bc96fba215fa,metric_id_f71ffb8a-e342-48ef-99dd-81cb6b806241,metric_id_f943f1e8-4ebd-45b0-95ac-5fb0514e6b15,metric_id_fe897d86-1ea4-4e00-a018-5cd8702e96b3
0,1575157723,13.375,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1575157423,13.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1575157123,13.375,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1575156823,13.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1575156523,13.75,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Split the data into training/validating sets
training_limite = int(863*ts_data.shape[0]/1001)
training = ts_data.loc[:training_limite]

validating = ts_data.loc[training_limite+1:]

total = ts_data.loc[:]

In [6]:
# Split the outcome values from the input dataset
y_train = training['value']
x_train = training.drop(columns=['value'])

y_valid = validating['value']
x_valid = validating.drop(columns=['value'])

y_total = validating['value']
x_total = validating.drop(columns=['value'])

In [7]:
# Create linear regression object
regr = sk.LinearRegression()

# Train the model using the training sets
regr.fit(x_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(x_valid)


# # The coefficients
# print('Coefficients: ', regr.coef_)
# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_valid, y_pred))
#The mean absolute error
print('Mean absolute error: %.2f' % mean_absolute_error(y_valid, y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: {} %'.format(r2_score(y_valid, y_pred)*100))

Mean squared error: 1209915752068.24
Mean absolute error: 330361.74
Coefficient of determination: 1.9663026081465307 %


In [8]:
df = pd.DataFrame({'Actual':y_valid, 'Predicted':y_pred})
df.sort_index()

Unnamed: 0,Actual,Predicted
234479,2094584.00,1.639843e+06
234480,2094008.00,1.639855e+06
234481,1405568.00,1.639867e+06
234482,1405712.00,1.639879e+06
234483,1405568.00,1.639891e+06
...,...,...
482388,1463.60,2.534682e+05
482389,1662.60,2.534803e+05
482390,1529.80,2.534923e+05
482391,1498.60,2.535044e+05
