In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import seaborn as sns
from sklearn import preprocessing
from datetime import datetime
from scipy import stats
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Plot styling: seaborn theme defaults, with matplotlib's shorthand colour
# codes ('b', 'g', 'r', ...) mapped onto the seaborn palette.
sns.set(color_codes=True)

# Table display: truncate long frames to 15 rows and render floats with a
# thousands separator and three decimals.
pd.set_option('display.max_rows', 15)
pd.set_option('display.float_format', '{:,.3f}'.format)

In [28]:
# 1- read the processed data file
# Several processed variants of the dataset share one directory; only the
# normalized variant is loaded here.
file_dir = '../data/processed-data/'

non_normalized_data_file = 'missing_outliers_processed_dataset.csv'
normalized_data_file = 'normalized_dataset.csv'
factored_data_file = 'factored_dataset.csv'

data_path = file_dir + normalized_data_file
data = pd.read_csv(data_path)

In [29]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(5)

Unnamed: 0,ltcy,svc_cpu_use,svc_cpu_thr,svc_net_use,svc_disk_use,system_cpu_use,system_cpu_sat,system_net_use,svc_req_size,svc_resp_size,svc_pods,svc_req_rate
0,0.332,0.557,0.332,0.325,0.037,16.982,1.591,3.206,0.002,0.012,7.0,0.98
1,0.4,0.616,0.3,0.351,0.047,20.583,1.608,3.552,0.003,0.019,7.0,1.62
2,0.469,0.608,0.316,0.362,0.039,19.448,1.39,3.586,0.003,0.024,7.0,2.18
3,0.49,0.624,0.3,0.362,0.108,17.319,1.73,3.512,0.003,0.022,6.0,2.13
4,0.5,0.608,0.316,0.374,0.133,16.65,1.917,3.449,0.003,0.023,3.0,2.22


## Linear Support Vector Regression (LinearSVR) Model

In [30]:
# Separate the latency target ('ltcy') from the predictor columns.
inputs = data.drop(columns=['ltcy'])
targets = data['ltcy']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    inputs, targets, test_size=0.2, random_state=365
)

# For better convergence, rescale the target by its maximum so the largest
# value becomes 1.
# NOTE(review): the maximum is taken over train and test rows together.
maxltcy = targets.max()
y_train, y_test = y_train / maxltcy, y_test / maxltcy



In [31]:
# Step 1: standardize the features so that the Lasso selector and the SVR
# both see inputs on a common scale.
scaler = StandardScaler()

# Step 2: feature selection with the SelectFromModel meta-transformer.
# A cross-validated Lasso is fitted and only features whose absolute
# coefficient is at least 0.01 are kept.
clf = LassoCV()
sfm = SelectFromModel(clf, threshold=0.01)

# Step 3: linear support vector regressor. The seed is fixed (same value as
# the train/test split) so repeated runs give the same fit.
svr = LinearSVR(C=0.1, verbose=1, random_state=365)

pline = Pipeline([
    ('scaler', scaler),
    ('feature_selection', sfm),
    ('regression', svr)
])

# 10-fold cross-validation on the training split. With no `scoring` argument
# the pipeline's own score (R^2 for a regressor) is used. Scaling and feature
# selection are re-fitted inside each fold because they are pipeline steps.
kfold = KFold(n_splits=10)
results = cross_val_score(pline, x_train, y_train, cv=kfold)
print("Cross-validated R^2: %.3f (+/- %.3f)" % (results.mean(), results.std()))

print("Done..!")

# Fit the final pipeline on the full training split. `fit` returns the
# pipeline itself, not predictions, so the result is deliberately not bound
# to `y_hat`; the trailing semicolon suppresses the estimator repr.
pline.fit(x_train, y_train);

NameError: name 'pipeline' is not defined

In [None]:
# In-sample predictions of the pipeline on the training split.
y_hat = pline.predict(x_train)

In [None]:
# Actual vs. predicted latency on the training split.
fig, ax = plt.subplots()
ax.scatter(y_train, y_hat)
ax.set_xlabel('Latency', size=18)
ax.set_ylabel("predicted Latency", size=18)
plt.show()

In [None]:
# Distribution of the training residuals (actual minus predicted).
residuals = y_train - y_hat

if hasattr(sns, 'histplot'):
    # seaborn >= 0.11: `distplot` is deprecated; a density-normalised
    # histogram with a KDE overlay is its documented replacement.
    ax = sns.histplot(residuals, kde=True, stat='density')
else:
    # Older seaborn releases only provide `distplot`.
    ax = sns.distplot(residuals)

ax.set_title('Residual PDF', size=18);

In [None]:
# R-squared of the fitted pipeline, evaluated on the training split
# (an in-sample score, not a measure of generalisation).
pline.score(x_train, y_train)

In [None]:
# Intercept learned by the pipeline's final 'regression' step.
pline.named_steps['regression'].intercept_

In [None]:
# Weights learned by the pipeline's final 'regression' step, one per
# feature that survived the selection step.
pline.named_steps['regression'].coef_

In [None]:
# Summarise the learned weight of every feature kept by the selection step.
selector = pline.named_steps.feature_selection

# Boolean mask over the input columns: True where SelectFromModel kept the
# feature. Using the mask avoids pushing the raw DataFrame through the
# fitted selector just to count columns.
selected_mask = selector.get_support()
selected_features = inputs.values[:, selected_mask]
number_of_features = selected_features.shape[1]

# Label each weight with its real column name. The scaler keeps column order
# and the selector keeps the order of the surviving columns, so the i-th
# selected column lines up with the i-th regression coefficient.
features = list(inputs.columns[selected_mask])

reg_summary = pd.DataFrame(features, columns=['features'])
reg_summary['weights'] = pline.named_steps.regression.coef_
reg_summary

In [None]:
# Absolute error on the training split, in the scaled target units
# (targets were divided by maxltcy before fitting).
train_error = np.abs(y_train - y_hat)
mean_error = np.mean(train_error)
min_error = np.min(train_error)
max_error = np.max(train_error)
std_error = np.std(train_error)

# The reported statistic is the mean of absolute errors, i.e. MAE (not MSE).
print("Model Result (mean and std of error): %.2f (%.2f) MAE" % (mean_error, std_error))

### Testing

In [None]:
# Predictions of the fitted pipeline on the held-out test split.
y_hat_test = pline.predict(x_test)

In [None]:
# Actual vs. predicted latency on the held-out test split; points are drawn
# semi-transparent so dense regions remain readable.
plt.scatter(y_test, y_hat_test, alpha=0.2)
plt.xlabel('Latency', size=18)
plt.ylabel('Predicted Latency', size=18)
# `plt.show` must be called; the bare attribute only echoes the function repr.
plt.show()

In [None]:
# Bring predictions and targets back to the units of data['ltcy'] by undoing
# the only transform applied to the target in this notebook: the division by
# maxltcy. (The target was never log-transformed here, so np.exp is not the
# inverse transform.)
# NOTE(review): if 'ltcy' was log-transformed upstream of the processed CSV,
# that inverse has to be applied on top of this rescaling - confirm.
perf = pd.DataFrame(y_hat_test * maxltcy, columns=['prediction'])

# Drop the shuffled row labels left by train_test_split so the targets align
# positionally with the 0..n-1 index of the prediction frame.
y_test = y_test.reset_index(drop=True)
perf['target'] = y_test * maxltcy
perf.head()

In [None]:
# Test-set absolute percentage error per row: the signed residual
# (prediction - target) and its magnitude relative to the target, in percent.
residual = perf['prediction'].sub(perf['target'])
perf['residual'] = residual
perf['difference%'] = residual.mul(100).div(perf['target']).abs()
perf

In [None]:
# Summary statistics for the columns of the test-set performance table.
perf.describe()

In [None]:
# Show the performance table in full (up to 999 rows) with two-decimal
# floats, ordered from the smallest to the largest percentage difference.
pd.set_option('display.max_rows', 999)
pd.options.display.float_format = lambda value: '%.2f' % value
perf.sort_values('difference%')