In [150]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn import preprocessing
from datetime import datetime
from scipy import stats
from sklearn.decomposition import PCA

# Apply seaborn's default theme; color_codes=True remaps matplotlib's shorthand
# colour codes ('b', 'g', 'r', ...) to the seaborn palette.
sns.set(color_codes=True)

# Pandas display: show at most 15 rows; floats get thousands separators and 3 decimals.
pd.set_option('display.max_rows', 15)
pd.set_option('display.float_format', '{:,.3f}'.format)

In [151]:
# Step 1: load the pre-processed dataset from disk.
file_dir = '../data/processed-data/'
data_file = 'normalized_dataset.csv'
dataset_path = f'{file_dir}{data_file}'

normalized_data = pd.read_csv(dataset_path)
normalized_data.shape  # (rows, columns)

(1584, 12)

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the target column ('ltcy') from the predictor columns.
targets = normalized_data['ltcy']
inputs = normalized_data.drop(columns=['ltcy'])

# Standardize the features (mean 0 and std 1). Fitting and transforming are done
# as two explicit steps, and the scaled array is wrapped back into a DataFrame
# that carries the original column names.
scaler = StandardScaler().fit(inputs)
standarized_inputs = pd.DataFrame(scaler.transform(inputs), columns=inputs.columns)

In [152]:
# Summary statistics (count, mean, std, quartiles) of the standardized features.
# TODO(review): re-run from a fresh kernel - the saved output for this cell lists an
# 'ltcy' column and unscaled means, which does not match a frame built by dropping
# 'ltcy' and standardizing.
standarized_inputs.describe()

Unnamed: 0,ltcy,svc_cpu_use,svc_cpu_thr,svc_net_use,svc_disk_use,system_cpu_use,system_cpu_sat,system_net_use,svc_req_size,svc_resp_size,svc_pods,svc_req_rate
count,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0
mean,0.777,1.256,0.623,0.828,0.225,4759.075,17.579,14.493,0.023,0.158,13.646,19.221
std,0.191,0.292,0.255,0.232,0.083,3018.836,11.862,5.392,0.012,0.113,4.475,9.134
min,0.332,0.332,0.0,0.19,0.0,14.165,0.699,2.864,0.0,0.001,2.0,0.2
25%,0.648,1.068,0.436,0.659,0.165,2060.612,6.428,10.329,0.014,0.038,10.0,11.775
50%,0.794,1.288,0.632,0.841,0.224,4814.819,17.424,15.167,0.023,0.163,14.0,20.47
75%,0.927,1.473,0.794,1.017,0.283,7545.077,26.365,18.156,0.031,0.254,18.0,25.98
max,1.217,1.822,1.245,1.29,0.44,9683.82,51.619,24.698,0.055,0.382,21.0,35.11


# Multicollinearity

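Use the Variance Inflation Factor (VIF) from statsmodels. The VIF measures how much the variance of an estimated regression coefficient is inflated by multicollinearity; its square root tells how much larger the coefficient's standard error is compared to the case where there is no multicollinearity between the variables. 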
Conventionally:

$VIF = 1$ means no multicollinearity

$1< VIF < 5$ perfectly okay

$VIF > 10$ unacceptable range (some sources use a stricter cutoff of 6 or 8)

In [153]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Columns whose multicollinearity is assessed.
vif_columns = [
    'svc_cpu_use',
    'svc_cpu_thr',
    'svc_net_use',
    'svc_disk_use',
    'system_cpu_use',
    'system_cpu_sat',
    'system_net_use',
    'svc_req_size',
    'svc_resp_size',
    'svc_pods',
]
variables = standarized_inputs[vif_columns]

# One VIF per column: the helper takes the full feature matrix plus the
# position of the column being evaluated.
feature_matrix = variables.values
vif = pd.DataFrame({
    'features': variables.columns,
    'VIF': [
        variance_inflation_factor(feature_matrix, col_idx)
        for col_idx in range(feature_matrix.shape[1])
    ],
})
vif

Unnamed: 0,features,VIF
0,svc_cpu_use,32.969
1,svc_cpu_thr,10.759
2,svc_net_use,29.844
3,svc_disk_use,14.469
4,system_cpu_use,11.12
5,system_cpu_sat,7.73
6,system_net_use,72.605
7,svc_req_size,16.373
8,svc_resp_size,4.755
9,svc_pods,10.987


VIF values are high. Consider using PCA or applying a feature selection method to reduce dimensionality when using an ML model.

# Correlation Matrix

In [154]:
# Pairwise correlations between all columns, rendered as a heat-mapped table.
corr = standarized_inputs.corr()
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0; an explicit
# format string gives the same 3-decimal rendering on old and new pandas alike.
corr.style.background_gradient(cmap='coolwarm').format('{:.3f}')

Unnamed: 0,ltcy,svc_cpu_use,svc_cpu_thr,svc_net_use,svc_disk_use,system_cpu_use,system_cpu_sat,system_net_use,svc_req_size,svc_resp_size,svc_pods,svc_req_rate
ltcy,1.0,0.389,0.421,0.413,0.39,0.381,0.39,0.532,0.497,0.363,0.022,0.562
svc_cpu_use,0.389,1.0,0.398,0.604,0.545,0.559,0.504,0.748,0.67,0.472,0.285,0.744
svc_cpu_thr,0.421,0.398,1.0,0.454,0.381,0.156,0.238,0.49,0.486,0.322,-0.062,0.51
svc_net_use,0.413,0.604,0.454,1.0,0.527,0.506,0.479,0.752,0.676,0.492,0.257,0.749
svc_disk_use,0.39,0.545,0.381,0.527,1.0,0.452,0.409,0.648,0.589,0.423,0.188,0.654
system_cpu_use,0.381,0.559,0.156,0.506,0.452,1.0,0.73,0.739,0.589,0.418,0.429,0.731
system_cpu_sat,0.39,0.504,0.238,0.479,0.409,0.73,1.0,0.689,0.553,0.396,0.28,0.689
system_net_use,0.532,0.748,0.49,0.752,0.648,0.739,0.689,1.0,0.848,0.616,0.352,0.985
svc_req_size,0.497,0.67,0.486,0.676,0.589,0.589,0.553,0.848,1.0,0.54,0.247,0.855
svc_resp_size,0.363,0.472,0.322,0.492,0.423,0.418,0.396,0.616,0.54,1.0,0.198,0.62


In [155]:
def plot_corr(df, size=10):
    """Draw the correlation matrix of ``df`` as a colour-coded grid.

    Input:
        df: pandas DataFrame whose pairwise column correlations are plotted
        size: width and height of the (square) figure, passed to ``figsize``
    """
    corr_matrix = df.corr()
    labels = corr_matrix.columns
    tick_positions = range(len(labels))

    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr_matrix)

    # One tick per column on both axes, labelled with the column names.
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(labels)
    
#plot_corr(standarized_inputs,20)    

The function `most_highly_correlated()` returns the linear correlation coefficient for each pair of variables in the data set, sorted by decreasing absolute value. This makes it easy to see which pairs of variables are most highly correlated.

In [156]:
def most_highly_correlated(mydataframe, numtoreport):
    """Return the most strongly correlated pairs of columns.

    Parameters:
        mydataframe: pandas DataFrame; correlations are taken from ``.corr()``.
        numtoreport: maximum number of pairs to return.

    Returns:
        DataFrame with columns ``FirstVariable``, ``SecondVariable`` and
        ``Correlation``, sorted by decreasing absolute correlation. Each
        unordered pair of distinct columns appears at most once, so at most
        ``n * (n - 1) / 2`` rows are returned for ``n`` columns.
    """
    # find the correlations
    cormatrix = mydataframe.corr()
    # Keep only the strict upper triangle: this removes the self-correlations on
    # the diagonal and the mirrored duplicates below it. Masked cells become NaN
    # and are dropped, so they can never be reported as zero-correlation rows
    # when numtoreport exceeds the number of unique pairs.
    upper_mask = np.triu(np.ones(cormatrix.shape, dtype=bool), k=1)
    pairs = cormatrix.where(upper_mask).stack().dropna()
    # Rank by magnitude (sign is kept in the output); mergesort is stable, which
    # keeps the order of tied pairs deterministic.
    ranking = pairs.abs().sort_values(ascending=False, kind='mergesort').index
    ranked = pairs.reindex(ranking).reset_index()
    # assign human-friendly names
    ranked.columns = ["FirstVariable", "SecondVariable", "Correlation"]
    return ranked.head(numtoreport)

# Take the 30 strongest pairs, then keep only those whose correlation magnitude
# exceeds 0.5.
mcdf = most_highly_correlated(standarized_inputs, 30)
# Compare on the absolute value: the ranking is by magnitude, so a plain
# `> 0.5` would silently discard strong negative correlations.
mcdf = mcdf[mcdf.Correlation.abs() > 0.5]

mcdf

Unnamed: 0,FirstVariable,SecondVariable,Correlation
0,system_net_use,svc_req_rate,0.985
1,svc_req_size,svc_req_rate,0.855
2,system_net_use,svc_req_size,0.848
3,svc_net_use,system_net_use,0.752
4,svc_net_use,svc_req_rate,0.749
...,...,...,...
25,svc_req_size,svc_resp_size,0.540
26,ltcy,system_net_use,0.532
27,svc_net_use,svc_disk_use,0.527
28,svc_cpu_thr,svc_req_rate,0.510


# Feature Selection

## SelectFromModel Meta-Transformer Method
sklearn doc: https://scikit-learn.org/stable/modules/feature_selection.html

example with regression case: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_select_from_model_boston.html


In [168]:
# using SelectFromModel metatransformer to select features
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# The data is already scaled, so no extra normalization is requested. The
# `normalize` argument (previously passed as False, its default) was deprecated
# in scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
clf = LassoCV(random_state=365, cv=10, verbose=0)

# threshold=-np.inf disables the importance cutoff, so the selection is driven
# by max_features alone and exactly 3 features are kept.
# Alternative: SelectFromModel(clf, threshold=0.001) selects by cutoff instead.
sfm = SelectFromModel(clf, threshold=-np.inf, max_features=3)
sfm.fit(standarized_inputs, targets)

transformed_inputs = sfm.transform(standarized_inputs)

print('shape of transformed inputs %', transformed_inputs.shape)

transformed_inputs


shape of inputs % (1584, 11)
shape of standardized inputs % (1584, 11)
shape of transformed inputs % (1584, 3)


array([[-1.14575563, -1.48543676, -1.99771431],
       [-1.27010889, -1.48543676, -1.92762278],
       [-1.20637492, -1.48543676, -1.8662927 ],
       ...,
       [-0.52429145, -0.36786678,  1.60871363],
       [ 1.88967803,  0.3026752 ,  1.54300283],
       [ 1.97767941,  0.3026752 ,  1.60871363]])

In [169]:
# Map the selector's chosen column positions back to feature names.
indices = sfm.get_support(indices=True)
all_features = inputs.columns

selected_features = [all_features[position] for position in indices]

selected_features


['svc_cpu_thr', 'svc_pods', 'svc_req_rate']