# SLICENET QoE  from QoS Estimator

## Overview:
The POC experimental setup derives application-level QoE metrics by monitoring the WordPress client’s response time with and without background stress. On one ICP, JMeter acts as the client: it fetches WordPress pages and records the response times in its InfluxDB. On another ICP, where the WordPress service resides, Skydive collects QoS metrics (i.e. flows) related to the WordPress service and stores them in Elasticsearch. [Cognetive's network_stresser](https://github.com/cognetive/network_stresser/tree/master/tests/skydive_tests) is used to generate the background stress.




In [53]:
#!pip install cufflinks==0.8.2 --user

In [54]:
import pandas as pd
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
#import cufflinks as cf
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ResourceWarning)


In [55]:
from sklearn.model_selection        import train_test_split
from sklearn                        import metrics, svm
from sklearn.linear_model           import LinearRegression
from sklearn.linear_model           import LogisticRegression
from sklearn.tree                   import DecisionTreeClassifier
from sklearn.neighbors              import KNeighborsClassifier
from sklearn.discriminant_analysis  import LinearDiscriminantAnalysis
from sklearn.naive_bayes            import GaussianNB
from sklearn.svm                    import SVC
#from sklearn.linear_model           import Ridge
#from sklearn.linear_model           import Lasso
from sklearn.neural_network         import MLPClassifier
from sklearn.ensemble               import RandomForestClassifier
from sklearn.metrics                import f1_score
from sklearn.metrics                import accuracy_score
from sklearn.metrics                import precision_score
from sklearn.metrics                import recall_score


In [56]:
# Initialise plotly's offline notebook mode once, before any of the iplot()
# calls made by the plotting helpers below.
init_notebook_mode(connected=True)

In [57]:
# The code was removed by Watson Studio for sharing.

In [58]:
def plotJmeter(df,title):
    """Scatter-plot JMeter response times over time, one trace per transaction.

    Parameters
    ----------
    df : DataFrame with 'transaction', 'avg' and 'time' columns.
    title : figure title.

    The figure is rendered with ``iplot``; nothing is returned.
    """
    def _trace_for(transaction):
        # Select the rows of this transaction once and reuse them for x and y.
        rows = df[df['transaction'] == transaction]
        return go.Scatter(
            x=rows['time'],
            y=rows['avg'],
            mode='markers',
            connectgaps=True,
            name=transaction,
        )

    scatter_traces = [_trace_for(t) for t in df['transaction'].unique()]

    time_axis = dict(title="time")
    response_axis = dict(title="Response Time (ms)")
    figure = go.Figure(
        data=scatter_traces,
        layout=go.Layout(
            barmode='group',
            title=title,
            xaxis=time_axis,
            yaxis=response_axis,
        ),
    )
    iplot(figure)

In [59]:
def plotJmeter2(df1,df2,title):
    """Overlay JMeter response times of two runs in one scatter plot.

    Parameters
    ----------
    df1 : DataFrame ('transaction', 'avg', 'time'); traces are labelled
        "with no background stress".
    df2 : DataFrame with the same columns; traces are labelled
        "with background stress".
    title : figure title.

    One trace is drawn per transaction of each frame (df1 first, then df2).
    The figure is rendered with ``iplot``; nothing is returned.
    """
    runs = (
        (df1, "with no background stress"),
        (df2, "with background stress"),
    )

    scatter_traces = []
    for frame, condition in runs:
        for transaction in frame['transaction'].unique():
            rows = frame[frame['transaction'] == transaction]
            scatter_traces.append(
                go.Scatter(
                    x=rows['time'],
                    y=rows['avg'],
                    mode='markers',
                    connectgaps=True,
                    name=transaction + " " + condition,
                )
            )

    figure = go.Figure(
        data=scatter_traces,
        layout=go.Layout(
            barmode='group',
            title=title,
            xaxis=dict(title="time"),
            yaxis=dict(title="Response Time (ms)"),
        ),
    )
    iplot(figure)

In [60]:
def plotSkydiveFlows2(df1,df2,title,y_feature,y_title):
    """Overlay one Skydive flow metric of two runs in a single scatter plot.

    Parameters
    ----------
    df1 : flows DataFrame; traces are labelled "with no background stress".
    df2 : flows DataFrame; traces are labelled "with background stress".
    title : figure title.
    y_feature : name of the column plotted on the y axis.
    y_title : y-axis title.

    Both frames need '_source.NodeTID' (one trace per distinct value) and
    '_source.Metric.Last' (x axis). The figure is rendered with ``iplot``.
    """
    runs = (
        (df1, " with no background stress"),
        (df2, " with background stress"),
    )

    scatter_traces = []
    for frame, suffix in runs:
        for node in frame['_source.NodeTID'].unique():
            rows = frame[frame['_source.NodeTID'] == node]
            scatter_traces.append(
                go.Scatter(
                    x=rows['_source.Metric.Last'],
                    y=rows[y_feature],
                    mode='markers',
                    name=node + suffix,
                )
            )

    figure = go.Figure(
        data=scatter_traces,
        layout=go.Layout(
            barmode='group',
            title=title,
            xaxis=dict(title="time"),
            yaxis=dict(title=y_title),
        ),
    )
    iplot(figure)

In [61]:
def plotSkydiveFlows(df,title,y_feature,y_title):
    """Scatter-plot one Skydive flow metric over time, one trace per node.

    Parameters
    ----------
    df : flows DataFrame with '_source.NodeTID' and '_source.Metric.Last'.
    title : figure title.
    y_feature : name of the column plotted on the y axis.
    y_title : y-axis title.

    The figure is rendered with ``iplot``; nothing is returned.
    """
    def _trace_for(node):
        rows = df[df['_source.NodeTID'] == node]
        return go.Scatter(
            x=rows['_source.Metric.Last'],
            y=rows[y_feature],
            mode='markers',
            name=node,
        )

    scatter_traces = [_trace_for(node) for node in df['_source.NodeTID'].unique()]

    figure = go.Figure(
        data=scatter_traces,
        layout=go.Layout(
            barmode='group',
            title=title,
            xaxis=dict(title="time"),
            yaxis=dict(title=y_title),
        ),
    )
    iplot(figure)

In [62]:
def plot4(df1,df2,df3,df4,title):
    """Plot JMeter response time (top panel) above Skydive RTT (bottom panel).

    Parameters
    ----------
    df1 : JMeter DataFrame ('transaction', 'avg', 'time'), no background stress.
    df2 : JMeter DataFrame, with background stress.
    df3 : Skydive flows DataFrame ('_source.NodeTID', '_source.Metric.RTT',
        '_source.Metric.Last'), no background stress.
    df4 : Skydive flows DataFrame, with background stress.
    title : figure title.

    The JMeter traces use the default axes (upper 45% of the figure); the
    Skydive traces use the x2/y2 axes (lower 45%). The legend is hidden and
    "Without Stress" / "With Stress" annotations are placed on each panel.
    The figure is rendered with ``iplot``; nothing is returned.
    """
    traces = []

    def _add_traces(frame, key, y_col, x_col, suffix, **extra):
        # One marker trace per distinct value of `key`; rows are selected once.
        for value in frame[key].unique():
            rows = frame[frame[key] == value]
            traces.append(
                go.Scatter(
                    y=rows[y_col],
                    x=rows[x_col],
                    mode='markers',
                    connectgaps=True,
                    name=value + suffix,
                    **extra
                )
            )

    _add_traces(df1, 'transaction', 'avg', 'time', " with no background stress",
                text=["with no background stress"], textposition='top right')
    _add_traces(df2, 'transaction', 'avg', 'time', " with background stress")
    _add_traces(df3, '_source.NodeTID', '_source.Metric.RTT', '_source.Metric.Last',
                " with no background stress", xaxis='x2', yaxis='y2')
    _add_traces(df4, '_source.NodeTID', '_source.Metric.RTT', '_source.Metric.Last',
                " with background stress", xaxis='x2', yaxis='y2')

    # Paper-relative positions of the per-panel condition labels.
    annotations = [
        dict(x=x, y=y, showarrow=False, text=text, xref="paper", yref="paper")
        for x, y, text in (
            (0.0951769406393, 1, "Without Stress"),
            (0.8, 1, "With Stress"),
            (0.0951769406393, 0.36972670892, "Without Stress"),
            (0.8, 0.36972670892, "With Stress"),
        )
    ]

    # A first single-panel layout used to be built here and was immediately
    # overwritten; only the two-panel layout ever took effect.
    # NOTE(review): the lower panel is titled "RTT (QoE)" although RTT comes
    # from the Skydive flow data - confirm whether "QoS" was intended.
    layout = go.Layout(
        title=title,
        xaxis=dict(domain=[0, 1]),
        yaxis=dict(title="response time (QoE)", domain=[0.55, 1]),
        xaxis2=dict(domain=[0, 1], anchor='y2'),
        yaxis2=dict(title="RTT (QoE)", domain=[0, 0.45]),
        showlegend=False,
        annotations=annotations,
    )

    fig = go.Figure(data=traces, layout=layout)
    iplot(fig)

In [63]:
def plotQoEQoS(df1,df2,df3,df4,y_feature,y_title1,y_title2,title):
    """Plot JMeter response time (top panel) above a Skydive metric (bottom panel).

    Parameters
    ----------
    df1 : JMeter DataFrame ('transaction', 'avg', 'time'), no background stress.
    df2 : JMeter DataFrame, with background stress.
    df3 : Skydive flows DataFrame ('_source.NodeTID', '_source.Metric.Last'
        and the `y_feature` column), no background stress.
    df4 : Skydive flows DataFrame, with background stress.
    y_feature : flows column plotted on the bottom panel's y axis.
    y_title1 : y-axis title of the top (JMeter) panel.
    y_title2 : y-axis title of the bottom (Skydive) panel.
    title : figure title.

    The JMeter traces use the default axes (upper 45% of the figure); the
    Skydive traces use the x2/y2 axes (lower 45%). The legend is hidden and
    "Without Stress" / "With Stress" annotations are placed on each panel.
    The figure is rendered with ``iplot``; nothing is returned.
    """
    traces = []

    def _add_traces(frame, key, y_col, x_col, suffix, **extra):
        # One marker trace per distinct value of `key`; rows are selected once.
        for value in frame[key].unique():
            rows = frame[frame[key] == value]
            traces.append(
                go.Scatter(
                    y=rows[y_col],
                    x=rows[x_col],
                    mode='markers',
                    connectgaps=True,
                    name=value + suffix,
                    **extra
                )
            )

    _add_traces(df1, 'transaction', 'avg', 'time', " with no background stress",
                text=["with no background stress"], textposition='top right')
    _add_traces(df2, 'transaction', 'avg', 'time', " with background stress")
    _add_traces(df3, '_source.NodeTID', y_feature, '_source.Metric.Last',
                " with no background stress", xaxis='x2', yaxis='y2')
    _add_traces(df4, '_source.NodeTID', y_feature, '_source.Metric.Last',
                " with background stress", xaxis='x2', yaxis='y2')

    # Paper-relative positions of the per-panel condition labels.
    annotations = [
        dict(x=x, y=y, showarrow=False, text=text, xref="paper", yref="paper")
        for x, y, text in (
            (0.0951769406393, 1, "Without Stress"),
            (0.8, 1, "With Stress"),
            (0.0951769406393, 0.36972670892, "Without Stress"),
            (0.8, 0.36972670892, "With Stress"),
        )
    ]

    # A first single-panel layout used to be built here and was immediately
    # overwritten; only the two-panel layout ever took effect.
    layout = go.Layout(
        title=title,
        xaxis=dict(domain=[0, 1]),
        yaxis=dict(title=y_title1, domain=[0.55, 1]),
        xaxis2=dict(domain=[0, 1], anchor='y2'),
        yaxis2=dict(title=y_title2, domain=[0, 0.45]),
        showlegend=False,
        annotations=annotations,
    )

    fig = go.Figure(data=traces, layout=layout)
    iplot(fig)

In [64]:
def plot6(df1,df2,df3,df4,df5,df6,title):
    """Plot JMeter response time, Skydive RTT and Skydive time-to-FIN together.

    Parameters
    ----------
    df1, df2 : JMeter DataFrames ('transaction', 'avg', 'time') without / with
        background stress; drawn on the default axes.
    df3, df4 : Skydive flows DataFrames ('_source.NodeTID',
        '_source.Metric.RTT', '_source.Metric.Last') without / with
        background stress; drawn on the x2/y2 axes.
    df5, df6 : Skydive flows DataFrames ('_source.NodeTID', 'time_to_FIN',
        '_source.Metric.Last') without / with background stress; drawn on the
        x3/y3 axes.
    title : figure title.

    The legend is hidden. The figure is rendered with ``iplot``; nothing is
    returned.
    """
    traces = []

    def _add_traces(frame, key, y_col, x_col, suffix, **extra):
        # One marker trace per distinct value of `key`; rows are selected once.
        for value in frame[key].unique():
            rows = frame[frame[key] == value]
            traces.append(
                go.Scatter(
                    y=rows[y_col],
                    x=rows[x_col],
                    mode='markers',
                    connectgaps=True,
                    name=value + suffix,
                    **extra
                )
            )

    _add_traces(df1, 'transaction', 'avg', 'time', " with no background stress",
                text=["with no background stress"], textposition='top right')
    _add_traces(df2, 'transaction', 'avg', 'time', " with background stress")
    _add_traces(df3, '_source.NodeTID', '_source.Metric.RTT', '_source.Metric.Last',
                " with no background stress", xaxis='x2', yaxis='y2')
    _add_traces(df4, '_source.NodeTID', '_source.Metric.RTT', '_source.Metric.Last',
                " with background stress", xaxis='x2', yaxis='y2')
    _add_traces(df5, '_source.NodeTID', 'time_to_FIN', '_source.Metric.Last',
                " with no background stress", xaxis='x3', yaxis='y3')
    _add_traces(df6, '_source.NodeTID', 'time_to_FIN', '_source.Metric.Last',
                " with background stress", xaxis='x3', yaxis='y3')

    # Paper-relative positions of the condition labels.
    annotations = [
        dict(x=x, y=y, showarrow=False, text=text, xref="paper", yref="paper")
        for x, y, text in (
            (0.0951769406393, 1, "Without Stress"),
            (0.8, 1, "With Stress"),
            (0.0951769406393, 0.36972670892, "Without Stress"),
            (0.8, 0.36972670892, "With Stress"),
        )
    ]

    # A first single-panel layout used to be built here and was immediately
    # overwritten; only the layout below ever took effect.
    # NOTE(review): the axis domains are kept exactly as they were, but they
    # look unfinished - `yaxis` has the zero-height domain [0, 0], `xaxis` is
    # anchored to 'y2', and the y2/y3 domains overlap. Confirm the intended
    # three-panel arrangement before relying on this figure.
    layout = go.Layout(
        title=title,
        xaxis=dict(domain=[0, 1], anchor='y2'),
        yaxis=dict(title="response time (QoE)", domain=[0, 0]),
        xaxis2=dict(domain=[0.55, 1], anchor='y2'),
        yaxis2=dict(title="RTT (QoE)", domain=[0, 0.45]),
        xaxis3=dict(domain=[0.55, 1], anchor='y3'),
        yaxis3=dict(title="Time to FIN (QoE)", domain=[0, 1]),
        showlegend=False,
        annotations=annotations,
    )

    fig = go.Figure(data=traces, layout=layout)
    iplot(fig)

In [65]:
def plotModelEstimates(actual,prediction,xTitle,yTitle,title,names):
    """Draw actual vs. predicted values as grouped bars with reference lines.

    Parameters
    ----------
    actual : sequence of actual values; its length defines the x positions
        0..len(actual)-1.
    prediction : sequence of predicted values plotted at the same positions.
    xTitle, yTitle : axis titles.
    title : figure title.
    names : non-empty sequence giving the y positions of the horizontal
        dash-dot reference lines. With more than two entries, a green line is
        drawn at names[0] and a red one at names[1]; otherwise a single red
        line is drawn at names[0].

    The figure is rendered with ``iplot``; nothing is returned.
    """
    xList = list(range(len(actual)))

    def _bar(values, label):
        # 'v' is the valid plotly value for vertical bars; the previous
        # 'vertical' is rejected by plotly versions that validate properties.
        return go.Bar(y=values, x=xList, name=label, orientation='v')

    def _reference_line(level, color):
        # Horizontal line spanning slightly beyond the first and last bar.
        return {
            'type': 'line',
            'x0': -1,
            'y0': level,
            'x1': len(xList),
            'y1': level,
            'line': {
                'color': color,
                'width': 4,
                'dash': 'dashdot',
            },
        }

    data = [_bar(actual, 'actual'), _bar(prediction, 'prediction')]

    if len(names) > 2:
        shapes = [
            _reference_line(names[0], 'rgb(50, 171, 96)'),
            _reference_line(names[1], 'rgb(255,0,0)'),
        ]
    else:
        shapes = [_reference_line(names[0], 'rgb(255,0,0)')]

    layout = go.Layout(
        barmode='group',
        title=title,
        xaxis=dict(
            title=xTitle,
            tickvals=xList,
        ),
        yaxis=dict(
            title=yTitle,
        ),
        shapes=shapes,
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [66]:
def plot4MsWithThreshholds(df,xFeature,yFeature,xTitle,yTitle,title,low,high):
    """Overlay max/median/mean/min bars of a metric and mark threshold lines.

    Parameters
    ----------
    df : aggregated DataFrame; ``df[yFeature]`` must expose 'min', 'mean',
        'median' and 'max' (presumably two-level columns from a groupby/agg -
        verify against the caller).
    xFeature : column whose unique values are the x positions and tick values.
    yFeature : metric whose four statistics are plotted.
    xTitle, yTitle : axis titles.
    title : figure title.
    low, high : y positions of the threshold lines. When ``low <= high`` a
        green line is drawn at `low` and a red line at `high`; otherwise only
        the red line at `high` is drawn.

    The figure is rendered with ``iplot``; nothing is returned.
    """
    # The x positions are the distinct xFeature values. An earlier
    # range(len(...)) assignment was overwritten right away and is gone.
    xList = list(df[xFeature].unique())

    def _bar(stat):
        # 'v' is the valid plotly value for vertical bars; the previous
        # 'vertical' is rejected by plotly versions that validate properties.
        return go.Bar(y=df[yFeature][stat], x=xList, name=stat, orientation='v')

    def _threshold_line(level, color):
        return {
            'type': 'line',
            'x0': -1,
            'y0': level,
            'x1': len(xList),
            'y1': level,
            'line': {
                'color': color,
                'width': 4,
                'dash': 'dashdot',
            },
        }

    # Largest statistic first so that the smaller overlaid bars stay visible.
    data = [_bar(stat) for stat in ('max', 'median', 'mean', 'min')]

    if low <= high:
        shapes = [
            _threshold_line(low, 'rgb(50, 171, 96)'),
            _threshold_line(high, 'rgb(255,0,0)'),
        ]
    else:
        shapes = [_threshold_line(high, 'rgb(255,0,0)')]

    layout = go.Layout(
        barmode='overlay',
        title=title,
        xaxis=dict(
            title=xTitle,
            tickvals=xList,
        ),
        yaxis=dict(
            title=yTitle,
        ),
        shapes=shapes,
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [67]:
def plot4Ms(df,xFeature,yFeature,xTitle,yTitle,title):
    """Overlay max/median/mean/min bars of a metric.

    Parameters
    ----------
    df : aggregated DataFrame; ``df[yFeature]`` must expose 'min', 'mean',
        'median' and 'max' (presumably two-level columns from a groupby/agg -
        verify against the caller).
    xFeature : column whose length determines the x positions
        0..len(df[xFeature])-1.
    yFeature : metric whose four statistics are plotted.
    xTitle, yTitle : axis titles.
    title : figure title.

    The figure is rendered with ``iplot``; nothing is returned.
    """
    xList = list(range(len(df[xFeature])))

    def _bar(stat):
        # 'v' is the valid plotly value for vertical bars; the previous
        # 'vertical' is rejected by plotly versions that validate properties.
        return go.Bar(y=df[yFeature][stat], x=xList, name=stat, orientation='v')

    # Largest statistic first so that the smaller overlaid bars stay visible.
    data = [_bar(stat) for stat in ('max', 'median', 'mean', 'min')]

    layout = go.Layout(
        barmode='overlay',
        title=title,
        xaxis=dict(
            title=xTitle,
            tickvals=xList,
        ),
        yaxis=dict(
            title=yTitle,
        ),
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [68]:
def getdfLaterHalf(df,time):
    """Return the rows whose `time` value lies in the later half of its range.

    The cut-off is the midpoint between the column's minimum and maximum;
    rows exactly at the midpoint are kept.
    """
    earliest = df[time].min()
    latest = df[time].max()
    midpoint = earliest + (latest - earliest) / 2
    in_later_half = df[time] >= midpoint
    return df[in_later_half]


In [69]:
def getRTTDiff(df):
    """Summarise, per tracking id, the RTT difference between captures.

    Rows are ordered by tracking id, last-seen time and node id. Within each
    tracking id the absolute difference between consecutive RTT values is
    computed, then every tracking id is collapsed to one row holding:

    * '_source.NodeTID'     - the node ids joined with ' - ',
    * '_source.Metric.Last' - the first (earliest) value,
    * 'diffs'               - the last non-null RTT difference of the group.

    The input frame is not modified.
    """
    tracking = '_source.TrackingID'
    last_seen = '_source.Metric.Last'
    node = '_source.NodeTID'
    rtt = '_source.Metric.RTT'

    ordered = df[[tracking, last_seen, node, rtt]].copy()
    ordered = ordered.sort_values(
        by=[tracking, last_seen, node],
        ascending=[True, True, True],
    )

    per_flow_rtt = ordered.groupby([tracking])[rtt]
    ordered['diffs'] = per_flow_rtt.transform(lambda s: s.diff().abs())

    aggregations = {node: ' - '.join, last_seen: 'first', 'diffs': 'last'}
    return ordered.groupby([tracking], as_index=False).agg(aggregations)


In [70]:
def massageSkydiveFlows(df):
    """Filter Skydive flows to TCP port-80 traffic and derive per-flow features.

    Steps:

    1. Keep rows whose application and transport protocol are 'TCP' and whose
       '_source.Transport.B' port equals 80.
    2. Add 'flow_duration' = '_source.Metric.Last' - '_source.Metric.Start'.
    3. Drop rows where the duration or any of the RTT / byte / packet metrics
       is missing, then cast those columns to int.
    4. Add byte and packet counts divided by 'flow_duration' (total, A->B and
       B->A).
    5. Convert the Start / Last timestamps from epoch milliseconds to datetimes.

    Parameters
    ----------
    df : raw Skydive flows DataFrame (flattened '_source.*' columns).

    Returns
    -------
    A new DataFrame; the caller's frame is left untouched.

    Notes
    -----
    Previously the metrics were cast with ``.astype(int).dropna()``, which
    raises on the first missing value (the trailing ``dropna`` could never
    help). Rows with missing metrics are now dropped before the cast.
    """
    is_http_over_tcp = (
        (df['_source.Application'] == 'TCP')
        & (df['_source.Transport.Protocol'] == 'TCP')
        & (df['_source.Transport.B'] == 80)
    )
    # Work on an explicit copy so the column assignments below never write
    # into (or warn about) a slice of the caller's frame.
    df = df[is_http_over_tcp].copy()

    df['flow_duration'] = df['_source.Metric.Last'] - df['_source.Metric.Start']

    int_columns = [
        'flow_duration',
        '_source.Metric.RTT',
        '_source.Metric.ABBytes',
        '_source.Metric.BABytes',
        '_source.Metric.ABPackets',
        '_source.Metric.BAPackets',
    ]
    # Missing values must be removed *before* the integer cast.
    df = df.dropna(subset=int_columns).copy()
    for column in int_columns:
        df[column] = df[column].astype(int)

    # NOTE(review): a flow whose Start equals Last has flow_duration == 0, so
    # the ratios below become inf/NaN for it - confirm whether such flows
    # should be filtered out.
    duration = df['flow_duration']
    df['bytes_per_flow'] = (df['_source.Metric.ABBytes'] + df['_source.Metric.BABytes']) / duration
    df['AB_bytes_per_flow'] = df['_source.Metric.ABBytes'] / duration
    df['BA_bytes_per_flow'] = df['_source.Metric.BABytes'] / duration

    df['packets_per_flow'] = (df['_source.Metric.ABPackets'] + df['_source.Metric.BAPackets']) / duration
    df['AB_packets_per_flow'] = df['_source.Metric.ABPackets'] / duration
    df['BA_packets_per_flow'] = df['_source.Metric.BAPackets'] / duration

    df['_source.Metric.Last'] = pd.to_datetime(df['_source.Metric.Last'], unit='ms')
    df['_source.Metric.Start'] = pd.to_datetime(df['_source.Metric.Start'], unit='ms')
    return df
    

In [71]:
def massageJmeter(df):
    """Clean raw JMeter samples for analysis.

    Steps:

    1. Drop rows whose transaction is 'internal' and keep only rows whose
       'statut' is 'ok'.
    2. Convert 'time' from epoch milliseconds to datetimes.
    3. Drop rows without an 'avg' value and cast 'avg' to int.
    4. Keep only rows whose 'avg' is strictly below the 0.99 quantile.

    Parameters
    ----------
    df : raw JMeter DataFrame with 'transaction', 'statut', 'time', 'avg'.

    Returns
    -------
    A new DataFrame; the caller's frame is left untouched.
    """
    wanted = (df['transaction'] != 'internal') & (df['statut'] == 'ok')
    # Explicit copies: the column assignments below must not target a slice
    # of the caller's frame (chained assignment / SettingWithCopyWarning).
    df = df[wanted].copy()
    df["time"] = pd.to_datetime(df["time"], unit='ms')

    df = df.dropna(subset=['avg']).copy()
    df['avg'] = df['avg'].astype(int)

    # NOTE(review): the strict '<' always removes the largest value(s), so a
    # frame whose 'avg' values are all equal comes back empty - confirm this
    # is the intended outlier trimming.
    q = df['avg'].quantile(0.99)
    return df[df['avg'] < q]
   

In [72]:
def massageIndex(df,begin_plus,end_less):
    """Trim each test window and normalise the stress-test labels, in place.

    Parameters
    ----------
    df : index DataFrame with 'begin', 'end' and 'stress_test' columns.
    begin_plus : amount added to every 'begin', multiplied by 60000
        (minutes, if the timestamps are epoch milliseconds).
    end_less : amount subtracted from every 'end', multiplied by 60000.

    Label normalisation of 'stress_test' (after casting to str):

    * the exact label 'no_stress' becomes ' no_stress' (leading space);
    * a leading 'stress' followed by any hyphens is removed;
    * '_test.yaml' and 'r1g1' are removed wherever they occur.

    Returns
    -------
    The same DataFrame object, modified in place.
    """
    df["begin"] = df["begin"] + (begin_plus*60000)
    df["end"] = df["end"] - (end_less*60000)

    labels = df["stress_test"].astype(str)
    # This step used to read and write the *global* `dfIndex` instead of the
    # `df` argument, so it failed (or touched the wrong frame) unless the
    # caller's variable happened to have that name.
    labels = labels.replace("no_stress"," no_stress")
    labels = labels.replace(to_replace=r'^stress-*',value='',regex=True)
    labels = labels.replace(to_replace=r'_test.yaml',value='',regex=True)
    labels = labels.replace(to_replace=r'r1g1',value='',regex=True)
    df["stress_test"] = labels
    return(df)

In [73]:
def labelDF(dfLabels,df,time):
    """Attach to every row of `df` the label rows whose window contains it.

    A row of `df` matches a row of `dfLabels` when
    ``begin < df[time] < end`` (both bounds exclusive). Every matching pair
    yields one output row made of the `df` columns followed by the
    `dfLabels` columns; rows of `df` without a matching window are dropped.

    The comparison is done by broadcasting, so it builds a
    len(df) x len(dfLabels) boolean matrix.
    """
    stamps = df[time].values[:, None]
    window_start = dfLabels.begin.values
    window_end = dfLabels.end.values

    inside = (stamps > window_start) & (stamps < window_end)
    row_idx, label_idx = np.nonzero(inside)

    combined = np.hstack([df.values[row_idx], dfLabels.values[label_idx]])
    all_columns = df.columns.append(dfLabels.columns)
    return pd.DataFrame(combined, columns=all_columns)

In [74]:
def maxMedian(dfIndex,dfJmeter):
    """Find the stressed window with the highest median response time.

    Parameters
    ----------
    dfIndex : DataFrame with ``stress_test``, ``begin`` and ``end`` columns,
        one row per labeled window.
    dfJmeter : DataFrame with ``time`` and ``avg`` columns.

    Returns
    -------
    (maxIndex, maxNoStressPair)
        ``maxIndex`` is the index label of the stressed window whose samples
        (``time`` strictly inside ``begin``/``end``) have the largest median
        ``avg``. ``maxNoStressPair`` is the index label of the last no-stress
        window seen before it (0 when none preceded it). Both are ``None``
        when no stressed window has a median above 0, which previously raised
        UnboundLocalError.
    """
    maxM = 0
    maxIndex = None
    maxNoStressPair = None
    noStressIndex = 0
    for index, row in dfIndex.iterrows():
        # massageIndex rewrites "no_stress" to " no_stress", so compare on the
        # stripped label; otherwise baseline windows are ranked as stressed.
        if str(row.stress_test).strip() == "no_stress":
            noStressIndex = index
            continue
        window = dfJmeter[(dfJmeter['time'] > row.begin) & (dfJmeter['time'] < row.end)]
        m = window.avg.median()
        # An empty window has a NaN median, which never compares greater.
        if m > maxM:
            maxM = m
            maxIndex = index
            maxNoStressPair = noStressIndex
    return (maxIndex, maxNoStressPair)
        


In [75]:
def getDFs(suffixes):
    """Load and concatenate the three CSV families for the given run suffixes.

    For every suffix, in order, the JMeter samples, the Skydive flows and the
    workload begin/end index are read through ``getDF``, which returns a
    ``(frame, err)`` pair where an empty ``err`` means success. The first
    non-empty error is printed and raised.

    Returns
    -------
    (dfJmeter, dfSkydiveFlows, dfIndex)
        One concatenated DataFrame per family, in suffix order.
    """
    # (file-name template, frames collected so far), in load order.
    sources = (
        ("jmeter{}.csv", []),
        ("skydiveFlows{}.csv", []),
        ("workload_stress_begin_end_{}.csv", []),
    )
    for s in suffixes:
        for template, frames in sources:
            dfTemp, err = getDF(template.format(s))
            if err != "":
                print(err)
                # NOTE(review): `Error` is not defined in the visible part of
                # this file -- confirm it exists, otherwise this line raises
                # NameError instead.
                raise Error(err)
            frames.append(dfTemp)

    dfJmeter, dfSkydiveFlows, dfIndex = (pd.concat(frames) for _, frames in sources)
    return (dfJmeter, dfSkydiveFlows, dfIndex)

In [76]:
## Examine multiple samples

def plotAgg2(df1,df2,yAgg1,yfeature1,yAgg2,yfeature2,xfeature,yname1,yname2,xtitle,ytitle,title):
    """Draw two aggregated series as marker traces in one plotly figure.

    For each of the two frames the y values are ``df[yAgg][yfeature]`` (a
    column of an aggregated frame, then the statistic inside it) and the x
    values are ``df[xfeature]``. ``yname1``/``yname2`` are the legend entries;
    ``xtitle``, ``ytitle`` and ``title`` label the axes and the figure. The
    figure is displayed with ``iplot``; nothing is returned.
    """
    series_specs = (
        (df1, yAgg1, yfeature1, yname1),
        (df2, yAgg2, yfeature2, yname2),
    )
    traces = [
        go.Scatter(
            y=frame[agg_column][statistic],
            x=frame[xfeature],
            mode='markers',
            connectgaps=True,
            name=legend_name,
        )
        for frame, agg_column, statistic, legend_name in series_specs
    ]
    layout = go.Layout(
        barmode='group',
        title=title,
        xaxis=dict(title=xtitle),
        yaxis=dict(title=ytitle),
    )
    iplot(go.Figure(data=traces, layout=layout))

In [77]:
# Build the labeled TRAINING data set.
# Order matters in this cell: every step rebinds a module-level name that the
# following steps (and later cells) read.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ResourceWarning)


# Load and concatenate the JMeter samples, Skydive flows and workload index
# of the five training runs (the strings are the file-name suffixes).
dfJmeter, dfSkydiveFlows, dfIndex = getDFs(["190627","190628","190703","190707","190710"]) #

                                            
# Alternative that trims 1 minute off the start and 2.5 off the end of each window:
#dfIndex = massageIndex(dfIndex,1,2.5)
# Windows are used untrimmed; only the stress_test names are normalized.
dfIndex = massageIndex(dfIndex,0,0)

    
# Label every sample with the index window it falls in. Labeling is done on
# the raw frames, before the massage* functions convert the time columns.
dfJmeterLabeled = labelDF(dfIndex,dfJmeter,'time')


dfSkydiveFlowsLabeled = labelDF(dfIndex,dfSkydiveFlows,'_source.Metric.Last')


# Keep only the windows (identified by `begin`) for which at least one
# Skydive flow was labeled, so QoE and QoS cover the same windows.
dfJmeterLabeled = dfJmeterLabeled[dfJmeterLabeled.begin.isin(dfSkydiveFlowsLabeled.begin.unique())]


dfIndex = dfIndex[dfIndex.begin.isin(dfSkydiveFlowsLabeled.begin.unique())]







# Disabled experiment: compare the worst stressed window with its preceding
# no-stress window.
#maxMedianIndex,maxNoStressPair=maxMedian(dfIndex,dfJmeterLabeled)
#noStressBegin = dfIndex.iloc[maxNoStressPair]['begin']
#noStressEnd = dfIndex.iloc[maxNoStressPair]['end']
#withStressBegin = dfIndex.iloc[maxMedianIndex]['begin']
#withStressEnd = dfIndex.iloc[maxMedianIndex]['end'] 
#dfJmeterNoStress = dfJmeter[(dfJmeter['time'] >= noStressBegin) & (dfJmeter['time'] <= noStressEnd)] 
#dfSkydiveFlowsNoStress = dfSkydiveFlows[(dfSkydiveFlows['_source.Metric.Last'] >= noStressBegin) &  (dfSkydiveFlows['_source.Metric.Last'] <= noStressEnd)]
#dfJmeterWithStress = dfJmeter[(dfJmeter['time'] >= withStressBegin) & (dfJmeter['time'] <= withStressEnd)] 
#dfSkydiveFlowsWithStress = dfSkydiveFlows[(dfSkydiveFlows['_source.Metric.Last'] >= withStressBegin) &  (dfSkydiveFlows['_source.Metric.Last'] <= withStressEnd)] 
    

    
# The massage* functions assign into filtered frames; silence pandas'
# chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None    
    
# Disabled experiment: the same comparison with hard-coded window bounds.
#dfJmeterNoStress = dfJmeter[(dfJmeter['time'] > 1554109413813) & (dfJmeter['time'] < 1554110584822)] 
#dfSkydiveFlowsNoStress = dfSkydiveFlows[(dfSkydiveFlows['_source.Metric.Last'] > 1554109413813) &  (dfSkydiveFlows['_source.Metric.Last'] < 1554110584822)]     
#dfJmeterWithStress = dfJmeter[(dfJmeter['time'] > 1554110584865) & (dfJmeter['time'] < 1554112922403)] 
#dfSkydiveFlowsWithStress = dfSkydiveFlows[(dfSkydiveFlows['_source.Metric.Last'] > 1554110584865) &  (dfSkydiveFlows['_source.Metric.Last'] < 1554112922403)] 

# Clean the labeled frames (filtering, type conversion).
dfJmeterLabeled = massageJmeter(dfJmeterLabeled)
#print("dfJmeterLabeled.count()",dfJmeterLabeled.count())
dfSkydiveFlowsLabeled = massageSkydiveFlows(dfSkydiveFlowsLabeled)
#print('2')
#dfJmeterNoStress = massageJmeter(dfJmeterNoStress)    
#dfJmeterWithStress = massageJmeter(dfJmeterWithStress)

#print('3')
#dfSkydiveFlowsNoStress = massageSkydiveFlows(dfSkydiveFlowsNoStress)    
#dfSkydiveFlowsWithStress =  massageSkydiveFlows(dfSkydiveFlowsWithStress) 


In [78]:
 
# Build the labeled TEST data set with the same steps as the training cell.
pd.options.mode.chained_assignment = None

# NOTE: this rebinds the module-level dfJmeter / dfSkydiveFlows, which held
# the raw training data until now; from here on they refer to the test run.
dfJmeter, dfSkydiveFlows, dfIndexTest = getDFs(["190630"])
#dfJmeter, dfSkydiveFlows, dfIndexTest = getDFs(["190630","190710"])

#dfIndex = massageIndex(dfIndex,1,2.5)
# Windows are used untrimmed; only the stress_test names are normalized.
dfIndexTest = massageIndex(dfIndexTest,0,0)
    
# Label every sample with the index window it falls in.
dfJmeterLabeledTest = labelDF(dfIndexTest,dfJmeter,'time')

dfSkydiveFlowsLabeledTest = labelDF(dfIndexTest,dfSkydiveFlows,'_source.Metric.Last')

# Keep only the windows (identified by `begin`) that have labeled Skydive flows.
dfJmeterLabeledTest = dfJmeterLabeledTest[dfJmeterLabeledTest.begin.isin(dfSkydiveFlowsLabeledTest.begin.unique())]

dfIndexTest = dfIndexTest[dfIndexTest.begin.isin(dfSkydiveFlowsLabeledTest.begin.unique())]

  
dfJmeterLabeledTest = massageJmeter(dfJmeterLabeledTest)
dfSkydiveFlowsLabeledTest = massageSkydiveFlows(dfSkydiveFlowsLabeledTest)
# Test sets are kept in a list of dicts so a later cell can aggregate each
# one in a loop; only one test set is registered here.
testSets = []
testSets.append({"dfIndexTest": dfIndexTest,
            "dfJmeterLabeledTest": dfJmeterLabeledTest, 
            "dfSkydiveFlowsLabeledTest": dfSkydiveFlowsLabeledTest})





## QoS to QoE estimate (using machine learning)

In [79]:
# Per-group summary statistics (min / max / median / mean / std, plus count
# for flow_duration). Every frame below has two-level columns, so a statistic
# is read as frame[<metric>][<statistic>].
# .dropna() removes groups with a NaN in any statistic (e.g. a group with a
# single sample, whose std is NaN), so the frames may not all have the same
# number of rows.

# Workload duration (the QoE target) per stress configuration.
dfIndexLabeledAgg = dfIndex.groupby(['stress_test']).agg({'elapse_time': ['min','max','median','mean','std']}).dropna().reset_index()

# Training set, one row per labeled window (`begin`): response time, flow duration and RTT.
dfJmeterLabeledAgg = dfJmeterLabeled.groupby(['begin']).agg({'avg': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsLabeledAgg = dfSkydiveFlowsLabeled.groupby(['begin']).agg({'flow_duration': ['min','max','median','mean','std','count']}).dropna().reset_index()
dfSkydiveFlowsRTTLabeledAgg = dfSkydiveFlowsLabeled.groupby(['begin']).agg({'_source.Metric.RTT': ['min','max','median','mean','std']}).dropna().reset_index()
# Test set: response time.
dfJmeterLabeledAggTest = dfJmeterLabeledTest.groupby(['begin']).agg({'avg': ['min','max','median','mean','std']}).dropna().reset_index()

# Test set: flow duration and RTT.
dfSkydiveFlowsLabeledAggTest = dfSkydiveFlowsLabeledTest.groupby(['begin']).agg({'flow_duration': ['min','max','median','mean','std','count']}).dropna().reset_index()
dfSkydiveFlowsRTTLabeledAggTest = dfSkydiveFlowsLabeledTest.groupby(['begin']).agg({'_source.Metric.RTT': ['min','max','median','mean','std']}).dropna().reset_index()

# Bytes per flow: both directions combined, then AB and BA separately
# (training frame followed by its test counterpart).
dfSkydiveFlowsBytesPerFlowLabeledAgg = dfSkydiveFlowsLabeled.groupby(['begin']).agg({'bytes_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsBytesPerFlowLabeledAggTest = dfSkydiveFlowsLabeledTest.groupby(['begin']).agg({'bytes_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsABBytesPerFlowLabeledAgg = dfSkydiveFlowsLabeled.groupby(['begin']).agg({'AB_bytes_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsABBytesPerFlowLabeledAggTest = dfSkydiveFlowsLabeledTest.groupby(['begin']).agg({'AB_bytes_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsBABytesPerFlowLabeledAgg = dfSkydiveFlowsLabeled.groupby(['begin']).agg({'BA_bytes_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsBABytesPerFlowLabeledAggTest = dfSkydiveFlowsLabeledTest.groupby(['begin']).agg({'BA_bytes_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()

# Packets per flow: both directions combined, then AB and BA separately
# (training frame followed by its test counterpart).
dfSkydiveFlowsPacketsPerFlowLabeledAgg = dfSkydiveFlowsLabeled.groupby(['begin']).agg({'packets_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsPacketsPerFlowLabeledAggTest = dfSkydiveFlowsLabeledTest.groupby(['begin']).agg({'packets_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsABPacketsPerFlowLabeledAgg = dfSkydiveFlowsLabeled.groupby(['begin']).agg({'AB_packets_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsABPacketsPerFlowLabeledAggTest = dfSkydiveFlowsLabeledTest.groupby(['begin']).agg({'AB_packets_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsBAPacketsPerFlowLabeledAgg = dfSkydiveFlowsLabeled.groupby(['begin']).agg({'BA_packets_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
dfSkydiveFlowsBAPacketsPerFlowLabeledAggTest = dfSkydiveFlowsLabeledTest.groupby(['begin']).agg({'BA_packets_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()

In [80]:
# For every registered test set, compute the same per-window (`begin`)
# aggregates as for the training data and store them in a dict keyed by the
# name of the corresponding module-level frame. The index frame is passed
# through unchanged.
testAggSets = []
for testSet in testSets:
    testAggSets.append({
        'dfIndexTest': testSet["dfIndexTest"],
        # QoE: response time statistics.
        'dfJmeterLabeledAggTest' : testSet["dfJmeterLabeledTest"].groupby(['begin']).agg({'avg': ['min','max','median','mean','std']}).dropna().reset_index(),
        # QoS: flow duration (with sample count) and RTT statistics.
        'dfSkydiveFlowsLabeledAggTest' : testSet["dfSkydiveFlowsLabeledTest"].groupby(['begin']).agg({'flow_duration': ['min','max','median','mean','std','count']}).dropna().reset_index(),
        'dfSkydiveFlowsRTTLabeledAggTest' : testSet["dfSkydiveFlowsLabeledTest"].groupby(['begin']).agg({'_source.Metric.RTT': ['min','max','median','mean','std']}).dropna().reset_index(),
        # QoS: bytes per flow (combined, AB, BA).
        'dfSkydiveFlowsBytesPerFlowLabeledAggTest' : testSet["dfSkydiveFlowsLabeledTest"].groupby(['begin']).agg({'bytes_per_flow': ['min','max','median','mean','std']}).dropna().reset_index(),
        'dfSkydiveFlowsABBytesPerFlowLabeledAggTest' : testSet["dfSkydiveFlowsLabeledTest"].groupby(['begin']).agg({'AB_bytes_per_flow': ['min','max','median','mean','std']}).dropna().reset_index(),
        'dfSkydiveFlowsBABytesPerFlowLabeledAggTest' : testSet["dfSkydiveFlowsLabeledTest"].groupby(['begin']).agg({'BA_bytes_per_flow': ['min','max','median','mean','std']}).dropna().reset_index(),
        # QoS: packets per flow (combined, AB, BA).
        'dfSkydiveFlowsPacketsPerFlowLabeledAggTest' : testSet["dfSkydiveFlowsLabeledTest"].groupby(['begin']).agg({'packets_per_flow': ['min','max','median','mean','std']}).dropna().reset_index(),
        'dfSkydiveFlowsABPacketsPerFlowLabeledAggTest' : testSet["dfSkydiveFlowsLabeledTest"].groupby(['begin']).agg({'AB_packets_per_flow': ['min','max','median','mean','std']}).dropna().reset_index(),
        'dfSkydiveFlowsBAPacketsPerFlowLabeledAggTest' : testSet["dfSkydiveFlowsLabeledTest"].groupby(['begin']).agg({'BA_packets_per_flow': ['min','max','median','mean','std']}).dropna().reset_index()
      
    })  

In [81]:
#print("dfIndex.shape",dfIndex.shape)
#print("dfSkydiveFlowsLabeled.shape",dfSkydiveFlowsLabeled.shape)
#print("dfSkydiveFlowsLabeledAgg.shape",dfSkydiveFlowsLabeledAgg.shape)
#print("dfJmeterLabeledAgg.shape",dfJmeterLabeledAgg.shape)
#print("dfIndexLabeledAgg.shape",dfIndexLabeledAgg.shape)

In [82]:
#index_max_min = round(max(dfIndexLabeledAgg['elapse_time']['min']))
#index_max_median = round(max(dfIndexLabeledAgg['elapse_time']['median']))

# Derive the QoE class thresholds from the no-stress baseline.
# (This recomputes dfIndexLabeledAgg with the same expression the aggregation
# cell uses, so this cell can be re-run on its own.)
dfIndexLabeledAgg = dfIndex.groupby(['stress_test']).agg({'elapse_time': ['min','max','median','mean','std']}).dropna().reset_index()
# " no_stress" (leading space) is the label massageIndex writes for the baseline runs.
dfIndexNoStressAgg  = dfIndexLabeledAgg[dfIndexLabeledAgg["stress_test"] == " no_stress"]
# Low threshold: the rounded median workload duration without stress.
indexNoStressLow = round(max(dfIndexNoStressAgg['elapse_time']['median']))
# High threshold: the rounded maximum no-stress duration plus five standard deviations.
indexNoStressHigh = round(max(dfIndexNoStressAgg['elapse_time']['max']+dfIndexNoStressAgg['elapse_time']['std']*5))
# The last two arguments are presumably the low/high threshold lines -- confirm
# against plot4MsWithThreshholds. The binary plot passes the high threshold twice.
plot4MsWithThreshholds(dfIndexLabeledAgg,'stress_test','elapse_time','training stress configurations','workload duration (ms)','Workload Duration (QoE)<br> workload: wp4<br>Single threshold for Binary Classification',indexNoStressHigh,indexNoStressHigh)
plot4MsWithThreshholds(dfIndexLabeledAgg,'stress_test','elapse_time','training stress configurations','workload duration (ms)','Workload Duration (QoE)<br> workload: wp4<br>Multiple thresholds for Multiclass Classification',indexNoStressLow,indexNoStressHigh)

In [83]:
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
@ignore_warnings(category=UserWarning)
@ignore_warnings(category=ConvergenceWarning)
def QoEEstimator(classifiers,X_train, y_train,X_test,y_test):
    """Fit every candidate classifier and report the best one on the test set.

    Each estimator in ``classifiers`` is fitted on ``X_train``/``y_train`` and
    used to predict ``X_test``. Weighted F1, precision and recall are computed
    over the labels that were actually predicted, next to plain accuracy and
    the fit/predict durations in seconds.

    Parameters
    ----------
    classifiers : iterable of estimators exposing ``fit`` and ``predict``
        whose ``str()`` looks like ``"Name(...)"``. Two estimators of the same
        class share one name, so the later one overwrites the earlier result.
    X_train, y_train : training features and targets.
    X_test, y_test : test features and targets.

    Returns
    -------
    (best, table)
        ``best`` is a dict with the keys ``y_test``, ``y_pred``,
        ``classifier``, ``f1_score``, ``accuracy_score``, ``predictionTime``
        and ``modelFitTime`` for the top-ranked classifier. ``table`` is the
        ``ff.create_table`` figure of all classifiers' scores, rounded to two
        decimals.
    """
    scoreColumns = ['Classifier', 'f1_score', 'accuracy_score', 'precision_score',
                    'recall_score', 'predictionTime', 'modelFitTime']
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0.
    scoreRows = []
    yPairs = {}
    for clf in classifiers:
        # Keep only the class name of the estimator's "Name(params...)" text.
        clf_text = str(clf)
        clf_name = clf_text[0:clf_text.index('(')]

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        s = time.perf_counter()
        model = clf.fit(X_train, y_train)
        modelFitTime = time.perf_counter() - s

        s = time.perf_counter()
        y_pred = model.predict(X_test)
        predictionTime = time.perf_counter() - s

        #print(metrics.confusion_matrix(y_test, y_pred))
        # Restricting the scores to predicted labels avoids undefined
        # per-class values for classes the model never predicts.
        predictedLabels = np.unique(y_pred)
        fScore = f1_score(y_test, y_pred,average='weighted',labels=predictedLabels)
        accuracyScore = accuracy_score(y_test, y_pred)
        yPairs[clf_name] = {
            "y_test": y_test,
            "y_pred": y_pred,
            "classifier": clf_name,
            "f1_score": fScore,
            "accuracy_score": accuracyScore,
            "predictionTime": predictionTime,
            "modelFitTime": modelFitTime
        }
        scoreRows.append({
            'Classifier': clf_name,
            'f1_score': fScore,
            'accuracy_score': accuracyScore,
            'precision_score': precision_score(y_test, y_pred,average='weighted',labels=predictedLabels),
            'recall_score': recall_score(y_test, y_pred,average='weighted',labels=predictedLabels),
            'predictionTime': predictionTime,
            'modelFitTime': modelFitTime
        })

    dfValidationScores = pd.DataFrame(scoreRows, columns=scoreColumns)
    # NOTE(review): all keys sort descending, so ties on f1/accuracy are
    # broken in favour of the SLOWER classifier -- confirm this is intended.
    dfValidationScores = (
        dfValidationScores
        .sort_values(by=['f1_score','accuracy_score',"predictionTime",'modelFitTime'], ascending=False)
        .round(2)
        .reset_index(drop=True)
    )
    bestClassifier = dfValidationScores.at[0,'Classifier']
    table = ff.create_table(dfValidationScores)
    return(yPairs[bestClassifier], table)


In [113]:
import itertools as it
# Full candidate list, kept for reference only: it is overwritten by the next
# statement, which is the same list without LinearDiscriminantAnalysis.
classifiers = [LogisticRegression(),DecisionTreeClassifier(max_depth=5),KNeighborsClassifier(3),
               LinearDiscriminantAnalysis(),RandomForestClassifier(max_depth=5,n_estimators=3, random_state=0),
               GaussianNB(),SVC(),MLPClassifier()]
# Candidate classifiers actually evaluated.
classifiers = [LogisticRegression(),DecisionTreeClassifier(max_depth=5),KNeighborsClassifier(3),
               RandomForestClassifier(max_depth=5,n_estimators=3, random_state=0),
               GaussianNB(),SVC(),MLPClassifier()]
# One entry per classification problem:
#   'bins'  - edges used to cut the workload duration (elapse_time) into classes,
#   'names' - the class label of each bin,
#   'low' / 'high' - the thresholds the bins are built from.
classifications=[
    {
        # Two classes: up to the no-stress high threshold, and above it.
        'desc':"Binary Classifications",
        'bins':[0,indexNoStressHigh,np.inf],
        'names':[1,2],
        'low': indexNoStressHigh,
        'high':indexNoStressHigh
    },
    {
        # Three classes: up to the low threshold, between low and high, above high.
        'desc':"Multiclass Classifications",
        'bins':[0,indexNoStressLow,indexNoStressHigh,np.inf],
        'names': [1,2,3], 
        'low': indexNoStressLow,
        'high':indexNoStressHigh

        
    }
]
# Candidate QoS features: the per-window mean of every aggregated Skydive
# metric. One row per labeled window, in the row order of the aggregated frames.
_features = pd.DataFrame()    
_features['flow_duration_mean'] = dfSkydiveFlowsLabeledAgg['flow_duration']['mean']
_features['bytes_per_flow_mean'] = dfSkydiveFlowsBytesPerFlowLabeledAgg['bytes_per_flow']['mean'] 
_features['packets_per_flow_mean'] = dfSkydiveFlowsPacketsPerFlowLabeledAgg['packets_per_flow']['mean'] 
_features['AB_bytes_per_flow_mean'] = dfSkydiveFlowsABBytesPerFlowLabeledAgg['AB_bytes_per_flow']['mean'] 
_features['BA_bytes_per_flow_mean'] = dfSkydiveFlowsBABytesPerFlowLabeledAgg['BA_bytes_per_flow']['mean'] 
_features['AB_packets_per_flow_mean'] = dfSkydiveFlowsABPacketsPerFlowLabeledAgg['AB_packets_per_flow']['mean'] 
_features['BA_packets_per_flow_mean'] = dfSkydiveFlowsBAPacketsPerFlowLabeledAgg['BA_packets_per_flow']['mean']
_features['RTT_mean'] = dfSkydiveFlowsRTTLabeledAgg['_source.Metric.RTT']['mean']

# The same features for the test run, taken from the *Test aggregates only.
_test_features = pd.DataFrame()    
_test_features['flow_duration_mean'] = dfSkydiveFlowsLabeledAggTest['flow_duration']['mean']
_test_features['bytes_per_flow_mean'] = dfSkydiveFlowsBytesPerFlowLabeledAggTest['bytes_per_flow']['mean'] 
_test_features['packets_per_flow_mean'] = dfSkydiveFlowsPacketsPerFlowLabeledAggTest['packets_per_flow']['mean'] 
_test_features['AB_bytes_per_flow_mean'] = dfSkydiveFlowsABBytesPerFlowLabeledAggTest['AB_bytes_per_flow']['mean'] 
_test_features['BA_bytes_per_flow_mean'] = dfSkydiveFlowsBABytesPerFlowLabeledAggTest['BA_bytes_per_flow']['mean'] 
_test_features['AB_packets_per_flow_mean'] = dfSkydiveFlowsABPacketsPerFlowLabeledAggTest['AB_packets_per_flow']['mean'] 
_test_features['BA_packets_per_flow_mean'] = dfSkydiveFlowsBAPacketsPerFlowLabeledAggTest['BA_packets_per_flow']['mean']
# Fixed: this column used to be filled from the TRAINING aggregate
# (dfSkydiveFlowsRTTLabeledAgg), so the test RTT feature held training values
# aligned by row position.
_test_features['RTT_mean'] = dfSkydiveFlowsRTTLabeledAggTest['_source.Metric.RTT']['mean']


# Exhaustive search: for every classification problem, evaluate every
# combination of 1 to 4 QoS features, keep the best classifier of each
# combination, then report the best combination overall.
print("Do ML Classification")
print("training set size",_features.shape[0])
print("testing set size",_test_features.shape[0])
print("Examine all combinations of the features listed below:")
for f in list(_features.columns):
    print("      ",f)

for classification in classifications:
    print("")
    print("")
    print("{}".format(classification["desc"]))    
    # Per-combination results. The three lists are filled in lock-step, so
    # position k in each of them describes the same feature combination.
    combinationRows = []
    bestClassifiers = []
    qoeTables=[]
    bins = classification['bins']
    names = classification['names']
    # QoE targets: workload duration cut into the classes of this problem.
    # NOTE(review): targets follow the row order of dfIndex, while the feature
    # rows follow the aggregated frames -- confirm the two line up one-to-one.
    targets = pd.cut(dfIndex['elapse_time'], bins,labels=names)
    test_targets = pd.cut( dfIndexTest['elapse_time'], bins,labels=names)
    #for i in range(1,_features.shape[1]+1):
    for i in range(1,5):
        for comb in it.combinations(list(_features.columns),i):        
            selected = list(comb)
            features = _features[selected].copy()
            test_features = _test_features[selected].copy()
            # test_size=0.0 keeps every training row; the call is only used
            # to shuffle features and targets together.
            X_train, X_test, y_train, y_test = train_test_split(features,targets, test_size=0.0, random_state=0)
            bestClassifier, qoeTable = QoEEstimator(classifiers,X_train, y_train,test_features,test_targets)
            bestClassifiers.append(bestClassifier.copy())
            qoeTables.append(qoeTable.copy())
            combinationRows.append({
                'Features' : str(comb).replace(",","<br>"),
                'Classifier' : bestClassifier["classifier"], 
                'f1_score' :  bestClassifier["f1_score"],
                'accuracy_score' :  bestClassifier["accuracy_score"]
            })
    # Build the score frame once (DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call). Its default integer index
    # equals the position in bestClassifiers / qoeTables.
    dfCombinationScores = pd.DataFrame(combinationRows, columns = ['Features','Classifier','f1_score','accuracy_score'])
    dfCombinationScores = dfCombinationScores.sort_values(by=['f1_score','accuracy_score'], ascending=False)
    # After sorting, the first index label is the list position of the winner.
    bestIndex = dfCombinationScores.index[0]
    bestClassifier = bestClassifiers[bestIndex]
    bestFeatures = dfCombinationScores.Features[bestIndex]
    qoeTable = qoeTables[bestIndex]
    table = ff.create_table(dfCombinationScores.head(20),height_constant=80)
    print("")
    print("The table below shows the first 20 highest scoring {}".format(classification["desc"]))
    iplot(table)
    print("")
    print("The figure below illustates the best perfoming classifier from the {}".format(classification["desc"]))
    title = "QoE from QoS Actual vs Prediction<br> QoE Target: workload duration, QoS features: {} <br>classifier: {}, f1_score: {}, accuracy: {}".format(bestFeatures.replace("<br>",","),bestClassifier["classifier"],str(round(bestClassifier["f1_score"],2)),str(round(bestClassifier["accuracy_score"],2)))
    plotModelEstimates(bestClassifier["y_test"],bestClassifier["y_pred"],'testing samples','QoE Classifications',title,names)
    print("")
    print("The table below compares other {} using the same QoS features, i.e. {}".format(classification["desc"],bestFeatures.replace("<br>",",")))          
    iplot(qoeTable)
    
          


    
    
        
        
    


Do ML Classification
training set size 326
testing set size 45
Examine all combinations of the features listed below:
       flow_duration_mean
       bytes_per_flow_mean
       packets_per_flow_mean
       AB_bytes_per_flow_mean
       BA_bytes_per_flow_mean
       AB_packets_per_flow_mean
       BA_packets_per_flow_mean
       RTT_mean


Binary Classifications

The table below shows the first 20 highest scoring Binary Classifications



The figure below illustates the best perfoming classifier from the Binary Classifications



The table below compares other Binary Classifications using the same QoS features, i.e. ('flow_duration_mean',)




Multiclass Classifications

The table below shows the first 20 highest scoring Multiclass Classifications



The figure below illustates the best perfoming classifier from the Multiclass Classifications



The table below compares other Multiclass Classifications using the same QoS features, i.e. ('flow_duration_mean', 'packets_per_flow_mean', 'BA_bytes_per_flow_mean', 'AB_packets_per_flow_mean')


In [114]:
# Saved for now, it might be useful: an earlier variant that evaluates one
# fixed set of three features against every registered test set. The
# estimator call and the plots are commented out, so this cell currently only
# rebuilds the inputs.
classifiers = [LogisticRegression(),DecisionTreeClassifier(max_depth=5),KNeighborsClassifier(3),
               LinearDiscriminantAnalysis(),RandomForestClassifier(max_depth=5,n_estimators=3, random_state=0),
               GaussianNB(),SVC(),MLPClassifier()]
# Same bins/labels/thresholds layout as in the exhaustive-search cell.
classifications=[
    {
        'desc':"Binary Classification",
        'bins':[0,indexNoStressHigh,np.inf],
        'names':[1,2],
        'low': indexNoStressHigh,
        'high':indexNoStressHigh
    },
    {
        'desc':"Multiclass Classification",
        'bins':[0,indexNoStressLow,indexNoStressHigh,np.inf],
        'names': [1,2,3], 
        'low': indexNoStressLow,
        'high':indexNoStressHigh

        
    }
]
# Fixed training features: per-window means of three Skydive metrics.
features = pd.DataFrame()
features['flow_duration_mean']= dfSkydiveFlowsLabeledAgg['flow_duration']['mean']
features['bytes_per_flow_mean'] = dfSkydiveFlowsBytesPerFlowLabeledAgg['bytes_per_flow']['mean'] 
features['packets_per_flow_mean'] = dfSkydiveFlowsPacketsPerFlowLabeledAgg['packets_per_flow']['mean']
#print("training set size",features.shape[0])
#print("testing set size",testAggSet[0].shape[0])
#print('features',list(features.columns))
for classification in classifications:
    #plot4MsWithThreshholds(dfIndexLabeledAgg,'stress_test','elapse_time',
    #                       'training stress configurations','workload duration (ms)',
    #                       'Workload Duration (QoE)<br> workload: wp4',
    #                       classification['low'],classification['high'])
    #print(classification['desc'])
    bins = classification['bins']
    names = classification['names']
    targets = pd.cut(dfIndex['elapse_time'], bins,labels=names)

    # test_size=0.0 keeps every training row; the call only shuffles them.
    X_train, X_test, y_train, y_test = train_test_split(features,targets, test_size=0.0, random_state=0)
    i = 0
    for testAggSet in testAggSets:
        test_features = pd.DataFrame()
        # This empty frame is replaced by the pd.cut result on the next line.
        test_targets = pd.DataFrame()
        test_targets = pd.cut( testAggSet["dfIndexTest"]['elapse_time'], bins,labels=names)
        test_features['flow_duration_mean']= testAggSet["dfSkydiveFlowsLabeledAggTest"]['flow_duration']['mean']
        test_features['bytes_per_flow_mean'] = testAggSet["dfSkydiveFlowsBytesPerFlowLabeledAggTest"]['bytes_per_flow']['mean'] 
        test_features['packets_per_flow_mean'] = testAggSet["dfSkydiveFlowsPacketsPerFlowLabeledAggTest"]['packets_per_flow']['mean']
        i = i+1
        #print("evaluate test set ",i)
        #bestClassifier, table = QoEEstimator(classifiers,X_train, y_train,test_features,test_targets)
        #iplot(table)
        # NOTE(review): with the QoEEstimator call above disabled,
        # `bestClassifier` is whatever an earlier cell left in the module
        # namespace, and `title` is not used while the plot below stays disabled.
        title = "QoE from QoS <br>Actual vs Prediction for workload elapse time<br>classifier: "+bestClassifier["classifier"]+", f1_score: "+str(round(bestClassifier["f1_score"],2))
        #plotModelEstimates(bestClassifier["y_test"],bestClassifier["y_pred"],'testing samples','QoE Classifications',title,names)




In [115]:
# Training set QoE: statistics of the JMeter response time ('avg') per labeled window.
plot4Ms(dfJmeterLabeledAgg,'begin','avg','training samples','response time (ms)','HTTP Non-Persistent Connection <br> Wordpress Response Time (QoE)')

In [116]:
#dfSkydiveFlowsLabeledAgg = dfSkydiveFlowsLabeled.groupby(['begin','stress_test']).agg({'flow_duration': ['min','max','median','mean','std']}).dropna().reset_index()

# Training set QoS: statistics of the Skydive flow duration per labeled window.
plot4Ms(dfSkydiveFlowsLabeledAgg,'begin','flow_duration','training samples','flow duration (ms)','HTTP Non-Persistent Connection <br> Wordpress Flow Duration (QoS)')
#print(dfSkydiveFlowsLabeledAgg)
#dfSkydiveFlowsLabeledAgg['flow_duration']['median'].plot(kind='line',title='median flow duration (QoS)')

In [117]:
#print(dfSkydiveFlowsLabeled['_source.Metric.RTT'].head(10))


# Training set QoS: statistics of the Skydive RTT per labeled window.
plot4Ms(dfSkydiveFlowsRTTLabeledAgg,'begin','_source.Metric.RTT','training samples','rtt','HTTP Non-Persistent Connection <br> Wordpress RTT (QoS)')
# The two disabled lines below are leftovers that refer to the flow-duration aggregate.
#print(dfSkydiveFlowsLabeledAgg)
#dfSkydiveFlowsLabeledAgg['flow_duration']['median'].plot(kind='line',title='median flow duration (QoS)')
# Dump the aggregate for inspection.
print(dfSkydiveFlowsRTTLabeledAgg)

             begin _source.Metric.RTT                                      \
                                  min          max   median          mean   
0    1561533327525               7289        60487  32780.0  3.172244e+04   
1    1561533626922              22135       255124  64055.0  1.033121e+05   
2    1561536408767              10720        51774  30980.0  3.137581e+04   
3    1561536680176               9749    459331460  32470.0  5.090407e+07   
4    1561537487919               9518    143856265  32342.5  9.016497e+06   
5    1561537769687              10612        35096  31450.5  2.735969e+04   
6    1561540020595              20997        39038  30703.5  2.968350e+04   
7    1561540296804              12455    196665665  32141.0  2.134413e+07   
8    1561540898132              11858        41942  28447.0  2.691925e+04   
9    1561541194579              15296    978039894  30146.0  6.115389e+07   
10   1561544554326              10131        43898  30390.0  2.977150e+04   

In [118]:

# Test set QoE: statistics of the JMeter 'avg' column per labeled window.
plot4Ms(dfJmeterLabeledAggTest,'begin','avg','testing samples','service completion time (ms)','Measured QoE (service completion time)')

In [119]:

# Test set QoS: statistics of the Skydive flow duration per labeled window.
plot4Ms(dfSkydiveFlowsLabeledAggTest,'begin','flow_duration','testing samples','flow duration (ms)','Measured QoS (flow duration)')

In [120]:

# Test set QoS: statistics of the Skydive RTT per labeled window. The title
# now names the metric that is plotted; it previously carried the
# "Flow Duration" title copied from the flow-duration plot.
plot4Ms(dfSkydiveFlowsRTTLabeledAggTest,'begin','_source.Metric.RTT','testing samples','RTT','Measured QoS (RTT)')

In [121]:
# Response time statistics per stress configuration (instead of per window).
# Groups with a NaN statistic are dropped; stress_test stays as the index.
dfJmeterLabeledAgg1 = dfJmeterLabeled.groupby(['stress_test']).agg({'avg': ['min','max','median','mean','std']}).dropna()
print(dfJmeterLabeledAgg1)
#dfJmeterLabeledAgg1['avg']['median'].plot(kind='line',title='median avg resonse time (QoE)')

              avg                                         
              min    max median          mean          std
stress_test                                               
 no_stress   4320  34131   8532  13253.058773  7899.867839
iperf-t10    4321  33207  11451  13305.520497  6633.226549
iperf-t20    4320  32414  12412  14346.813028  7184.019510
iperf-t30    4320  33559  13218  14999.316294  7712.550153
iperf-t40    4320  34103  13013  14638.675132  7564.205007
iperf-t50    4320  32904  14231  15498.743265  6799.509502
iperf-t60    4695  22548   8759  13477.000000  7079.736818
r30g30       6760  30616   9472  11793.000000  7884.271032


In [122]:
# Flow duration min / max / median per stress configuration; stress_test
# stays as the index.
dfSkydiveFlowsLabeledAgg1 = dfSkydiveFlowsLabeled.groupby(['stress_test']).agg({'flow_duration': ['min','max','median']}).dropna()
print(dfSkydiveFlowsLabeledAgg1)
#dfSkydiveFlowsLabeledAgg1['flow_duration']['median'].plot(kind='line',title='median flow duration (QoS)')

            flow_duration               
                      min     max median
stress_test                             
 no_stress           4316   52693  12221
iperf-t10            4316   68062  13376
iperf-t20            4316  702754  14862
iperf-t30            2811  108826  15393
iperf-t40            4316  640930  15288
iperf-t50            4317   51428  16280
iperf-t60            4692   31016  13135
r30g30               6756   49765  14669


In [123]:
#plotAgg2(dfJmeterLabeledAgg,dfSkydiveFlowsLabeledAgg,'avg','median','flow_duration','median','begin','median avg response time','median flow duration (QoS)','begin','millisecond','HTTP Non-Persistent Connection (multiple sample runs)<br> median avg response time (QoE) and flow duration (Qos)' )

In [124]:
#plotAgg2(dfJmeterLabeledAgg1.reset_index(),dfSkydiveFlowsLabeledAgg1.reset_index(),'avg','median','flow_duration','median','begin','median avg response time','median flow duration (QoS)','begin','millisecond','median avg response time (QoE) and flow duration (Qos)' )

## Correlation between the aggregated average response time (QoE) and flow duration (QoS), merged on window begin time across multiple samples

In [125]:
# Join the per-window QoE aggregate (response time) with the per-window QoS
# aggregate (flow duration) on the window begin time, then correlate each
# statistic of one with the same statistic of the other.
df1 = dfJmeterLabeledAgg
df2 = dfSkydiveFlowsLabeledAgg
#df1 = df1[['time','avg']]
#df2 = df2[['_source.Metric.Last','flow_duration']]
# pd.merge defaults to an inner join, so only windows present in both frames remain.
dfMerged = pd.merge(df1,df2,left_on='begin',right_on='begin').dropna()
#df = df[df['flow_duration'] != 0]
#print(df.head())
#print(df.count())
#print(df['avg']['median'].corr(df['flow_duration']['median'],method='spearman'))
# Pearson correlation, statistic by statistic.
print("median correlation",dfMerged['avg']['median'].corr(dfMerged['flow_duration']['median'],method='pearson'))
print("mean correlation",dfMerged['avg']['mean'].corr(dfMerged['flow_duration']['mean'],method='pearson'))
print("min correlation",dfMerged['avg']['min'].corr(dfMerged['flow_duration']['min'],method='pearson'))
print("max correlation",dfMerged['avg']['max'].corr(dfMerged['flow_duration']['max'],method='pearson'))
print("std correlation",dfMerged['avg']['std'].corr(dfMerged['flow_duration']['std'],method='pearson'))

median correlation 0.59446115512



dropping on a non-lexsorted multi-index without a level parameter may impact performance.



mean correlation 0.526729281467
min correlation 0.26081491765
max correlation 0.0783243358334
std correlation 0.0602274048755


In [126]:
#print(dfSkydiveFlowsRTTLabeledAgg.head(10))
#columns = ['_source.Metric.RTT']
#dfRTT = dfSkydiveFlowsRTTLabeledAgg['_source.Metric.RTT'].drop(columns,axis=1)
#print(dfRTT)


## Debugging data
* 

In [127]:
#for b in dfJmeterLabeled.begin.unique():
#    stress = dfIndex[dfIndex.begin == b].stress_test
#    plotJmeter(dfJmeterLabeled[dfJmeterLabeled.begin == b],"wordpress response time (QoE) sample "+str(b)+" ("+stress+")")
#    plotSkydiveFlows(dfSkydiveFlowsLabeled[dfSkydiveFlowsLabeled.begin == b],"wordpress flow duration (QoS) sample "+str(b)+" ("+stress+")","flow_duration","flow duration (ms)")

   

In [128]:
#print(dfSkydiveFlowsLabeled.count())
#print(dfSkydiveFlowsLabeled['_source.LastUpdateMetric.ABBytes'].unique())
#print(dfSkydiveFlowsLabeled['_source.LastUpdateMetric.BABytes'].unique())
#todo have to merge these for flowduration
# Debugging aid: look for flow UUIDs that occur more than once in the labeled
# Skydive flows and inspect one of them.
vc = dfSkydiveFlowsLabeled["_source.UUID"].value_counts()
# Non-null count of every column.
print(dfSkydiveFlowsLabeled.count())
# UUIDs that appear on more than one row.
print( vc[vc > 1])
# This bare expression is not the last statement of the cell, so its result
# is neither displayed nor stored.
dfSkydiveFlowsLabeled[dfSkydiveFlowsLabeled["_source.UUID"] == 'd0ce31611f3d1317' ]
# Start / last timestamps and the derived duration: first for all flows, then
# for the single UUID under inspection.
print(dfSkydiveFlowsLabeled[["_source.UUID",'_source.Metric.Start','_source.Metric.Last','flow_duration']])
print(dfSkydiveFlowsLabeled[dfSkydiveFlowsLabeled["_source.UUID"] == 'd0ce31611f3d1317' ][["_source.UUID",'_source.Metric.Start','_source.Metric.Last','flow_duration']])
#print(dfSkydiveFlowsLabeled[[dfSkydiveFlowsLabeled["_source.UUID"] == 'd0ce31611f3d1317' ]["_source.UUID",'_source.Metric.Start','_source.Metric.Last','flow_duration']])

Unnamed: 0                            9651
_id                                   9651
_index                                9651
_score                                9651
_source.Application                   9651
_source.FinishType                    9588
_source.ICMP.Code                        0
_source.ICMP.Type                        0
_source.IPMetric.FragmentErrors       9651
_source.IPMetric.Fragments            9651
_source.L3TrackingID                  9651
_source.Last                          9651
_source.LastUpdateMetric.ABBytes      9651
_source.LastUpdateMetric.ABPackets    9651
_source.LastUpdateMetric.BABytes      9651
_source.LastUpdateMetric.BAPackets    9651
_source.LastUpdateMetric.Last         9651
_source.LastUpdateMetric.RTT            63
_source.LastUpdateMetric.Start        9651
_source.LayersPath                    9651
_source.Link.A                        9651
_source.Link.B                        9651
_source.Link.ID                       9651
_source.Lin