#**Homework**

Complete the following tasks:

* Use a dataset of 21 Video Sessions
* Recognize the Video Server(s) IP and select video traffic (***if more than one Server is found, keep the dominant flow only***)
* Detect Video Client HTTP Requests (Uplink packets with size larger or equal to 100 Bytes)
* Compute features to predict:
 1.   When the next UL Request is sent by the Video Client 
 2.   How large is the response of the Server to the next UL Request

**N.B.**: Below, you can find a list of useful functions for the tasks at hand (introduced during class).

In [99]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

### Functions Ready-To-Use

In [100]:
def filter_traffic(data, domain):
   
    # Look in DNS Responses for googlevideo domain
    dns_data = data[data['Protocol']=='DNS']
    dns = dns_data[dns_data['Info'].apply(lambda x: 'googlevideo' in x and 'response' in x)]
    ips = dns.Address.values 
    server_names = dns.Name.values
    
    # Filtering on either "Source" or "Destination" IP, get the 
    # rows of the dataset that contain at least one of the selected IPs
    downlink = data[data['Source'].apply(lambda x: x in ips)].dropna(subset=['Length']) 

    uplink = data[data['Destination'].apply(lambda x: x in ips)].dropna(subset=['Length'])
    
    return ips, server_names, uplink, downlink

def find_dominant(uplink, downlink):

  # Expressed in MB

  # Order flows by cumulative DL Volume
  flows_DL = downlink.groupby(['Source','Destination'])['Length'].sum()/(10**6)
  print(flows_DL)
  
  # Get (Source,Destination) IPs of dominant flows
  dom_id = flows_DL[flows_DL==max(flows_DL)].index[0]

  # Filter traffic selecting the dominant flow
  dom_dl = downlink[downlink['Source']==dom_id[0]]
  dom_ul = uplink[(uplink['Source']==dom_id[1])]

  return dom_ul, dom_dl

def timebased_filter(data, length=None, min_time=None, max_time=None):
  '''
  :param data: pd dataframe to be filtered. Must contain columns: "Length" and "Time"
  :param length: all packets shorter than length [Bytes] will be discarded (default 0)
  :param min_time: all packets with timestamp smaller than min_time [s] will be discarded (default 0)
  :param max_time: all packets with timestamp larger than max_time [s] will be discarded (default 1000)
  '''

  if length is None:
    length=0
  if min_time is None:
    min_time = 0
  if max_time is None:
    max_time = 1000
  
  filtered_data = data.copy().reset_index()
  mask = (filtered_data['Length']>=length) & (filtered_data['Time']>=min_time) & (filtered_data['Time']<= max_time)
  filtered_data = filtered_data.loc[mask[mask ==True].index]

  return filtered_data

def find_next(array, value):
  '''
  :param array: np.array, array of floats
  :param value: float, reference value
  :return: position of the closest element of the array greater than "value"
  '''
  delta = np.asarray(array) - value
  idx = np.where(delta >= 0, delta, np.inf).argmin()

  return idx

def normalize_dataset(training_set, test_set):

  mean_train = training_set.mean()
  std_train = training_set.std()
  norm_train = (training_set - mean_train)/std_train
  norm_test = (test_set - mean_train)/std_train  

  return norm_train, norm_test

### Functions to be completed


In [101]:
def features_extraction(uplink, downlink):
  '''
  Complete this function to extract both features and groundtruth.

  NB: The features extraction process is the same as the one introduced during
  the lecture. 
  '''
  dataset = pd.DataFrame(columns=['Request_Size','Inter_RR_Time','DL_Time','DL_Vol','DL_Size','PB_Time'])
  # ****************************************************************************
  # Feature 1: Client Request Size(Size of Last HTTP request)
  dataset['Request_Size'] = list(uplink.Length.values)
  
  # ****************************************************************************
  # Feature 2: Inter Request-Response Time
  rr_time = []
  response_time = []
  for t in uplink.Time:
    response_time.append(find_next(downlink.Time, t)) #index of next DL packet timestamp 
    rr_time.append(downlink.Time.iloc[response_time[-1]] - t)

  dataset['Inter_RR_Time'] = rr_time

  # ****************************************************************************
  # Feature 3-4-5: Download Time, Download Volume, Download Size (# Packets) 
  dt = []
  dv = []
  ds = []
  
  # Needed two structures in which volumes and times will be added
  next_burst_size_list = []
  next_request_time_list = []
 
  for rt1, rt2 in zip(response_time[:-1], response_time[1:]):

    #Download Time
    dt.append(downlink.Time.iloc[rt2-1] - downlink.Time.iloc[rt1])

    temp = timebased_filter(downlink, 0, downlink.Time.iloc[rt1], downlink.Time.iloc[rt2-1])
    
    #Download Volume
    dv.append(temp.Length.sum())

    #a list to be used to populate groundtruth dataframe
    next_burst_size_list.append(dv[-1])
    #I have applied the find next function based on the hint given 
    next_request_time_list.append(uplink.Time.iloc[find_next(uplink.Time, downlink.Time.iloc[rt2-1])] - downlink.Time.iloc[rt2-1])
    
    #Download Size (# Packets) 
    ds.append(temp.shape[0])

  # Last Iteration data might be corrupted due to drastic interruption of capture 
  # process. If it is so, an error would occur during the features extraction.
  # To avoid this, we skip last HTTP iteration data when an error is raised 
  # using the try-except logic below.
  try:
    # Consider also last HTTP iteration
    #Download Time
    dt.append(downlink.Time.iloc[-1] - downlink.Time.iloc[rt2])

    temp = timebased_filter(downlink, 0, downlink.Time.iloc[rt2], downlink.Time.iloc[-1])
    #Download Volume
    dv.append(temp.Length.sum())

    #Download Size (# Packets) 
    ds.append(temp.shape[0])
  except:
    print()

  dataset['DL_Time'] = dt
  dataset['DL_Vol'] = dv
  dataset['DL_Size'] = ds

  # ****************************************************************************
  # Feature 5: Playback Time
  pbt = list(uplink.Time.values)
  dataset['PB_Time'] = pbt
  # ****************************************************************************
  
  
  # Check Features Consistency
  dataset = dataset[(dataset > 0).all(1)]
  dataset = dataset[dataset['DL_Time']<20]

  ###############################################################
  # TO BE COMPLETED

  ### EXTRACT GROUNDTRUTH HERE
  groundtruth = pd.DataFrame(columns=['Next_Request_Time','Next_Response_Vol'])
  # ****************************************************************************
  # GT 1: Next Request Time
  #The arrival time of the next HTTP Request is the time between the last DL packet and the next “large” UL packet
  groundtruth['Next_Request_Time'] = next_request_time_list
  # ****************************************************************************
  # GT 2: Next Response Volume
  #his groundtruth comes from free from "Server Burst Volume" feature…
  groundtruth['Next_Response_Vol'] = next_burst_size_list

  ###############################################################


  # Check Ground Truth Consistency
  groundtruth.dropna(inplace=True)

  # Align Dataset and Groundtruth
  intersection = set(dataset.index).intersection(set(groundtruth.index))
  dataset = dataset.loc[intersection,:]
  groundtruth = groundtruth.loc[intersection,:]
 

  return dataset, groundtruth

### Write your code here



Google Drive is uploaded in order to select all the capture files.

In [102]:
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [103]:
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd


In these first lines of code, a for loop is used to iterate over all the files contained in the '/content/gdrive/MyDrive/homework3' path.
Then only the relevant traffic is considered and after found the dominant up and dowlink traffic the dataframes are populated.

In [6]:
path = '/content/gdrive/MyDrive/homework3'
tcpdumpfiles = [f for f in listdir(path) if isfile(join(path, f))]
gt= np.unique([f.split('.')[0] for f in tcpdumpfiles])

uplink = pd.DataFrame()
downlink = pd.DataFrame()

for f in tcpdumpfiles:
  capture = pd.read_csv(join(path,f))
  
  domain_name = 'googlevideo'

  #filter only relevant traffic
  ips, server_names, uplink, downlink = filter_traffic(capture,domain_name)

  # for each capture consider just the dominant traffic flows
  dom_ul, dom_dl = find_dominant(uplink, downlink)
  
  #applying the features extraction function
  dataset, groundtruth = features_extraction(dom_ul, dom_dl)

  #print(dataset)

Source          Destination
74.125.163.138  192.168.1.6    0.008266
74.125.99.91    192.168.1.6    0.008056
Name: Length, dtype: float64
Source          Destination
74.125.104.103  192.168.1.6     0.012088
74.125.111.106  192.168.1.6     0.015628
74.125.154.138  192.168.1.6     1.054021
91.81.217.140   192.168.1.6    17.478906
Name: Length, dtype: float64
Source          Destination
74.125.111.105  192.168.1.6    7.992543
74.125.99.105   192.168.1.6    0.009947
91.81.217.140   192.168.1.6    2.965944
Name: Length, dtype: float64
Source         Destination
74.125.153.24  192.168.1.6    4.127591
Name: Length, dtype: float64
Source          Destination
74.125.111.106  192.168.1.6     2.080363
91.81.217.141   192.168.1.6    32.572815
Name: Length, dtype: float64
Source           Destination
173.194.160.200  192.168.1.6    0.839776
173.194.187.71   192.168.1.6    1.781121
74.125.105.10    192.168.1.6    0.012088
91.81.217.140    192.168.1.6    8.902818
Name: Length, dtype: float64
Source   

# Regression Task

In this part of the homework is achieved the regression task on the two continuos quantities 'Next_Request_Time' and 'Next_Response_Vol'. 
It is used a Random Forest regressor using as criterion the minimum square error.
The procedure followed was to split the dataframe into two subsets and on those distinctly is applied the regressor.




In [104]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import KFold
import random
import pandas as pd
import math


kf = KFold(n_splits=5)
random_forest_Regressor = RandomForestRegressor(n_estimators=1,criterion = 'mse')

#list used to calculate the minimum mean square error
mse_nrt_list = []
mse_nrv_list = []

#splitting the groundtruth into two dataframes used to find the mean square errors
nrt = groundtruth['Next_Request_Time']
nrv = groundtruth['Next_Response_Vol']


**Training and prediction of the next http request**

In [105]:
for train_index,test_index in kf.split(dataset):

  #defining train and test sets
  data_train,data_test = dataset.iloc[train_index,:], dataset.iloc[test_index,:]
  labels_train,labels_test = nrt.iloc[train_index], nrt.iloc[test_index]
  
  #normalizing train and test sets  
  norm_train, norm_test = normalize_dataset(data_train,data_test)

  #applying regression alg
  random_forest_Regressor.fit(norm_train,labels_train)

  mse_nrt = random_forest_Regressor.predict(norm_test)

  mse_nrt_list.append(metrics.mean_squared_error(labels_test,mse_nrt))


#RMSE ARRIVAL TIME NEXT HTTP REQUEST

In [106]:
print('The mse obtained on the arrival time of the next http request is: ')
print("%.3f" % math.sqrt(metrics.mean_squared_error(labels_test,mse_nrt)*1000), 'ms')


The mse obtained on the arrival time of the next http request is: 
0.003 ms


**Training and prediction of the next burst volume**

In [107]:
for train_index,test_index in kf.split(dataset):

  #defining train and test sets
  data_train2,data_test2 = dataset.iloc[train_index,:], dataset.iloc[test_index,:]
  labels_train2,labels_test2 = nrv.iloc[train_index], nrv.iloc[test_index]
  
  #normalizing train and test sets  
  norm_train2, norm_test2 = normalize_dataset(data_train2,data_test2)
  
  #applying regression alg
  random_forest_Regressor.fit(norm_train2,labels_train2)

  mse_nrv = random_forest_Regressor.predict(norm_test2)

  mse_nrv_list.append(metrics.mean_squared_error(labels_test2,mse_nrv))

  

#RMSE NEXT BURST VOLUME

In [108]:
print('The mse obtained on the volume for the next burst is: ')
print("%.3f" % math.sqrt(metrics.mean_squared_error(labels_test2,mse_nrv)),'Kb')


The mse obtained on the volume for the next burst is: 
328.086 Kb
