### One Hot Encoding with Ensemble Selection

### Data: Internet Firewall Data Set

### Data Source: https://www.kaggle.com/tunguz/internet-firewall-data-set

In [39]:
import pandas as pd
import numpy as np

In [40]:
df = pd.read_csv('Datasets/internet_firewall.csv')

In [41]:
df.head()

Unnamed: 0,SourcePort,DestinationPort,NATSourcePort,NATDestinationPort,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18


### Using only the first seven columns

In [43]:
df = pd.read_csv('Datasets/internet_firewall.csv', usecols=['SourcePort', 'DestinationPort', 'NATSourcePort', 'NATDestinationPort', 'Packets'])

In [44]:
df.head()

Unnamed: 0,SourcePort,DestinationPort,NATSourcePort,NATDestinationPort,Packets
0,57222,53,54587,53,2
1,56258,3389,56258,3389,19
2,6881,50321,43265,50321,2
3,50553,3389,50553,3389,15
4,50002,443,45848,443,31


### Searching unique categories 
### col.unique but looping through the columns

In [45]:
for col in df.columns:
    print(col, ': ', len(df[col].unique()), ' labels')

SourcePort :  22724  labels
DestinationPort :  3273  labels
NATSourcePort :  29152  labels
NATDestinationPort :  2533  labels
Packets :  1116  labels


### Checking what no. of cols we will get if we encode 

In [46]:
pd.get_dummies(df, drop_first=True).shape

(65532, 5)

5

## Ensemble Selection

In [47]:
df.Packets.value_counts()

1        29829
2        16629
4         1852
6         1078
19         737
         ...  
274          1
242          1
12500        1
1873         1
9515         1
Name: Packets, Length: 1116, dtype: int64

In [49]:
df.Packets.value_counts().sort_values(ascending=False).head(10)

1     29829
2     16629
4      1852
6      1078
19      737
10      717
18      591
20      580
16      576
17      543
Name: Packets, dtype: int64

### Taking top 10 values that are most frequent

In [51]:
top_10 = [x for x in df.Packets.value_counts().sort_values(ascending=False).head(10).index]
top_10

[1, 2, 4, 6, 19, 10, 18, 20, 16, 17]

### Fetch the top_10 data
### replace with 1 if its the same col from top_10 else replace with 0

In [58]:
for label in top_10:
    df[label] = np.where(df['Packets']==label,1,0)
    
df[['Packets']+top_10].head(30)

Unnamed: 0,Packets,1,2,4,6,19,10,18,20,16,17
0,2,0,1,0,0,0,0,0,0,0,0
1,19,0,0,0,0,1,0,0,0,0,0
2,2,0,1,0,0,0,0,0,0,0,0
3,15,0,0,0,0,0,0,0,0,0,0
4,31,0,0,0,0,0,0,0,0,0,0
5,21,0,0,0,0,0,0,0,0,0,0
6,6,0,0,0,1,0,0,0,0,0,0
7,23,0,0,0,0,0,0,0,0,0,0
8,1,1,0,0,0,0,0,0,0,0,0
9,31,0,0,0,0,0,0,0,0,0,0
