In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  
%matplotlib inline

## 1. Introduce the Data

### Import the dataset

`dataset.txt` is a tab-delimited file with around 500,000 (5 lakh) rows and 24 features. We read the data into a dataframe called `df` and then assign the feature names to their respective columns in `df`.

In [2]:
# Names of the 24 tab-separated fields, in the order they appear in the file
column_list = [
    'duration', 'service', 'source_bytes', 'destination_bytes',
    'count', 'same_srv_rate', 'serror_rate', 'srv_serror_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_src_port_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'flag',
    'ids_detection', 'malware_detection', 'ashula_detection',
    'label',
    'source_ip_address', 'source_port_number',
    'destination_ip_address', 'destination_port_number',
    'start_time', 'protocol',
]

# The file has no header row; pass nrows=... to read_csv to load only a sample
df = pd.read_csv('dataset.txt', sep="\t", header=None)
df.columns = column_list

FileNotFoundError: File b'dataset.txt' does not exist

In [None]:
# Preview the first five rows of the loaded data
df.head(n=5)

In [None]:
# (number of rows, number of features) of the loaded frame
df.shape

### Remove the target variable from the dataset
the target variable will be `label`

 **Important:**
 `label = 0` if there is no intrusion, `1` otherwise.

Although `malware_detection`, `ids_detection`, and `ashula_detection` could also be taken as labels, we decided against it: they only indicate whether a piece of detection software flagged an intrusion, and may therefore not be correctly labelled.

In [None]:
# Binarise the target in one vectorised step: raw values equal to 1 become 0,
# every other value becomes 1.
# NOTE: not idempotent -- re-running this cell on the already-mapped column
# swaps the two classes, so run it exactly once after loading the data.
df['label'] = (df['label'] != 1).astype(int)

In [None]:
# Remove the target column from the feature frame and keep its values as an array
label_target = df.pop('label').values

# The detector-output columns are not used for training; drop them in place.
# (pop mutates df, so re-running this cell raises KeyError.)
for detector_column in ('ids_detection', 'malware_detection', 'ashula_detection'):
    df.pop(detector_column)

df.shape

In [None]:
# y is the target vector: the values popped from the `label` column
y = label_target

### Checking String Based Features

Here we check the unique values of the string-based features to decide whether they are worth keeping. If a feature has a very large number of unique values, it is ignored, since it might lead to overfitting of the model.

In [None]:
# Frequency of each distinct value of the string-valued `service` feature
service_value_counts = df['service'].value_counts()
print("Number of unique values = ", len(service_value_counts), "\n")
print(service_value_counts)

In [None]:
# Frequency of each distinct value of the string-valued `protocol` feature
protocol_value_counts = df['protocol'].value_counts()
print("Number of unique values = ", len(protocol_value_counts), "\n")
print(protocol_value_counts)

In [None]:
# Frequency of each distinct value of the string-valued `flag` feature.
# Stored under its own name so the protocol counts from the previous cell
# are not overwritten.
flag_value_counts = df['flag'].value_counts()
print("Number of unique values = ", flag_value_counts.shape[0], "\n")
print(flag_value_counts)

### Unique values for each feature in the dataset

In [None]:
# Number of distinct values per feature, computed column-wise without
# transposing the whole frame first
df.nunique()

### Removing unnecessary features
The features from `source_ip_address` to `start_time` (addresses, port numbers and the start time) are dropped because their values are effectively random and are not expected to help the model.

In [None]:
# Drop the address, port and timestamp columns in place.
# (pop mutates df, so re-running this cell raises KeyError.)
for unused_column in (
    'source_ip_address',
    'source_port_number',
    'destination_ip_address',
    'destination_port_number',
    'start_time',
):
    df.pop(unused_column)

df.head()

### Features to use

In [None]:
# Names of the features that remain in the frame
df.columns.tolist()

### What the data looks like now in terms of data type

In [None]:
# Per-column dtype and non-null count summary
df.info()

### Transform Categorical Data to Numerical Data

From the above result it can be seen that `service`, `flag`, and `protocol` are not numeric. Since machine learning models must be supplied with numeric data, we have to transform these categorical features into numeric ones.

For this purpose we use a Label Encoder that encodes the unique values of a feature to a unique numeric constant(number). We do this encoding for all the rows in the data.

In [None]:
from sklearn import preprocessing

In [None]:
# Names of the text-valued features that have to be encoded as numbers
categorical_data = ['service', 'flag', 'protocol']

# Distinct values of each categorical feature
unique_service_data = df['service'].unique()
unique_flag_data = df['flag'].unique()
unique_protocol_data = df['protocol'].unique()

#### Encoder for feature : Flag

This example shows how a categorical feature such as `flag` is encoded from its categorical values to numbers, and then decoded from those numbers back to the original categorical values.

In [None]:
# Learn the flag -> integer mapping from the distinct flag values
le_flag = preprocessing.LabelEncoder()
le_flag.fit(unique_flag_data)

# Round-trip the flags of the first five rows: text -> integer codes -> text
example_flag_data = list(df['flag'].head())
encoded_flag_data = le_flag.transform(example_flag_data)
decoded_flag_data = list(le_flag.inverse_transform(encoded_flag_data))

# Show the original values, their codes, and the decoded values in turn
# (the original author notes that any warnings printed here can be ignored)
for stage in (example_flag_data, encoded_flag_data, decoded_flag_data):
    print(stage)

#### Encoder for feature : service and protocol

In [None]:
# One encoder per remaining categorical feature, each fitted on that
# feature's distinct values
le_service = preprocessing.LabelEncoder().fit(unique_service_data)

le_protocol = preprocessing.LabelEncoder()
le_protocol.fit(unique_protocol_data)

#### Encode the categorical features for all rows in the data

In [None]:
# Replace each categorical column with its integer codes, using the encoder
# fitted for that column
for column, encoder in (('flag', le_flag), ('service', le_service), ('protocol', le_protocol)):
    df[column] = encoder.transform(df[column])

df.head()

### PCA

PCA stands for Principal Component Analysis. This algorithm is used for dimensionality reduction.
The algorithm is given the number of dimensions to output and then automatically computes the new dimensions from the old ones. Each new dimension is a linear combination of the old dimensions.

In [None]:
from sklearn.decomposition import PCA

# Project the features onto their first two principal components.
# random_state is fixed (as for the other estimators in this notebook) so the
# projection is reproducible if a randomized SVD solver is selected.
# NOTE(review): the features are not scaled before PCA, so large-magnitude
# columns presumably dominate the components -- confirm this is intended.
pca = PCA(n_components = 2, random_state = 0)
X_pca = pd.DataFrame(pca.fit_transform(df))

In [None]:
# First rows of the two-component PCA projection
X_pca.head()

## Model Building

We have used 3 different types of machine learning models which are:

    -KMeans
    -Logistic Regression
    -Random Forests
    
Here KMeans is an unsupervised learning model and the other two are supervised learning models.

### Kmeans

Here we use 2 clusters because we want the data to cluster into 2 clusters: Intrusion or Not Intrusion

In [None]:
from sklearn.cluster import KMeans

#sklearn is given the PCA projection as a float numpy array
X_pca_np = np.array(X_pca).astype(float)

#two-cluster model with a fixed seed, fitted on the projected points
kmeans = KMeans(n_clusters = 2, random_state = 0)
kmeans.fit(X_pca_np)

#points coloured by assigned cluster, with the centroids drawn in black
first_component, second_component = X_pca_np[:, 0], X_pca_np[:, 1]
plt.scatter(first_component, second_component, c = kmeans.labels_, cmap = 'rainbow')
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], color = 'black')

#value/count table for the cluster assignments, then for the true labels
predicted = kmeans.predict(X_pca_np)
for values in (predicted, y):
    unique, counts = np.unique(values, return_counts=True)
    print(np.column_stack((unique, counts)))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

#predict once and build the confusion matrix once; both are reused below
#NOTE(review): KMeans cluster ids are arbitrary, so cluster 0/1 need not line
#up with label 0/1 -- the accuracy below may reflect a swapped labelling.
kmeans_pred = kmeans.predict(X_pca_np)
kmeans_cm = confusion_matrix(y, kmeans_pred)

#fraction of points whose cluster id equals the true label
print("accuracy Score : ", accuracy_score(y, kmeans_pred))

#printing confusion matrix
print("Confusion Matrix\n", kmeans_cm)

#plotting confusion matrix
plt.imshow(kmeans_cm, cmap='Blues', interpolation='nearest')
plt.colorbar()
plt.grid(False)
plt.ylabel('true')
plt.xlabel('predicted');

#### Let's try changing the KMeans hyperparameters

In [None]:
#fitting data to a KMeans model capped at 100 iterations.
#algorithm='auto' is no longer passed: it was the library default when this
#was written and newer scikit-learn releases reject that value.
kmeans_modded = KMeans(n_clusters = 2, random_state = 0, max_iter = 100)
kmeans_modded.fit(X_pca_np)

#predict once and build the confusion matrix once; both are reused below
kmeans_modded_pred = kmeans_modded.predict(X_pca_np)
kmeans_modded_cm = confusion_matrix(y, kmeans_modded_pred)

#fraction of points whose cluster id equals the true label
print("accuracy Score : ", accuracy_score(y, kmeans_modded_pred))
#printing confusion matrix
print("Confusion Matrix\n", kmeans_modded_cm)

#plotting confusion matrix
plt.imshow(kmeans_modded_cm, cmap='Blues', interpolation='nearest')
plt.colorbar()
plt.grid(False)
plt.ylabel('true')
plt.xlabel('predicted');

#### Let's try scaling the data and then running KMeans

In [None]:
scaler = preprocessing.MinMaxScaler()

#scaling the two PCA components with the min-max scaler
X_pca_np_scaled = scaler.fit_transform(X_pca_np)

#refit the modded kmeans on the scaled data
#(this replaces the fit from the previous cell on the same estimator object)
kmeans_modded.fit(X_pca_np_scaled)

#predict once and reuse the result for both metrics
kmeans_scaled_pred = kmeans_modded.predict(X_pca_np_scaled)

#fraction of points whose cluster id equals the true label
print("accuracy Score : ", accuracy_score(y, kmeans_scaled_pred))
#printing confusion matrix
print("Confusion Matrix\n", confusion_matrix(y, kmeans_scaled_pred))





### Regression

#### Splitting the dataset
We split the data into Training and Test set based on the parameter train_size.

Example: if train_size = 0.70, then the training set contains 70% of the data and the test set the other 30%.

In [None]:
# Use train_test_split from sklearn.model_selection to split data into train and test sets
# (the older sklearn.cross_validation module has been removed from scikit-learn)
from sklearn.model_selection import train_test_split

# 70% of the rows go to the training set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.70, random_state=1)

In [None]:
# Function to build model and find model performance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def find_model_perf(X_train, y_train, X_test, y_test, func):
    """Fit a freshly constructed model and return its ROC AUC on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for scoring.
    func
        Zero-argument callable returning an unfitted estimator that provides
        ``fit`` and ``predict_proba`` (e.g. the ``LogisticRegression`` class).

    Returns
    -------
    The value of ``roc_auc_score(y_test, scores)``, where ``scores`` is the
    second column of ``model.predict_proba(X_test)``.
    """
    model = func()
    model.fit(X_train, y_train)
    # Take the second probability column in one slice instead of looping
    # over the rows in Python
    y_hat = np.asarray(model.predict_proba(X_test))[:, 1]
    return roc_auc_score(y_test, y_hat)

In [None]:
# ROC AUC of a default LogisticRegression trained and scored on the preprocessed split
auc_processed = find_model_perf(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    func=LogisticRegression,
)
print(auc_processed)

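This shows that logistic regression reaches a ROC AUC of about 0.985 on the test set. Note that AUC measures how well the model ranks intrusions above normal traffic; it is not the percentage of rows predicted correctly.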

### Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 100 trees with a fixed seed
randomForest = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
)

In [None]:
# Train the forest on the training split
randomForest.fit(X_train, y_train)

In [None]:
#predict once and build the confusion matrix once; both are reused below
rf_pred = randomForest.predict(X_test)
rf_cm = confusion_matrix(y_test, rf_pred)

#calculating the percentage of correct labels on the test split
print("accuracy Score : ", accuracy_score(y_test, rf_pred))
#printing confusion matrix
print("Confusion Matrix\n", rf_cm)

#plotting confusion matrix
plt.imshow(rf_cm, cmap='Blues', interpolation='nearest')
plt.colorbar()
plt.grid(False)
plt.ylabel('true')
plt.xlabel('predicted');

### Conclusion

Of the three models evaluated here, Random Forest is the best predictor for intrusion detection, with an accuracy score of 99.01% on the test set.