# Network Anomaly Detection - Machine Learning Model Deployment

**Import required Libraries**

In [2]:
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler

In [142]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#Accuracy score, confusion matrix, classification report, ROC curve, AUC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score

**High Level Overview and basic info of the Dataset**

In [17]:
train_df = pd.read_csv("Network_anomaly_data_1.csv")
train_df.head(4)

Unnamed: 0,protocoltype,srcbytes,dstbytes,wrongfragment,loggedin,count,dsthostsrvcount,dsthostsrvserrorrate,lastflag,target
0,tcp,491,0,0,0,2,25,0.0,20,normal
1,udp,146,0,0,0,13,1,0.0,15,normal
2,tcp,0,0,0,0,123,26,1.0,19,attack
3,tcp,232,8153,0,1,5,255,0.01,21,normal


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   protocoltype          125973 non-null  object 
 1   srcbytes              125973 non-null  int64  
 2   dstbytes              125973 non-null  int64  
 3   wrongfragment         125973 non-null  int64  
 4   loggedin              125973 non-null  int64  
 5   count                 125973 non-null  int64  
 6   dsthostsrvcount       125973 non-null  int64  
 7   dsthostsrvserrorrate  125973 non-null  float64
 8   lastflag              125973 non-null  int64  
 9   target                125973 non-null  object 
dtypes: float64(1), int64(7), object(2)
memory usage: 9.6+ MB


### Binary Encoding of Categorical Variables

In [18]:
train_df['protocoltype']= train_df['protocoltype'].map({'tcp':1, 'udp':2, 'icmp':3})
train_df['target']= train_df['target'].map({'normal':0, 'attack':1})

In [14]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   protocoltype          125973 non-null  int64  
 1   srcbytes              125973 non-null  int64  
 2   dstbytes              125973 non-null  int64  
 3   wrongfragment         125973 non-null  int64  
 4   loggedin              125973 non-null  int64  
 5   count                 125973 non-null  int64  
 6   dsthostsrvcount       125973 non-null  int64  
 7   dsthostsrvserrorrate  125973 non-null  float64
 8   lastflag              125973 non-null  int64  
 9   target                125973 non-null  int64  
dtypes: float64(1), int64(9)
memory usage: 9.6 MB


### Checking for Missing Values

In [15]:
train_df.isnull().sum()

protocoltype            0
srcbytes                0
dstbytes                0
wrongfragment           0
loggedin                0
count                   0
dsthostsrvcount         0
dsthostsrvserrorrate    0
lastflag                0
target                  0
dtype: int64

In [19]:
## dropping all the missing values
train_df = train_df.dropna()
train_df.isnull().sum()

protocoltype            0
srcbytes                0
dstbytes                0
wrongfragment           0
loggedin                0
count                   0
dsthostsrvcount         0
dsthostsrvserrorrate    0
lastflag                0
target                  0
dtype: int64

### Segregating the target variable from the features

In [20]:
X = train_df[['protocoltype', 'srcbytes', 'dstbytes', 'wrongfragment', 'loggedin', 'count', 'dsthostsrvcount', 'lastflag']]
y = train_df.target
X.shape, y.shape

((125973, 8), (125973,))

### Splitting the data

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=5)

### Model Training

In [22]:
from sklearn import linear_model


model = linear_model.LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Cross Validation

In [23]:
from sklearn.metrics import accuracy_score

pred_val = model.predict(X_val)
accuracy_score(y_val, pred_val)

0.8810081365350267

In [24]:
pred_train = model.predict(X_train)
accuracy_score(y_train, pred_train)

0.8855504177499057

### Saving the model

In [25]:
# saving the model
import pickle

pickle_out = open("classifier.pkl", mode = "wb")
pickle.dump(model, pickle_out)
pickle_out.close()

### Test Prediction for a Sample Data Point

In [38]:
network_anomaly = {
    'protocoltype': 1,
    'srcbytes': 495,
    'dstbytes': 8155,
    'wrongfragment': 3,
    'loggedin': 0,
    'count' : 412,
    'dsthostsrvcount' : 213,
    'lastflag' : 20
}

#### Code for app.py

In [27]:
model_pickle = open("classifier.pkl", 'rb')
clf = pickle.load(model_pickle)

In [28]:
from flask import Flask, request, jsonify

In [42]:

def prediction(network_anomaly):
    # Pre-processing user input
#     anomaly = request.get_json()
    print(network_anomaly)

    if network_anomaly['protocoltype'] == "tcp":
        protocoltype = 1
    elif network_anomaly['protocoltype'] == "udp":
        protocoltype = 2
    else:
        protocoltype = 3

    srcbytes = network_anomaly['srcbytes']
    dstbytes = network_anomaly['dstbytes']
    wrongfragment = network_anomaly['wrongfragment']
    loggedin = network_anomaly['loggedin']
    count = network_anomaly['count']
    dsthostsrvcount = network_anomaly['dsthostsrvcount']
    lastflag = network_anomaly['lastflag']

    # Making predictions
    prediction = clf.predict([[protocoltype, srcbytes, dstbytes, wrongfragment, loggedin, count, dsthostsrvcount, lastflag]])

    if prediction == 0:
        pred = "Normal"
    else: 
        pred = "Anomolous"
        
    return pred

In [43]:
prediction(network_anomaly)

{'protocoltype': 1, 'srcbytes': 495, 'dstbytes': 8155, 'wrongfragment': 3, 'loggedin': 0, 'count': 412, 'dsthostsrvcount': 213, 'lastflag': 20}




'Anomolous'