In [188]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline

In [189]:
# Load the training split from a comma-separated text file.
# NOTE(review): with pandas' default header handling the first line of the file
# is consumed as column names, and those names are overwritten further down by
# `train.columns=columns`. If the file has no header row, that first record is
# lost -- confirm, and pass header=None if so.
train = pd.read_csv('../input/network-intrusion-detection/train_data.txt',sep=',')
# Preview the first rows.
train.head()

In [190]:
# Load the test split from a comma-separated text file.
# NOTE(review): with pandas' default header handling the first line of the file
# is consumed as column names, and those names are overwritten further down by
# `test.columns=columns`. If the file has no header row, that first record is
# lost -- confirm, and pass header=None if so.
test=pd.read_csv('../input/network-intrusion-detection/test_data.txt',sep=',')
# Preview the first rows.
test.head()

In [191]:
# Column names applied to both frames, in file order: 41 connection attributes
# followed by the `attack` label and a trailing `last_flag` column (43 names).
# NOTE(review): the meaning of `last_flag` is not visible here -- confirm what it
# encodes before treating it as a predictor.
columns=["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot",
         "num_failed_logins","logged_in","num_compromised","root_shell","su_attempted","num_root","num_file_creations", 
         "num_shells","num_access_files","num_outbound_cmds","is_host_login","is_guest_login","count","srv_count","serror_rate",
         "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate",
         "dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
         "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate",
         "dst_host_srv_rerror_rate","attack","last_flag"] 

In [192]:
# Sanity check: number of names in `columns`, to compare against the frame widths.
len(columns)

In [193]:
# Replace the column labels produced by read_csv with the names in `columns`.
# pandas raises if a frame's width does not equal len(columns).
train.columns=columns
test.columns=columns

In [194]:
# Preview the training frame with the new column names.
train.head()

In [195]:
# Preview the test frame with the new column names.
test.head()

In [196]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [197]:
# Column dtypes, non-null counts and memory usage of the test frame.
test.info()

In [198]:
# Summary statistics of the training frame, transposed so each column is a row.
train.describe().T

In [199]:
# Summary statistics of the test frame (not transposed, unlike the training summary).
test.describe()

In [200]:
# Collapse each individual `attack` label of the training frame into one of
# five numeric classes stored in a new `attack_class` column.
# Rows whose label is not listed below are left without a class (missing value).
_train_attack_groups = {
    0: ['normal'],
    1: ['back', 'land', 'pod', 'neptune', 'smurf', 'teardrop', 'apache2', 'udpstorm',
        'processtable', 'worm', 'mailbomb'],
    2: ['satan', 'ipsweep', 'nmap', 'portsweep', 'mscan', 'saint'],
    3: ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster',
        'warezclient', 'spy', 'xlock', 'xsnoop', 'snmpguess', 'snmpgetattack',
        'httptunnel', 'sendmail', 'named'],
    4: ['buffer_overflow', 'loadmodule', 'rootkit', 'perl', 'sqlattack', 'xterm', 'ps'],
}
# Assign class by class, in ascending order, exactly as a chain of equality tests would.
for _class_code, _attack_names in _train_attack_groups.items():
    train.loc[train.attack.isin(_attack_names), 'attack_class'] = _class_code
train.head()

In [201]:
# Collapse each individual `attack` label of the test frame into one of five
# numeric classes stored in a new `attack_class` column.
# Rows whose label is not listed below are left without a class (missing value).
_test_attack_groups = {
    0: ['normal'],
    1: ['back', 'land', 'pod', 'neptune', 'smurf', 'teardrop', 'apache2', 'udpstorm',
        'processtable', 'worm', 'mailbomb'],
    2: ['satan', 'ipsweep', 'nmap', 'portsweep', 'mscan', 'saint'],
    3: ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster',
        'warezclient', 'spy', 'xlock', 'xsnoop', 'snmpguess', 'snmpgetattack',
        'httptunnel', 'sendmail', 'named'],
    4: ['buffer_overflow', 'loadmodule', 'rootkit', 'perl', 'sqlattack', 'xterm', 'ps'],
}
# Assign class by class, in ascending order, exactly as a chain of equality tests would.
for _class_code, _attack_names in _test_attack_groups.items():
    test.loc[test.attack.isin(_attack_names), 'attack_class'] = _class_code
test.head()

In [202]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(train.shape, test.shape, sep='\n')

In [203]:
# Basic exploration: number of training rows per protocol_type.
fig, ax = plt.subplots(figsize=(6,3))
sns.countplot(x="protocol_type",data=train,ax=ax)
plt.show()

In [204]:
# Service distribution: number of training rows per service, one horizontal bar
# per service value on a tall figure.
plt.figure(figsize=(6,10))
sns.countplot(y='service',data=train)
# Render explicitly, as the sibling plotting cells do, so the cell does not echo
# the Axes object returned by countplot.
plt.show()

In [205]:
# Flag distribution: number of training rows per flag value.
fig, ax = plt.subplots(figsize=(6,3))
sns.countplot(x='flag',data=train,ax=ax)
plt.show()

In [206]:
# Attack distribution: number of training rows per individual attack label,
# drawn as horizontal bars on a tall figure.
fig, ax = plt.subplots(figsize=(6,10))
sns.countplot(y='attack',data=train,ax=ax)
plt.show()

In [207]:
# Attack class distribution: number of training rows per grouped attack_class.
fig, ax = plt.subplots(figsize=(6,3))
sns.countplot(x='attack_class',data=train,ax=ax)
plt.show()

In [208]:
# Per-class mean of every numeric column, transposed so each feature is a row.
# `train` still holds string columns at this point (the `attack` label is
# compared against strings above); numeric_only=True keeps them out of the
# aggregation, which otherwise raises TypeError on pandas >= 2.0.
train.groupby('attack_class').mean(numeric_only=True).T

In [209]:
# Split the training columns by dtype: num_var collects the 32/64-bit int and
# float columns, cat_var collects the object-dtype columns.
# NOTE(review): the label columns are not excluded here -- `attack_class` is
# presumably numeric and `attack` presumably object; confirm this is intended.
_train_dtypes = train.dtypes
num_var = [
    name for name, kind in _train_dtypes.items()
    if kind in ['float64', 'int64', 'float32', 'int32']
]
cat_var = [
    name for name, kind in _train_dtypes.items()
    if kind in ['object','O']
]

In [210]:
# Show the names of the columns classified as numeric.
num_var

In [211]:
# Show the names of the columns classified as categorical (object dtype).
cat_var 

In [212]:
# Numeric part of the training frame: the columns listed in num_var.
train_num=train[num_var]
# Display the resulting frame.
train_num

In [213]:
# Categorical part of the training frame: the columns listed in cat_var.
train_cat=train[cat_var]
# Summarise the selected columns.
train_cat.describe()

In [214]:
# Split the test frame with the column lists derived from the training frame,
# so both splits are partitioned the same way.
test_num=test[num_var]
test_cat=test[cat_var]
# Summarise the selected categorical columns.
test_cat.describe()

In [215]:
## categorical encoding 
def cat_encoding( df, colname ):
    col_dummies = pd.get_dummies(df[colname], prefix=colname,drop_first=True)
    df = pd.concat([df, col_dummies], axis=1)
    df.drop( colname, axis = 1,inplace=True)
    return(df)

In [216]:
# One-hot encode every categorical column of both frames with the SAME levels.
# Encoding each frame on its own lets the dummy columns diverge: a value seen in
# only one split yields a column the other split lacks, and drop_first may drop
# a different baseline level in each. Fixing the levels to those of the training
# data gives both frames identical columns with identical meaning.
for x in cat_var:
    # Sorted training levels reproduce the column order get_dummies produces
    # for plain string data, so train_cat comes out exactly as before.
    levels = sorted(train_cat[x].dropna().unique())
    train_cat = cat_encoding(
        train_cat.assign(**{x: pd.Categorical(train_cat[x], categories=levels)}), x)
    # Test values never seen in training become missing and therefore encode as
    # all zeros for this column group.
    test_cat = cat_encoding(
        test_cat.assign(**{x: pd.Categorical(test_cat[x], categories=levels)}), x)
train_cat.head()

In [217]:
# Reassemble each split column-wise: numeric columns first, then the one-hot
# encoded categorical columns (rows are aligned on the index).
train_data=pd.concat([train_num,train_cat],axis=1)
test_data=pd.concat([test_num,test_cat],axis=1)
# Preview the assembled training frame.
train_data.head()

In [218]:
## Build the pairwise correlation matrix of all columns of the assembled training frame
corr_mat=train_data.corr()
# Display the matrix.
corr_mat

In [219]:
# Visualise the correlation matrix as a heatmap.
sns.heatmap(corr_mat)

In [220]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import VarianceThreshold

In [221]:
# Silence NumPy floating-point warnings for divide-by-zero and invalid
# operations from here on. The call returns the previous settings, which the
# cell displays.
# NOTE(review): presumably meant to quiet the feature scoring in the next cell -- confirm.
np.seterr(divide='ignore', invalid='ignore')

In [222]:
# Rank candidate predictors with the ANOVA F-test and keep the 15 best.
# `attack_class` is computed directly from the `attack` label, so the one-hot
# `attack_*` columns encode the target itself. Offering them to the selector
# lets them win trivially and leaks the label into the model, so every column
# whose name starts with 'attack_' (the target included) is withheld.
# NOTE(review): `last_flag` is still a candidate; confirm it is a genuine
# predictor and not label-derived metadata.
_label_derived = [col for col in train_data.columns if col.startswith('attack_')]
X = train_data[train_data.columns.difference(_label_derived)]
# X_new is the fitted selector (not a transformed matrix); query it with get_support().
X_new = SelectKBest(f_classif, k=15).fit(X, train_data['attack_class'] )

In [223]:
# Boolean mask over the columns of X: True where the selector kept the column.
X_new.get_support()

In [224]:
# Names of the columns of X that the selector kept.
features=X.columns[X_new.get_support()]
# Display the selected names.
features

In [225]:
train=train_data 
test=test_data
X_train=train[['attack_neptune', 'attack_normal', 'attack_satan', 'count',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_same_srv_rate', 'dst_host_srv_count', 'flag_S0', 'flag_SF',
       'last_flag', 'logged_in', 'same_srv_rate', 'serror_rate',
       'service_http']]
Y_train=train['attack_class']

In [226]:
X_test=test_data[['attack_neptune', 'attack_normal', 'attack_satan', 'count',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_same_srv_rate', 'dst_host_srv_count', 'flag_S0', 'flag_SF',
       'last_flag', 'logged_in', 'same_srv_rate', 'serror_rate',
       'service_http']]
Y_test=test_data['attack_class']

In [227]:
## Neural network: feature standardisation
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training features only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
train_X, test_X = scaler.transform(X_train), scaler.transform(X_test)

In [228]:
# Multi-layer perceptron with two hidden layers of 30 units each.
# Weight initialisation and batch shuffling are random; pinning random_state
# makes the fitted model, and every metric reported below, reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(30,30), random_state=42)
# Train on the standardised training features; the cell displays the fitted estimator.
mlp.fit(train_X,Y_train)

In [229]:
# Predict an attack_class for every row of the standardised test features.
Y_pred=mlp.predict(test_X)
# Display the predicted labels.
Y_pred

In [230]:
from sklearn.metrics import classification_report,confusion_matrix
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
print(confusion_matrix(Y_test,Y_pred))

In [231]:
# Per-class precision, recall, F1 and support on the test split.
print(classification_report(Y_test,Y_pred))

In [232]:
from sklearn.metrics import accuracy_score
# Overall test accuracy, expressed as a percentage.
accuracy_score(Y_test,Y_pred)*100