# Task : Prediction of Security Threats on Network

In [9]:
# Steps to Follow: 

# 1) Most targeted Destination IP Address
# 2) Most Logical Ports attacked
# 3) Most Frequently/common type of Attack
# 4) Find the time of the day when attack happend , (odd , hours, day or night)
# 5) Find the Pattern

In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ipaddress
import numpy as np
from scipy import stats

# Hypothisis Testing
from scipy.stats import chi2_contingency
from datetime import datetime, timedelta
import math

# Gives Missing Values Graph
import missingno as msno


plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore') 

In [19]:
data = pd.read_csv("Cybersecurity_attacks.csv")
data.shape

(178031, 10)

In [20]:
data.columns

Index(['Attack category', 'Attack subcategory', 'Protocol', 'Source IP',
       'Source Port', 'Destination IP', 'Destination Port', 'Attack Name',
       'Attack Reference', 'Time'],
      dtype='object')

In [21]:
data.head()

Unnamed: 0,Attack category,Attack subcategory,Protocol,Source IP,Source Port,Destination IP,Destination Port,Attack Name,Attack Reference,Time
0,Reconnaissance,HTTP,tcp,175.45.176.0,13284,149.171.126.16,80,Domino Web Server Database Access: /doladmin.n...,-,1421927414-1421927416
1,Exploits,Unix 'r' Service,udp,175.45.176.3,21223,149.171.126.18,32780,Solaris rwalld Format String Vulnerability (ht...,CVE 2002-0573 (http://cve.mitre.org/cgi-bin/cv...,1421927415-1421927415
2,Exploits,Browser,tcp,175.45.176.2,23357,149.171.126.16,80,Windows Metafile (WMF) SetAbortProc() Code Exe...,CVE 2005-4560 (http://cve.mitre.org/cgi-bin/cv...,1421927416-1421927416
3,Exploits,Miscellaneous Batch,tcp,175.45.176.2,13792,149.171.126.16,5555,HP Data Protector Backup (https://strikecenter...,CVE 2011-1729 (http://cve.mitre.org/cgi-bin/cv...,1421927417-1421927417
4,Exploits,Cisco IOS,tcp,175.45.176.2,26939,149.171.126.10,80,Cisco IOS HTTP Authentication Bypass Level 64 ...,CVE 2001-0537 (http://cve.mitre.org/cgi-bin/cv...,1421927418-1421927418


In [22]:
data[['Start Time', 'Last Time']] = data['Time'].str.split("-", expand = True)
data.tail()

Unnamed: 0,Attack category,Attack subcategory,Protocol,Source IP,Source Port,Destination IP,Destination Port,Attack Name,Attack Reference,Time,Start Time,Last Time
178026,Generic,IXIA,udp,175.45.176.0,72349,149.171.126.12,53,Microsoft_DNS_Server_ANY_Query_Cache_Weakness_...,CVE 2009-0234 (http://cve.mitre.org/cgi-bin/cv...,1424224338-1424224338,1424224338,1424224338
178027,Exploits,Browser,sep,175.45.176.3,67647,149.171.126.18,0,Persits XUpload ActiveX Method MakeHttpRequest...,CVE 2009-3693 (http://cve.mitre.org/cgi-bin/cv...,1424249567-1424249567,1424249567,1424249567
178028,Exploits,Office Document,tcp,175.45.176.0,78359,149.171.126.13,110,Microsoft Excel SxView Memory Corruption (POP3...,CVE 2009-3128 (http://cve.mitre.org/cgi-bin/cv...,1424219921-1424219923,1424219921,1424219923
178029,Exploits,Browser,tcp,175.45.176.2,68488,149.171.126.19,80,Internet Explorer createTextRange() Code Execu...,CVE 2006-1359 (http://cve.mitre.org/cgi-bin/cv...,1424247789-1424247789,1424247789,1424247789
178030,Reconnaissance,ICMP,unas,175.45.176.3,77929,149.171.126.19,0,IP Options: Loose Source Route (IP Option 3) (...,,1424230421-1424230421,1424230421,1424230421


In [23]:
df = data.drop(['.', 'Time'], axis = 1)
df.head()

KeyError: "['.'] not found in axis"

In [None]:
figure, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 5))
msno.matrix(df, ax = ax1, sparkline=False) 
msno.bar(df, ax = ax2, color = (0.15, 0.35, 0.65)) 
plt.show()


In [10]:
data.isnull().sum()

Attack category           0
Attack subcategory     4192
Protocol                  0
Source IP                 0
Source Port               0
Destination IP            0
Destination Port          0
Attack Name               0
Attack Reference      51745
Time                      0
Start Time                0
Last Time                 0
dtype: int64

In [11]:
# Update Missing values with Not Registered

df["Attack subcategory"] = df["Attack subcategory"].fillna("Not Registered")

NameError: name 'df' is not defined

In [12]:
# Update Missing values with NaN

# df['Attack Reference'] = df['Attack Reference'].fillna("NaN")
# df['Start Time'] = df['Start Time'].fillna("NaN")
# df['Last Time'] = df['Last Time'].fillna("NaN")
df.isnull().sum()

NameError: name 'df' is not defined

In [None]:
df[pd.isnull(df).any(axis=1)].shape

In [None]:
df[df.duplicated()].shape
# df[df.duplicated()]



In [None]:
print('Before Cleaning of data: ' + str(df.shape))
df = df.drop(df[df.duplicated()].index)
print('After Cleaning of data: ' + str(df.shape))

In [None]:
df[df.duplicated()]

Some ports are less than 0 and more than 65535

In [None]:
# Count the source and destination ports which are above or below the Limit

invalid_SP = (df['Source Port'] < 0) | (df['Source Port'] > 65535)
invalid_DP = (df['Destination Port'] < 0) | (df['Destination Port'] > 65535)
df[invalid_SP | invalid_DP].shape

In [None]:
# Remove the invalid Source and Destination Ports

df = df[~(invalid_SP | invalid_DP)].reset_index(drop=True)

In [None]:
print("Total Number of Different Portocol : ", len(df['Protocol'].unique()))
print("Total Number of Different Attack category : ", len(df['Attack category'].unique()))

df['Protocol'].unique()[:15]


In [None]:
df['Attack category'].unique()[:15]

In [None]:
# Make All Protocols and Categories in Upper case

df['Protocol'] = df['Protocol'].str.upper().str.strip()
df['Attack category'] = df['Attack category'].str.upper().str.strip()
df['Attack category'] = df['Attack category'].str.strip().replace('BACKDOORS', 'BACKDOOR')
# df

In [None]:
# Unique Protocols and Categories after filtering

print(df['Protocol'].unique())
print(df['Attack category'].unique())

In [None]:
# Check how Many Unique Protocol Present Now.

print('Total number of different protocols:', len(df['Protocol'].unique()))
print('Total number of different Attack categories:', len(df['Attack category'].unique()))

In [None]:
df[pd.isnull(df['Attack Reference'])].shape

In [None]:
# No of Attack Categories in Which Attack Reference is missing

print(df[pd.isnull(df['Attack Reference'])]['Attack category'].value_counts())

In [None]:

print(df['Attack category'].value_counts())

In [None]:
# Percentage of missing values in Attack Reference per Attack Category

((df[pd.isnull(df['Attack Reference'])]['Attack category'].value_counts()/df['Attack category'].value_counts())*100).dropna().sort_values(ascending=False)

In [None]:
df.head()

<hr>
<h2>Import New Dataset which include which port provide which type of service</h2>

In [None]:
tcp_ports = pd.read_csv("TCP-ports.csv")
tcp_ports['Service'] = tcp_ports['Service'].str.upper()
tcp_ports.shape

In [None]:
# tcp_ports.isnull().sum()

tcp_ports.head()

In [None]:
# Merge Two dataset into one

print('Columns before merging dataframes: ' ,(df.shape))

newdf = pd.merge(df, tcp_ports[['Port','Service']], left_on='Destination Port', right_on='Port', how='left')
newdf = newdf.rename(columns={'Service':'Destination Port Service'})

print('Columns after merging dataframes: ' + str(newdf.shape))

In [None]:
newdf.head()

In [None]:
# Drop Port Column From new dataset

newdf = newdf.drop(columns=['Port'])
newdf.head()

In [None]:

newdf['Attack category'].unique()

In [None]:
# How many categories count present in the dataset

# newdf['Attack category'].value_counts()
pd.DataFrame(newdf['Attack category'].value_counts())[:]


In [None]:
# What percent of perticular attack with respect to total no of attacks

newdf['Attack category'].value_counts()*100/newdf['Attack category'].value_counts().sum()

In [None]:
# Graphical Representation of No of Types of attack

plt.figure(figsize=(18,8))
sns.barplot(x=newdf['Attack category'].value_counts().index,y=newdf['Attack category'].value_counts())
plt.xlabel('Attack Category')
plt.ylabel('Count')
plt.title('Number of attacks per Attack caterogy')
plt.grid(True)

In [None]:
# Create dataset of top 5 attack categories for Visualization

a = pd.DataFrame(newdf['Attack category'].value_counts())[:6]

In [None]:
a.plot(kind='pie', subplots=True, figsize=(7, 7))
plt.title('Top 5 attacks')
# plt.legend(loc='bottom')
plt.show()

<h2>ANALYSE ATTACKS WITH DATE AND TIME

In [None]:
# Convert The raw date n time to standard format of date time for better prediction using build in modules of pandas.


newdf['Start Time'] = pd.to_datetime(newdf['Start Time'], unit = 's')
newdf['Last Time'] = pd.to_datetime(newdf['Last Time'], unit = 's')

# Create new df for duration of attack based on start and last time.
newdf['Duration'] = ((newdf['Last Time'] - newdf['Start Time']).dt.seconds).astype(int)
newdf.head()

In [None]:
# So it will convert datatype from int to str and find the 1st unique pair

newdf['Start Time'].astype(str).str.split(' ').str[0].unique()

<h3>Here we can execute from now on is based on information related to two days, 22nd January 2015, and on 18th February 2015.</h3>

In [None]:
newdf.describe()

Mean and 75% percentile is very different for Source Port and Destination Port is very different. 
However minimum and maximum is same. 

So Here we can use Hypothesis testing.

Here we are assuming source ports are random but attacker try to use lower random ports as destination ports.
So we have to check wheather our hypothisis assumption is correct or not, Because if is not correct so we would be adding biased to it. Because we are thinking from our point of view.
But we have to check this point statistically, if data is showing that pattern so we have to use Hypothisis Testing.

Hypothsis Testing is testing our assumption / hypothisis with respective the data.
Data is showing some pattern, so if we are assuming this test as correct pattern so we have to perform Hypothesis testing.


$$ H_0: \mu_1=\mu_2$$
$$ H_a: \mu_1\neq\mu_2$$

1. $ H_0: $ Null Hypothesis
        It means no change, mean of source port and destination port is same

2. $ H_a: $ Alternate Hypothesis
        It is not same, mean of source port and destination port are not same
        There would be difference in between source and dest port mean
        And we are proving there is difference and our hyposthesis is correct.

We can obtain one of two results from the test:

1. If the **$p$-value** is less than our significance level ($p<\alpha$) we reject the null hypothesis $H_0$ and affirm that the observed difference is **statistically significant**. 
        The minimum significance value is 0.05 if our significance value is greater than 0.05
        Then our assumption is right and there is difference between values.
        So we can say that this is Alternate Hypothesis.
2. If the **$p$-value** is greater than our significance level ($p>\alpha$) we will have to retain $H_0$ and conclude that the observed difference **is not statistically significant**. 
        If our significance value is less than 0.05
        Then our assumption is wrong and there is no difference between values.
        So we can say that this is Null Hypothesis.

The hypothesis test is conducted using a statistical **$T-test$** which specifies the two Series `df_interest['Source Port']` and `df_interest['Destination Port']`. By specifying these two Series, we are automatically referring to a comparative test of the means of both Series:

In [None]:
statistic, pvalue = stats.ttest_ind( newdf['Source Port'], newdf['Destination Port'], equal_var=False)
print('p-value in T-test: ' + str(pvalue))

Because the $p$-value is very close to zero, Python approximates this measurement to 0.0. With this in mind, we can reject the null hypothesis $H_0$ regarding the equality of the means of the source and destination ports. This means that the source port of the attacks is very different from the destination port. In this way, most attacks are directed at specific logical ports, which does not leave a random spot



---
We will be using two methods for correlation calculation:


> • **Pearson's correlation:** evaluates the linear relationships between two variables. If the value is close to 0, there is a weak or nonexistent linear relationship between the variables.


> • **Spearman's correlation:** evaluates the monotonic relationships between two variables. If the value is close to 0, there is a weak or nonexistent monotonic relationship between the variables.



In [None]:
newdf.corr(method='pearson')

1. It shows that correlation is very low i.e. **0.13**.
2. It means correlation is not linear.

In [None]:
newdf.corr(method='spearman')
# newdf.shape

1. Using Spearman it is showing different values.
2. Which means you don't need to depend only on Pearson's method.
3. Sometimes there is no linear relationship but should be Monotonic Relationship sometimes. Which means non-linear relationship.

In [None]:
# Here we convert categorical data to a numerical data
# It will create separate column for each category and mark with the numerical value.

dummy_df = pd.get_dummies(newdf, columns = ['Attack category'])

dummy_df.head()
# dummy_df.shape

In [None]:
# Create a heat Matrix using Pearson's Co-efficient
# It will show the correlation with respect to the Attack Category


plt.figure(figsize=(20,8))
sns.heatmap(dummy_df.corr(method='pearson'), 
            annot=True, vmin=-1.0, vmax=1.0, cmap = sns.color_palette("RdBu_r", 15))
plt.show()

In [None]:
# Create a heat Matrix using Spearman's Co-efficient
# It will show the correlation with respect to the Attack Category
# So here so many values filled with colors,
# That means no-correlation values are reduced by using Spearman's Co-efficient


plt.figure(figsize=(20,8))
sns.heatmap(dummy_df.corr(method='spearman'), 
            annot=True, vmin=-1.0, vmax=1.0, cmap = sns.color_palette("RdBu_r", 15))
plt.show()

In [None]:
g = sns.pairplot(newdf)
g.fig.set_size_inches(14,8)
plt.show()

1. Here we can see some pattern here
2. If look close then see lot of lower destination ports are having a higher duration of attack.
3. We can see some pattern where lower dest port are showing high duration of attack of time, and on higher port attack duration is less.

In [None]:
# Now will check how many victims IP's we have in dataset
# Which attacker is using as a target.

# newdf['Destination IP'].value_counts()

pd.DataFrame(newdf['Destination IP'].value_counts())


In [None]:
plt.figure(figsize=(18,7))
sns.scatterplot(x = newdf[newdf['Destination IP']=='149.171.126.17']['Start Time'], y=newdf[newdf['Destination IP']=='149.171.126.17']['Destination Port'])
# plt.xlim(left = newdf['Start Time'].min()-timedelta(days=1),right=newdf['Start Time'].max()+timedelta(days=1))
# plt.grid(True)
plt.show()

In [None]:
# So here we are Zoom in on left side to clearly see the pattern

plt.figure(figsize=(18,7))
sns.scatterplot(x = newdf[newdf['Destination IP']=='149.171.126.17']['Start Time'], y=newdf[newdf['Destination IP']=='149.171.126.17']['Destination Port'])
plt.xlim(left=newdf['Start Time'].min(),right=datetime.strptime('15-01-23', '%y-%m-%d'))
plt.grid(True)
plt.show()

In [None]:
# So here we are Zoom in on right side to clearly see the pattern


plt.figure(figsize=(18,7))
sns.scatterplot(x = newdf[newdf['Destination IP']=='149.171.126.17']['Start Time'], y=newdf[newdf['Destination IP']=='149.171.126.17']['Destination Port'])
plt.xlim(left=datetime.strptime('15-02-18', '%y-%m-%d'),right=newdf['Start Time'].max())
# plt.grid(True)
plt.show()

In [None]:
# Here we zoom into lower range as we see lot of noise there.
# So here we consider the ports under 201 for visualization.
# Here we use Attack Category as a hue, so that we can clearly se which Attack is frequently used for specific port.


plt.figure(figsize=(20,10))
sns.scatterplot(x = 'Start Time', y = 'Destination Port', hue = 'Attack category', 
                data = newdf[(newdf['Destination IP'] == '149.171.126.17') & (newdf['Destination Port'] <= 150)], s = 80 )

plt.xlim(left=datetime.strptime('15-02-18 00:00:00', '%y-%m-%d %H:%M:%S'),
         right=datetime.strptime('15-02-18 13:00:00', '%y-%m-%d %H:%M:%S'))
plt.grid(True)
plt.show()


1. Here we can see that there are mainly Generic Attacks on Port zero.
2. And we can see clearly that on Port No 80 There are Huge Frequency of Attacks of all Categories.
3. And we know that it is used by HTTP protocol,So we can say that Port 80 is vulnerable for attacks.
4. Similarly Port 110 is also have huge amount of frequency of Attack. and it is assigned to POP3 protocol. Which used for unencrypted assess to Emails.

### Duration and Destination Ports

In [None]:
# So here we are using only one IP is which is attacked most frequently i.e. 149.171.126.17

pd.DataFrame(newdf['Destination IP'].value_counts())[:1]

In [None]:
# 

plt.figure(figsize=(20,10))
sns.scatterplot(x = 'Destination Port', y = 'Duration', hue = 'Attack category', 
                data = newdf[newdf['Destination IP'] == '149.171.126.17'])
# plt.grid(True)
# plt.show()

1. Here we can see on lower ports the attacks last Higher durations and on higher ports attack lasts Short Duration of time.


In [None]:
# So here we can draw violin plot.
# It is conmbination of Box plot and Probality Distribution graph.

plt.figure(figsize=(20,10))
sns.violinplot(x = 'Attack category', y = 'Duration', data = newdf)

Now we can analyzing between the attacks and the hour, at which perticular hour no of attacks getting increases.
So we just fetch the hours from Start Time

In [None]:
newdf['Start Time'][1].hour


In [None]:
# Here we create new dataframe which is Hour, which will fetch hour from start time 

pivotDF = newdf.copy()
pivotDF['Hour'] = pivotDF.apply(lambda row: '0'*(2-len(str(row['Start Time'].hour))) + str(row['Start Time'].hour)+':00:00', axis=1)

pivotDF.head()

In [None]:
# So here we create pivot table
# It will show that how many no of attacks of different categories are happened at which Hour Time

pivotDF1 = pd.pivot_table(pivotDF,values='Attack Name', index=['Hour'], columns=['Attack category'], aggfunc='count')
pivotDF1

#### Here we create function to plot heatmap so no need to repeat the task again and again.

In [None]:
def heatMap(df, xlabel, ylabel, title):
    plt.figure(figsize=(20,10))
    ax = sns.heatmap(df)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)

    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.show()

To see clear pattern of attack categories of perticular hours we now using heatmap. And for that we gone use funtion create earlier before.

In [None]:
heatMap(pivotDF1, 'Attack category', 'Hour', 'No of attacks per hour and attack type')

1. So clearly we can see that the DOS attacks pattern are incresed during odd hours and decreased during even hours.
2. Simmilary happend with Exploits attack. These type of attack Incresed during odd hours. At midnight attacks increases.
3. And from this pattern we can see that attackers know something about organization. So it could be reason so attacker are planning to attack only on odd hours.
4. There could be reason such as, emp on rest or is there any backup or security process runs during odd hours.



In [None]:
heatMap(pivotDF1/pivotDF1.sum(), 'Attack category', 'Hour', 'Percentage of attacks attack/hour time')

<hr><br>
To see clear pattern of percentage of attack per IP with respective to hours we now using heatmap.
1. Here we select Destinations IPs which all are the Destinations IP which are gatting attacked and is there any pattern between those IP addresses.
2. Like one IP getting more attacks on Odd hours/ even hours.

In [None]:
pivotDF2 = pd.pivot_table(pivotDF, values='Attack Name', index=['Hour'], columns=['Destination IP'], aggfunc='count')
heatMap(pivotDF2/pivotDF2.sum(), 'Destination IP', 'Hour', 'Percentage of attacks per IP and hour')

So here we can se that, for example 149.171.126.13 IP address getting attacked More in 01:00:00 Hour, Simillarly with 149.171.126.19 IP attacked at 3 O'clock
<hr>

Although there are more than 170,000 records related to cybersecurity attacks, these attacks target 10 servers with IPv4 addresses in the interval  [149.171.126.10 --> 149.171.126.19] . 

This means that the company was attacked on multiple occasions during the observed timeframe, on a specific subnet, which denotes a non-accidental and directed succession of events.

<hr><br>

Here we are showing HeatMap for Different Attacks per Destination IP address


Which showing for which perticular IP There are which type of attacks more

In [None]:
pivotDF3 = pd.pivot_table(pivotDF, values='Attack Name', index=['Destination IP'], columns=['Attack category'], aggfunc='count')
heatMap(pivotDF3/pivotDF3.sum(), 'Attack category', 'Destination IP', 'No of Attack Types/IP')

Let's now look at this same relationship per attack category performing a pair-wise **$T-test$**:

In [None]:
for attack in list(newdf['Attack category'].unique()):
    df_attack = newdf[newdf['Attack category'] == attack].copy()
    statistic, pvalue = stats.ttest_ind(df_attack['Source Port'], df_attack['Destination Port'], equal_var=False)
    print('p-value in T-test for ' + attack + ' attack: ' + str(pvalue))

In [None]:
# Source ports
plt.figure(figsize=(16,5))
sns.stripplot(x='Attack category',y='Source Port',data=newdf)
plt.show()

In [None]:
# Destination ports
plt.figure(figsize=(16,5))
sns.stripplot(x='Attack category',y='Destination Port',data=newdf)
plt.show()

Here we can view of the distribution of destination ports by attack category and source IP:

In [None]:
ips = list(newdf['Source IP'].unique())
f, axes = plt.subplots(2, 2)
f.set_figheight(10)
f.set_figwidth(15)

labels = list(newdf['Attack category'].unique())
for i, ip in enumerate(ips):
    sns.stripplot(x='Attack category',y='Destination Port',data=newdf[newdf['Source IP'] == ip], order=labels, ax=axes[int(i/2)][i%2])
    axes[int(i/2)][i%2].set_xlabel('Attack category')
    axes[int(i/2)][i%2].set_ylabel('Destination Port')
    axes[int(i/2)][i%2].set_title('Destination Port distribution - Attacker IPv4 Address: ' + ip)
    axes[int(i/2)][i%2].set_xticklabels(labels,rotation=90)
plt.tight_layout()
plt.show()

Here we can view of the distribution of destination ports by attack category and destination IP:

In [None]:
ips = list(newdf['Destination IP'].unique())
f, axes = plt.subplots(5, 2)
f.set_figheight(25)
f.set_figwidth(15)

labels = list(newdf['Attack category'].unique())

for i, ip in enumerate(ips):
    sns.stripplot(x='Attack category',y='Destination Port',data=newdf[newdf['Destination IP'] == ip], order=labels, ax=axes[int(i/2)][i%2])
    axes[int(i/2)][i%2].set_xlabel('Attack category')
    axes[int(i/2)][i%2].set_ylabel('Destination Port')
    axes[int(i/2)][i%2].set_title('Destination Port distribution - Target IPv4 Address: ' + ip)
    axes[int(i/2)][i%2].set_xticklabels(labels,rotation=90)
plt.tight_layout()
plt.show()


<li>These graphs show us that there is a differentiation in the way in which the attacks are performing their tasks.

<li>There is a particularization by the targets, something that does not happen with the source devices.
