In [12]:
import os
import re

import numpy as np
import pandas as pd

In [13]:
# Location of the raw phishing dataset (CSV with a header row).
# Set the PHISHING_DATASET_CSV environment variable to point at your own copy;
# when it is unset, the original author's local path is used as the default.
DATASET_CSV = os.environ.get(
    "PHISHING_DATASET_CSV",
    "/Users/swarupagowri/Desktop/MS Data Analytics/DATA245/Project/Datasets/Main_dataset.csv",
)
phishing_raw_df = pd.read_csv(DATASET_CSV, header=0)

In [14]:
# Discard every row that has a missing value in at least one column.
phishing_raw_df = phishing_raw_df.dropna(axis=0, how="any")

In [16]:
# Remove fully duplicated rows, keeping the first occurrence of each,
# then show the remaining (rows, columns) count.
phishing_raw_df = phishing_raw_df.drop_duplicates(keep="first")
phishing_raw_df.shape

(72363, 12)

In [17]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that only appends a stray "None" line).
phishing_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72363 entries, 0 to 95909
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   domain          72363 non-null  object
 1   ranking         72363 non-null  int64 
 2   isIp            72363 non-null  int64 
 3   valid           72363 non-null  int64 
 4   activeDuration  72363 non-null  int64 
 5   urlLen          72363 non-null  int64 
 6   is@             72363 non-null  int64 
 7   isredirect      72363 non-null  int64 
 8   haveDash        72363 non-null  int64 
 9   domainLen       72363 non-null  int64 
 10  nosOfSubdomain  72363 non-null  int64 
 11  label           72363 non-null  int64 
dtypes: int64(11), object(1)
memory usage: 7.2+ MB
None


In [18]:
# Preview the first five rows of the cleaned raw data.
phishing_raw_df.head(n=5)

Unnamed: 0,domain,ranking,isIp,valid,activeDuration,urlLen,is@,isredirect,haveDash,domainLen,nosOfSubdomain,label
0,www.voting-yahoo.com,10000000,0,0,0,20,0,0,1,20,2,1
1,www.zvon.org/xxl/WSDL1.1/Output/index.html,194914,0,1,7305,42,0,0,0,12,2,0
2,tecportais.com/file-security-update-infonfmati...,10000000,0,0,0,155,0,0,0,14,1,1
3,bima.astro.umd.edu/nemo/linuxastro/,7001,0,0,0,35,0,0,0,18,3,0
4,huarui-tec.com/js/?us.battle.net/login/en/?ref...,10000000,0,1,730,79,0,0,1,14,1,1


In [19]:
# Class balance: number of rows for each value of 'label'.
(
    phishing_raw_df
    .groupby(by=["label"])[["label"]]
    .count()
)

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,31025
1,41338


In [20]:
def regexp_check(str_to_check, url):
    """Return a binary phishing indicator for ``url`` (0 = legitimate, 1 = phishing).

    Two checks are supported, selected by ``str_to_check``:

    * ``'//'`` -- redirect check. Returns 1 when the *last* ``'//'`` in ``url``
      starts at an index greater than 7, i.e. beyond where the separator of a
      leading ``http://`` / ``https://`` scheme would sit. Returns 0 otherwise,
      including when ``url`` contains no ``'//'`` at all.
    * any other value -- returns 1 when ``url`` contains the substring
      ``'http'`` anywhere, else 0. In this mode the value of ``str_to_check``
      itself is not searched for.

    Parameters
    ----------
    str_to_check : str
        ``'//'`` selects the redirect check; anything else selects the
        ``'http'`` check.
    url : str
        The URL (or domain string) to inspect.

    Returns
    -------
    int
        1 if the selected pattern indicates phishing, otherwise 0.
    """
    if str_to_check == '//':
        # Use the last occurrence rather than the first: when the URL carries
        # its scheme, the first '//' is always the scheme separator (index <= 6)
        # and would mask a later redirect such as 'http://a.com//evil.com'.
        # str.rfind returns -1 when '//' is absent, which also yields 0.
        return 1 if url.rfind('//') > 7 else 0

    return 1 if 'http' in url else 0


In [21]:
# Feature engineering
#
# Derive binary indicator features (0 = legitimate, 1 = phishing) from the raw
# columns, drop the raw length columns, then rename and reorder the columns so
# that 'Label' comes last.
# NOTE: this cell consumes the raw column names and rebinds phishing_raw_df, so
# it must be run exactly once after the loading/cleaning cells.

phishing_raw_df = phishing_raw_df.assign(
    # 1 when regexp_check's '//' redirect test fires for the URL.
    dslash_Redirect=phishing_raw_df["domain"].apply(lambda url: regexp_check('//', url)),
    # 1 when regexp_check finds 'http' inside the URL string.
    Domain_http=phishing_raw_df["domain"].apply(lambda url: regexp_check('http', url)),
    # 1 when the URL is 54 characters or longer, 0 when it is shorter.
    LongURL=(phishing_raw_df["domain"].str.len() >= 54).astype("int64"),
    # 0 only when there is exactly one subdomain; any other count maps to 1.
    nosOfSubdomain=(phishing_raw_df["nosOfSubdomain"] != 1).astype("int64"),
    # 1 when the active duration is at most 365 (one year), 0 when it is longer.
    activeDuration=(phishing_raw_df["activeDuration"] <= 365).astype("int64"),
    # 0 when the ranking is below 100000, 1 when it is 100000 or more.
    ranking=(phishing_raw_df["ranking"] >= 100000).astype("int64"),
)

# Rename by column NAME rather than by position, so a different column order in
# the input cannot silently attach the wrong label to a feature.
# errors="raise" turns a missing source column into an immediate KeyError.
column_renames = {
    'domain': 'Domain',
    'ranking': 'Rank',
    'isIp': 'isIP',
    'valid': 'isValid',
    'activeDuration': 'Domain_reg_len',
    'isredirect': 'isRedirect',
    'nosOfSubdomain': 'SubDomain',
    'label': 'Label',
}

# Column names after renaming, in the order the original positional relabeling
# assumed. No longer used for the relabeling itself; retained so the name stays
# defined for any other cell that reads it.
Columns = ['Domain','Rank','isIP','isValid','Domain_reg_len','is@','isRedirect','haveDash','SubDomain','Label','dslash_Redirect','Domain_http','LongURL']
# Final column order: all features first, target 'Label' last.
new_columns = ['Domain','Rank','isIP','isValid','Domain_reg_len','is@','isRedirect','haveDash','SubDomain','dslash_Redirect','Domain_http','LongURL','Label']

phishing_raw_df = (
    phishing_raw_df
    # The raw length columns are not used as features.
    .drop(columns=['urlLen', 'domainLen'])
    .rename(columns=column_renames, errors="raise")
    # Plain selection (instead of reindex) raises on a missing column rather
    # than silently creating an all-NaN one.
    [new_columns]
)

# Display the dataset after feature engineering
phishing_raw_df.head()

Unnamed: 0,Domain,Rank,isIP,isValid,Domain_reg_len,is@,isRedirect,haveDash,SubDomain,dslash_Redirect,Domain_http,LongURL,Label
0,www.voting-yahoo.com,1,0,0,1,0,0,1,1,0,0,0,1
1,www.zvon.org/xxl/WSDL1.1/Output/index.html,1,0,1,0,0,0,0,1,0,0,0,0
2,tecportais.com/file-security-update-infonfmati...,1,0,0,1,0,0,0,0,0,0,1,1
3,bima.astro.umd.edu/nemo/linuxastro/,0,0,0,1,0,0,0,1,0,0,0,0
4,huarui-tec.com/js/?us.battle.net/login/en/?ref...,1,0,1,0,0,0,1,0,0,0,1,1


In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
phishing_raw_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Rank,isIP,isValid,Domain_reg_len,is@,isRedirect,haveDash,SubDomain,dslash_Redirect,Domain_http,LongURL,Label
count,72363.0,72363.0,72363.0,72363.0,72363.0,72363.0,72363.0,72363.0,72363.0,72363.0,72363.0,72363.0
mean,0.69161,0.000207,0.574727,0.43098,0.002709,0.007504,0.155508,0.767768,0.007476,0.019527,0.389343,0.571259
std,0.461832,0.014396,0.494388,0.495217,0.051974,0.0863,0.36239,0.422259,0.086142,0.138367,0.487605,0.4949
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
75%,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
