# Phishing URL Detection

## Imports

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Initialisation

In [46]:
df_main = pd.read_csv('dataset.csv')
df_main.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [47]:
df_main.shape

(549346, 2)

### Dataset Balancing

Choose first 100000 balanced rows for simplicity

In [48]:
df = pd.concat([df_main[df_main['Label']=='good'].sample(50000,random_state=123),df_main[df_main['Label']=='bad'].sample(50000,random_state=123)],ignore_index=True)
df.head()

Unnamed: 0,URL,Label
0,addons.mozilla.org/seamonkey,good
1,meyerweb.com/eric/books/css-pocket/,good
2,youmix.co.uk/search?q=Billie%20Holiday%20-%20S...,good
3,songofthewinds.com/Playlist.htm,good
4,h30097.www3.hp.com/unix/security-download.html,good


### Label Encoding

In [49]:
df['Label'] = df['Label'].replace(to_replace=['good','bad'],value=[0,1])
df.sample(5)

  df['Label'] = df['Label'].replace(to_replace=['good','bad'],value=[0,1])


Unnamed: 0,URL,Label
20626,roadsideamerica.com/hotels_motels/hotelinfo/13...,0
92747,superofertassrl.com.ar/media/.../a/,1
26566,monmouth-county.com/,0
39622,absoluteastronomy.com/topics/The_Everly_Brothers,0
79620,social-plugins.net/framework/models/doc/doc/do...,1


## Preprocessing

### Add Fields

In [50]:
import re

def valid_ip(url):
    ipv4 = "^((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])$"
    ipv6 = "((([0-9a-fA-F]){1,4})\\:){7}"\
            "([0-9a-fA-F]){1,4}"
    return 1 if re.search(ipv4, url) or re.search(ipv6, url) else 0

symbols = ['#', '$', '%', '&', '*', '+', '/', ':', ';', '=', '?', '@', '//', '~']

df['URL'] = [re.sub('www.','',url) for url in df['URL']]
df['Length'] = [len(url) for url in df['URL']]
df['Is_IP'] = [valid_ip(url) for url in df['URL']]
for symbol in symbols:
    df[symbol] = [url.count(symbol) for url in df['URL']]

df.head()

#
0    99856
1      137
2        4
3        2
4        1
Name: count, dtype: int64
$
0    99967
1       16
2       15
4        2
Name: count, dtype: int64
%
0      97158
1        852
2        670
3        430
6        351
       ...  
55         1
87         1
34         1
134        1
38         1
Name: count, Length: 61, dtype: int64
&
0     92407
1      4448
2      1317
3       555
4       211
5       186
10      173
6       159
9       155
8       110
7        82
12       78
13       58
11       36
14        9
18        4
27        3
34        2
19        2
16        2
17        1
20        1
25        1
Name: count, dtype: int64
*
0     99984
2         6
1         6
3         2
4         1
14        1
Name: count, dtype: int64
+
0     98116
1       932
2       306
6       170
3       161
4       102
5        65
7        46
9        33
8        31
10       11
11        7
18        5
12        2
13        2
14        2
21        2
15        2
37        1
27        1
17        1
16  

### Check Shortening Service

In [53]:
def is_shortened(URL):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      URL)
    return 1 if match else 0

df['is_Shorted'] = [is_shortened(url) for url in df['URL']]
df.head()

### Hostname and Scheme Check

In [74]:
import urllib

def check_scheme(url):
    return 1 if urllib.parse.urlparse(url).scheme in ['http','https'] else 0
    
def check_hostname(url):
    return 1 if urllib.parse.urlparse(url).hostname else 0

df['is_Host'] = [check_hostname(url) for url in df['URL']]
df['is_HTTP'] = [check_scheme(url) for url in df['URL']]

1