In [3]:
import io

from urllib.parse import urlparse
import tldextract

import numpy as np
import pandas as pd
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel, RobertaForSequenceClassification
import fasttext

In [4]:
# Load the two raw CSV corpora: the PhishTank dump first, then the website
# dataset that the genuine URLs are later sampled from.
phish_df, nonphish_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/phishtank_data_corpus_complete_15-07-23.csv',
        '../dataset/original_website_dataset.csv',
    )
)

In [5]:
phish_df.head()

Unnamed: 0,id,url,is_valid,phishing_url,submission_time
0,8225238,https://phishtank.org/phish_detail.php?phish_i...,Unknown,https://pocztapolsk.buzz/pay,Jul 15th 2023 9:45 AM
1,8225237,https://phishtank.org/phish_detail.php?phish_i...,VALID PHISH,https://alertsuser.github.io/pantek/kimax/,Jul 15th 2023 9:38 AM
2,8225236,https://phishtank.org/phish_detail.php?phish_i...,Unknown,https://digicamforums.com,Jul 15th 2023 9:19 AM
3,8225235,https://phishtank.org/phish_detail.php?phish_i...,Unknown,SPAMs://wirelessworldshop.com/,Jul 15th 2023 9:19 AM
4,8225234,https://phishtank.org/phish_detail.php?phish_i...,Unknown,SPAMs://www.wirelessworldshop.com,Jul 15th 2023 9:19 AM


In [6]:
nonphish_df.head()

Unnamed: 0,url,Speical_Char,Have_IP,Have_At,URL_length,URL_Depth,redirection,time_get_redirect,port_in_url,use_http,...,unescape,escape,ActiveXObject,fromCharCode,atob,Punny_Code,TLDs,Title,country_name,label
0,https://sites.google.com/site/policyclaming76745/,3,0,0,0,2,6,0,0,0,...,0,0,0,0,0,0,com,Google Sites,US,1
1,https://www.pinterest.com/abbiestever/?redirec...,5,0,0,0,1,6,0,0,0,...,0,0,0,0,0,0,com,Abbie Roose Stever (abbiestever) - Profile | P...,,0
2,https://www.pinterest.com/abiolatv/bombshell-g...,3,0,0,1,2,6,0,0,0,...,0,0,0,0,0,0,com,No Title,,0
3,https://balajipackersguntur.com/images/,2,0,0,0,1,6,0,0,0,...,0,0,0,0,0,0,com,Index of /images,IN,1
4,https://sites.google.com/site/claming564336670...,3,0,0,0,2,6,0,0,0,...,0,0,0,0,0,0,com,SECURITY,,1


In [7]:
phish_df.shape, nonphish_df.shape

((99923, 5), (167872, 32))

### Merge the phishing and genuine URL datasets

#### Label

phishing: -1\
genuine: 1

In [8]:
phish_df['phishing_url']==''

0        False
1        False
2        False
3        False
4        False
         ...  
99918    False
99919    False
99920    False
99921    False
99922    False
Name: phishing_url, Length: 99923, dtype: bool

In [9]:
# Build a balanced dataset: every PhishTank URL (label -1) plus an equally
# sized random sample of URLs from the second corpus (label 1).
# A fixed random_state keeps the sample identical across kernel restarts.
# NOTE(review): nonphish_df has its own `label` column, which is ignored here;
# every sampled row is labelled genuine. Confirm that this is intended.
df_1 = pd.DataFrame({'url': phish_df['phishing_url'], 'label': -1*np.ones(len(phish_df))})
df_2 = pd.DataFrame({
    'url': nonphish_df.sample(len(phish_df), random_state=42)['url'],
    'label': np.ones(len(phish_df)),
})

In [10]:
# Stack both classes and shuffle the rows; the fixed random_state makes the
# shuffled order reproducible, and ignore_index renumbers the rows 0..n-1.
df = pd.concat([df_1, df_2]).sample(frac=1, ignore_index=True, random_state=42)

In [11]:
df.head()

Unnamed: 0,url,label
0,https://360zhileng.com,1.0
1,https://sites.google.com/site/verifycheckpoint...,1.0
2,https://hnvoaluong.faqserv.com/ap/signin?p=R%2...,-1.0
3,https://login-maile-924c.query9133.workers.dev/,-1.0
4,http://easywebbnpnortisbe.com/,1.0


### Data cleaning

In [12]:
def filter_df(x):
    """Return ``x`` unchanged if it parses as a URL with a host part, else NaN.

    The value is converted with ``str`` and passed to ``urlparse``. If parsing
    raises ``ValueError``, one retry is made with the final character removed.
    The original value ``x`` (not the shortened text) is returned when a
    non-empty ``netloc`` is found.

    Parameters
    ----------
    x : object
        Candidate URL; any object is accepted because it is stringified first.

    Returns
    -------
    object
        ``x`` when a network location is present; ``np.nan`` when the netloc
        is empty or when both parse attempts fail.
    """
    text = str(x)
    # Try the full text first, then the text minus its last character.
    for candidate in (text, text[:-1]):
        try:
            netloc = urlparse(candidate).netloc
        except ValueError:
            # Malformed authority (e.g. unbalanced brackets): try the next form.
            continue
        return x if netloc else np.nan
    # Neither form could be parsed, so mark the row for removal instead of
    # aborting the whole DataFrame.apply with an exception.
    return np.nan

In [13]:
df.shape

(199846, 2)

In [14]:
# Replace URLs rejected by filter_df with NaN, then drop the rows that
# contain a NaN.
df['url'] = df['url'].apply(filter_df)
df.dropna(axis=0, how='any', inplace=True)

In [15]:
df.shape

(199690, 2)

In [16]:
urlparse(df['url'][1])._asdict(), urlparse('http://198.50.135.149:80/redirecionamento/tv55lgm.php')._asdict()

({'scheme': 'https',
  'netloc': 'sites.google.com',
  'path': '/site/verifycheckpointpaqes/',
  'params': '',
  'query': '',
  'fragment': ''},
 {'scheme': 'http',
  'netloc': '198.50.135.149:80',
  'path': '/redirecionamento/tv55lgm.php',
  'params': '',
  'query': '',
  'fragment': ''})

In [17]:
def parser(x, key, type):
    """Extract one ``urlparse`` component of ``x`` as a one-element Series.

    Parameters
    ----------
    x : object
        URL-like value; it is stringified before parsing.
    key : str
        Field name of the ``urlparse`` result to extract (the cells below use
        ``'scheme'`` and ``'path'``). An unknown name raises ``KeyError``.
    type : str
        Index label given to the single value in the returned Series. The
        name shadows the builtin but is kept so keyword callers still work.

    Returns
    -------
    pandas.Series
        One value indexed by ``type``. The value is ``None`` when the URL
        cannot be parsed, even after dropping its final character.
    """
    text = str(x)
    value = None
    # Try the full text first, then the text minus its last character.
    for candidate in (text, text[:-1]):
        try:
            value = urlparse(candidate)._asdict()[key]
        except ValueError:
            # Only parse failures trigger the retry; a bad `key` (KeyError)
            # propagates immediately instead of being masked.
            continue
        break

    return pd.Series([value], index=[type])


def parse_domain(x):
    """Split the host part of a URL into subdomain, domain and TLD.

    The value is stringified, its ``netloc`` is taken from ``urlparse``, and
    that netloc is passed to ``tldextract.extract``. If ``urlparse`` raises
    ``ValueError``, one retry is made with the final character removed.

    Parameters
    ----------
    x : object
        URL-like value.

    Returns
    -------
    pandas.Series
        Indexed by ``'subdomain'``, ``'domain'`` and ``'tld'``. All three are
        ``None`` when the URL cannot be parsed on either attempt.
    """
    text = str(x)
    subdomain = domain = tld = None
    # Try the full text first, then the text minus its last character.
    for candidate in (text, text[:-1]):
        try:
            netloc = urlparse(candidate).netloc
        except ValueError:
            continue
        parts = tldextract.extract(netloc)
        subdomain, domain, tld = parts.subdomain, parts.domain, parts.suffix
        break

    return pd.Series([subdomain, domain, tld], index=['subdomain', 'domain', 'tld'])

In [18]:
# Derive per-URL features: protocol, domain parts, path and raw length.
# The helper functions are passed directly; extra arguments go through `args`.
df_protocol = df['url'].apply(parser, args=('scheme', 'protocol'))
df_domain = df['url'].apply(parse_domain)
df_path = df['url'].apply(parser, args=('path', 'path'))
df_url_length = df['url'].apply(len).rename('url_length')

In [19]:
df_domain.head()

Unnamed: 0,subdomain,domain,tld
0,,360zhileng,com
1,sites,google,com
2,hnvoaluong,faqserv,com
3,login-maile-924c.query9133,workers,dev
4,,easywebbnpnortisbe,com


In [20]:
df_protocol.head()

Unnamed: 0,protocol
0,https
1,https
2,https
3,https
4,http


In [21]:
df_path.head()

Unnamed: 0,path
0,
1,/site/verifycheckpointpaqes/
2,/ap/signin
3,/
4,/


In [22]:
df.shape, df_protocol.shape, df_domain.shape, df_path.shape, df_url_length.shape

((199690, 2), (199690, 1), (199690, 3), (199690, 1), (199690,))

In [23]:
df.head()

Unnamed: 0,url,label
0,https://360zhileng.com,1.0
1,https://sites.google.com/site/verifycheckpoint...,1.0
2,https://hnvoaluong.faqserv.com/ap/signin?p=R%2...,-1.0
3,https://login-maile-924c.query9133.workers.dev/,-1.0
4,http://easywebbnpnortisbe.com/,1.0


In [24]:
# Join the derived feature columns with the original url/label columns.
filtered_df = pd.concat(
    [df_protocol, df_domain, df_path, df_url_length, df],
    axis='columns',
)

In [25]:
filtered_df.head()

Unnamed: 0,protocol,subdomain,domain,tld,path,url_length,url,label
0,https,,360zhileng,com,,22,https://360zhileng.com,1.0
1,https,sites,google,com,/site/verifycheckpointpaqes/,52,https://sites.google.com/site/verifycheckpoint...,1.0
2,https,hnvoaluong,faqserv,com,/ap/signin,217,https://hnvoaluong.faqserv.com/ap/signin?p=R%2...,-1.0
3,https,login-maile-924c.query9133,workers,dev,/,47,https://login-maile-924c.query9133.workers.dev/,-1.0
4,http,,easywebbnpnortisbe,com,/,30,http://easywebbnpnortisbe.com/,1.0


In [26]:
def load_vectors(fname):
    """Load a text-format word-vector file into a dict.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is split on single spaces: the first token is
    the key and the remaining tokens are parsed as numbers.

    Parameters
    ----------
    fname : str or path-like
        Path to the vector file (opened as UTF-8, undecodable bytes ignored).

    Returns
    -------
    dict
        Maps each token to a 1-D ``numpy.float32`` array. Arrays replace the
        lazy ``map`` objects used before, which were empty after their first
        iteration and kept every parsed string alive.
    """
    data = {}
    # The context manager closes the file even if a line fails to parse.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        fin.readline()  # header line; its values are not needed
        for line in fin:
            tokens = line.rstrip().split(' ')
            if tokens == ['']:
                # Blank line (e.g. trailing newline at end of file).
                continue
            data[tokens[0]] = np.asarray(tokens[1:], dtype=np.float32)
    return data

In [27]:
# Read the local fastText .vec file into a dict keyed by token.
vectors = load_vectors('./fasttext_model/crawl-300d-2M.vec')