In [4]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import tldextract

Usando los datasets obtenidos y generamos podemos unirlos para generar un dataset completo para la detección de phishing.

In [5]:
data_phish_valid = pd.read_csv('Datasets/verified_online.csv')
data_phish_valid['phishing'] = np.ones(data_phish_valid.shape[0])
data_phish_valid = data_phish_valid[['url','phishing']]
data_phish_valid

Unnamed: 0,url,phishing
0,http://populaire-pro-dsp.com/PopIbet/app.html,1.0
1,https://talkingtree.staging-technologies.com/v...,1.0
2,https://yahoopdf.yolasite.com/,1.0
3,https://www4.sndc-crad-nem-inedx.scottfaraday1.cn,1.0
4,https://annzon-ihbsf-co-jp.mbhcxy.buzz/,1.0
...,...,...
11102,http://gkjx168.com/images,1.0
11103,http://www.habbocreditosparati.blogspot.com/,1.0
11104,http://creditiperhabbogratissicuro100.blogspot...,1.0
11105,http://mundovirtualhabbo.blogspot.com/2009_01_...,1.0


In [6]:
data_phish_valid.iloc[60:80]

Unnamed: 0,url,phishing
60,https://modest-cohen.46-20-34-168.plesk.page/r27/,1.0
61,http://serviceagf.temp.swtest.ru/wp-content/la...,1.0
62,https://socialmediamarkettiers.com/wp-kxjk/office,1.0
63,https://socialmediamarkettiers.com/wp-kxjk/off...,1.0
64,http://newxxporn-virals83.duckdns.org/,1.0
65,https://dennybegle.com/netbankar/mkb.hu/mkbnet...,1.0
66,https://dennybegle.com/netbankar/mkb.hu/mkbnet...,1.0
67,https://housestoneltd.com/sal/ggg/china/?login...,1.0
68,https://www.logobirds.com/wp-content/plugins/n...,1.0
69,https://www.logobirds.com/wp-content/plugins/n...,1.0


In [7]:
data_top_urls = pd.read_csv('Datasets/Dataset_Top_urls.csv')
data_top_urls = data_top_urls.iloc[:5000]
data_top_urls['phishing'] = np.zeros(data_top_urls.shape[0])
data_top_urls = data_top_urls[['URL','phishing']]
data_top_urls.columns = ['url', 'phishing']
data_top_urls

Unnamed: 0,url,phishing
0,google.com,0.0
1,youtube.com,0.0
2,facebook.com,0.0
3,baidu.com,0.0
4,wikipedia.org,0.0
...,...,...
4995,uspto.gov,0.0
4996,elastic.co,0.0
4997,vkmag.com,0.0
4998,mtime.com,0.0


Podríamos comparar dominios correctos usando estos top urls contra los dominios de los urls analizados.

In [8]:
data_top_urls['domain'] = data_top_urls['url'].apply(lambda x: tldextract.extract(x).domain)
data_top_urls.iloc[70:90]

Unnamed: 0,url,phishing,domain
70,paypal.com,0.0,paypal
71,microsoftonline.com,0.0,microsoftonline
72,google.com.tw,0.0,google
73,google.com.au,0.0,google
74,whatsapp.com,0.0,whatsapp
75,google.pl,0.0,google
76,xhamster.com,0.0,xhamster
77,detail.tmall.com,0.0,tmall
78,diply.com,0.0,diply
79,google.co.id,0.0,google


In [9]:
data_phish_sus = pd.read_csv('Datasets/Dataset_Phishing_Total.csv')
data_phish_sus['phishing'] = np.zeros(data_phish_sus.shape[0])
data_phish_sus = data_phish_sus[['URL','phishing']]
data_phish_sus.columns = ['url', 'phishing']
data_phish_sus

Unnamed: 0,url,phishing
0,https://www.paysafecard.com/fr-ch/,0.0
1,https://cncs.gob.do,0.0
2,https://eticket.migracion.gob.do/,0.0
3,https://www.indeed.com/legal?hl=en_US#tos,0.0
4,https://www.peoplemetrics.com/,0.0
...,...,...
10115,https://nam01.safelinks.protection.outlook.com...,0.0
10116,https://na01.safelinks.protection.outlook.com/...,0.0
10117,http://vodafone.myinbound.com/,0.0
10118,https://nam03.safelinks.protection.outlook.com...,0.0


Generamos el dataset completo y mezclamos las observaciones.

In [10]:
# Pruebo sin incluir top_urls
data = pd.concat([data_phish_sus, data_phish_valid])
data = data.sample(frac=1, random_state = 42).reset_index(drop=True)
#data.to_csv('Dataset_URL.csv')
data.head(10)

Unnamed: 0,url,phishing
0,https://www.powr.io/form-builder/i/27476566#page,1.0
1,http://mjaymu1hetezmtj0aa.filesusr.com/html/c6...,1.0
2,https://www.google.com/webhp?sxsrf=ACYBGNSlZQV...,0.0
3,https://sites.google.com/view/xcccjcdhasks/btc...,1.0
4,http://clouddoc-authorize.firebaseapp.com/...x...,1.0
5,https://rule.alibaba.com/rule/detail/2041.htm?...,0.0
6,http://157.245.101.68/wp-admin/cs/office2020/o...,1.0
7,https://siemik.github.io/lp_axa_34/,1.0
8,http://bit.do/fRb8y,1.0
9,https://itau.negocie-aqui.com/c/JWgn36iqE,0.0


In [11]:
data['phishing'].value_counts()

1.0    11107
0.0    10120
Name: phishing, dtype: int64

## Variable Scheme

Los 5000 urls obtenidos de alexa no tienen esta porción. Estos links podrían introducir sesgos en el análisis. Podemos analizar estos links que tienen 'spam' como scheme, pero en principio son observaciones que no se incluirían.

In [12]:
data['scheme'] = data['url'].apply(lambda x: urlparse(x).scheme)

In [13]:
data['scheme'].value_counts()

https    13051
http      8162
spams        8
spam         6
Name: scheme, dtype: int64

In [14]:
pd.crosstab(index = data['phishing'], columns = data['scheme'])

scheme,http,https,spam,spams
phishing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,3271,6835,6,8
1.0,4891,6216,0,0


## Variable Domain

Extraemos el dominio completo de los urls analizados.

In [15]:
data['domain_complete'] = data['url'].apply(lambda x: urlparse(x).netloc)

Hay dos urls que no parecerían tener dominio.

In [16]:
(data['domain_complete'] == '').sum()

2

In [17]:
data.head()

Unnamed: 0,url,phishing,scheme,domain_complete
0,https://www.powr.io/form-builder/i/27476566#page,1.0,https,www.powr.io
1,http://mjaymu1hetezmtj0aa.filesusr.com/html/c6...,1.0,http,mjaymu1hetezmtj0aa.filesusr.com
2,https://www.google.com/webhp?sxsrf=ACYBGNSlZQV...,0.0,https,www.google.com
3,https://sites.google.com/view/xcccjcdhasks/btc...,1.0,https,sites.google.com
4,http://clouddoc-authorize.firebaseapp.com/...x...,1.0,http,clouddoc-authorize.firebaseapp.com


In [18]:
data['domain_complete']

0                               www.powr.io
1           mjaymu1hetezmtj0aa.filesusr.com
2                            www.google.com
3                          sites.google.com
4        clouddoc-authorize.firebaseapp.com
                        ...                
21222          grubbokepnew2021.duckdns.org
21223     xserver-ne-jp.omegapediatrics.com
21224                       games-box.do.am
21225                   underthedoormat.com
21226         www.beloanvi333.byethost7.com
Name: domain_complete, Length: 21227, dtype: object

Del dominio completo se puede extraer el sufijo, el dominio y el subdominio. 

* Con el sufijo se puede armar una variable categórica. 
* Revisar el dominio y subdominio (se podrían analizar juntos o separados).

In [19]:
data['domain'] = data['domain_complete'].apply(lambda x: tldextract.extract(x).domain)
data['subdomain'] = data['domain_complete'].apply(lambda x: tldextract.extract(x).subdomain)
data['suffix'] = data['domain_complete'].apply(lambda x: tldextract.extract(x).suffix)

Podemos eliminar los prefijos "www." de el subdomain y crear una variable de domain + subdomain. Esto se puede realizar porque "entiendo" que todos los urls se pueden escribir con o sin 'www' por lo que incluirlos en el análisis puede afectar innecesariamente.

In [20]:
data['subdomain'] = data['subdomain'].str.replace('www.', '')
data['subdomain'] = data['subdomain'].str.replace('www', '')

  data['subdomain'] = data['subdomain'].str.replace('www.', '')


In [21]:
data['subdomain'] = data['subdomain'] + '.'
data['subdomain'] = data['subdomain'].replace('.', '')

In [22]:
data['domain_subdomain'] = data['subdomain'] + data['domain']

Para el sufijo se pueden generar variables categóricas incluyendo a todas las clases poco frecuentes en una categoría "Otros".

In [23]:
data['suffix'].value_counts()

com           11238
net            1073
ru              784
org             770
com.br          323
              ...  
ps                1
info.hu           1
ink               1
na                1
industries        1
Name: suffix, Length: 386, dtype: int64

## Variable Path

In [24]:
data['path'] = data['url'].apply(lambda x: urlparse(x).path)

In [25]:
print((data['path'] == '').sum())
print((data['path'] == '/').sum())

899
5188


In [26]:
data['url'].str.count('\\=').value_counts()

0     16006
1      2438
2       731
3       530
4       510
6       257
5       256
7       145
8        72
10       61
9        60
11       48
12       42
13       19
20       13
14       12
17        6
15        5
25        5
18        4
16        2
19        2
24        1
21        1
23        1
Name: url, dtype: int64

In [27]:
# Variables del dominio
# Cuenta los puntos
data['domain_subdomain'].str.count('\\.')
data['domain_subdomain'].str.count('\\-')
# Cuenta el largo
data['domain_subdomain'].str.len()
# Cuenta vocales
data['domain_subdomain'].str.lower().str.count(r'[aeiou]')
# Cuenta consonantes
data['domain_subdomain'].str.lower().str.count(r'[a-z]') - data['domain_vocales']
# Cuenta números
data['domain_subdomain'].str.count('\d')

KeyError: 'domain_vocales'

In [None]:
data['domain_subdomain'].str.count('\\.').value_counts()

In [None]:
data.iloc[40:60]

# Prueba Distancia de Strings

In [None]:
import jellyfish

Link de la documentación de la libreria.

https://jellyfish.readthedocs.io/en/latest/comparison.html

Link de la explicación de Jaro Distance.

https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance


In [None]:
links_url = [
                'amazon.com', 
                'amazon.co.safeamazonsecure.icu', 
                'aoinamozm.servebeer.com', 
                'breakevents.de', 
                'spectralwirejewelry.com', 
                'eim.ae.iwc.static.royalgatetransport.ae',
                'www.amazonlogistics.eu',
                'www.amazon.fr'
            ]

for link in links_url:
    print(tldextract.extract(link))

La mejor métrica de similitud entre strings parece ser la distancia Jaro - Winkler.

In [None]:
for link in links_url:
    print(f'Link: {link} - metric {jellyfish.jaro_distance(tldextract.extract(link).domain+tldextract.extract(link).subdomain, "amazon")}')

In [None]:
for link in links_url:
    print(f'Link: {link} - metric {jellyfish.jaro_winkler(tldextract.extract(link).domain+tldextract.extract(link).subdomain, "amazon")}')

In [None]:
for link in links_url:
    print(f'Link: {link} - metric {jellyfish.hamming_distance(link, "amazon.com")}')

In [None]:
for link in links_url:
    print(f'Link: {link} - metric {jellyfish.levenshtein_distance(link, "amazon.com")}')

In [None]:
for link in links_url:
    print(f'Link: {link} - metric {jellyfish.damerau_levenshtein_distance(link, "amazon.com")}')

In [None]:
print(jellyfish.jaro_distance('amazon.com', 'amazon.com'))
print(jellyfish.jaro_distance('amazon.co.safeamazonsecure.icu', 'amazon.com'))
print(jellyfish.jaro_distance('aoinamozm.servebeer.com', 'amazon.com'))
print(jellyfish.jaro_distance('breakevents.de', 'amazon.com'))
print(jellyfish.jaro_distance('spectralwirejewelry.com', 'amazon.com'))
print(jellyfish.jaro_distance('eim.ae.iwc.static.royalgatetransport.ae', 'amazon.com'))
print(jellyfish.jaro_distance('amazon.co.jp', 'amazon.com'))
print(jellyfish.jaro_distance('www.amazonlogistics.eu', 'amazon.com'))
print(jellyfish.jaro_distance('www.amazon.fr', 'amazon.com'))

In [None]:
row = 40
data.iloc[row:row+20]

In [None]:
data_top_urls.head(20)

In [None]:
data_top_urls['domain']

In [None]:
metric_domains = ['amazon', 'google', 'whatsapp', 'instagram', 'twitter', 'facebook', 'yahoo']

for domain in metric_domains:
    data['metric_'+domain] = data['domain_subdomain'].apply(lambda x: jellyfish.jaro_distance(x, domain))

In [None]:
domain_check = 'whatsapp'
row = 20
data[['phishing', 'domain_subdomain', 'metric_'+domain_check, 'url']].sort_values('metric_'+domain_check, ascending = False).iloc[row:row+20]

In [None]:
data[['phishing', 'domain_subdomain', 'metric_whatsapp']].sort_values('metric_whatsapp', ascending = False).iloc[0:20]