In [1]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import tldextract

Usando los datasets obtenidos y generamos podemos unirlos para generar un dataset completo para la detección de phishing.

In [2]:
data_phish_valid = pd.read_csv('Datasets/verified_online.csv')
data_phish_valid['phishing'] = np.ones(data_phish_valid.shape[0])
data_phish_valid = data_phish_valid[['url','phishing']]
data_phish_valid

Unnamed: 0,url,phishing
0,http://populaire-pro-dsp.com/PopIbet/app.html,1.0
1,https://talkingtree.staging-technologies.com/v...,1.0
2,https://yahoopdf.yolasite.com/,1.0
3,https://www4.sndc-crad-nem-inedx.scottfaraday1.cn,1.0
4,https://annzon-ihbsf-co-jp.mbhcxy.buzz/,1.0
...,...,...
11102,http://gkjx168.com/images,1.0
11103,http://www.habbocreditosparati.blogspot.com/,1.0
11104,http://creditiperhabbogratissicuro100.blogspot...,1.0
11105,http://mundovirtualhabbo.blogspot.com/2009_01_...,1.0


In [3]:
data_phish_valid.iloc[60:80]

Unnamed: 0,url,phishing
60,https://modest-cohen.46-20-34-168.plesk.page/r27/,1.0
61,http://serviceagf.temp.swtest.ru/wp-content/la...,1.0
62,https://socialmediamarkettiers.com/wp-kxjk/office,1.0
63,https://socialmediamarkettiers.com/wp-kxjk/off...,1.0
64,http://newxxporn-virals83.duckdns.org/,1.0
65,https://dennybegle.com/netbankar/mkb.hu/mkbnet...,1.0
66,https://dennybegle.com/netbankar/mkb.hu/mkbnet...,1.0
67,https://housestoneltd.com/sal/ggg/china/?login...,1.0
68,https://www.logobirds.com/wp-content/plugins/n...,1.0
69,https://www.logobirds.com/wp-content/plugins/n...,1.0


In [4]:
data_top_urls = pd.read_csv('Datasets/Dataset_Top_urls.csv')
data_top_urls = data_top_urls.iloc[:5000]
data_top_urls['phishing'] = np.zeros(data_top_urls.shape[0])
data_top_urls = data_top_urls[['URL','phishing']]
data_top_urls.columns = ['url', 'phishing']
data_top_urls

Unnamed: 0,url,phishing
0,google.com,0.0
1,youtube.com,0.0
2,facebook.com,0.0
3,baidu.com,0.0
4,wikipedia.org,0.0
...,...,...
4995,uspto.gov,0.0
4996,elastic.co,0.0
4997,vkmag.com,0.0
4998,mtime.com,0.0


Podríamos comparar dominios correctos usando estos top urls contra los dominios de los urls analizados.

In [5]:
data_top_urls['domain'] = data_top_urls['url'].apply(lambda x: tldextract.extract(x).domain)
data_top_urls.iloc[70:90]

Unnamed: 0,url,phishing,domain
70,paypal.com,0.0,paypal
71,microsoftonline.com,0.0,microsoftonline
72,google.com.tw,0.0,google
73,google.com.au,0.0,google
74,whatsapp.com,0.0,whatsapp
75,google.pl,0.0,google
76,xhamster.com,0.0,xhamster
77,detail.tmall.com,0.0,tmall
78,diply.com,0.0,diply
79,google.co.id,0.0,google


In [6]:
data_phish_sus = pd.read_csv('Datasets/Dataset_Phishing_Total.csv')
data_phish_sus['phishing'] = np.zeros(data_phish_sus.shape[0])
data_phish_sus = data_phish_sus[['URL','phishing']]
data_phish_sus.columns = ['url', 'phishing']
data_phish_sus

Unnamed: 0,url,phishing
0,https://www.paysafecard.com/fr-ch/,0.0
1,https://cncs.gob.do,0.0
2,https://eticket.migracion.gob.do/,0.0
3,https://www.indeed.com/legal?hl=en_US#tos,0.0
4,https://www.peoplemetrics.com/,0.0
...,...,...
10115,https://nam01.safelinks.protection.outlook.com...,0.0
10116,https://na01.safelinks.protection.outlook.com/...,0.0
10117,http://vodafone.myinbound.com/,0.0
10118,https://nam03.safelinks.protection.outlook.com...,0.0


Generamos el dataset completo y mezclamos las observaciones.

In [57]:
# Pruebo sin incluir top_urls
data = pd.concat([data_phish_sus, data_phish_valid])
data = data.sample(frac=1, random_state = 42).reset_index(drop=True)
#data.to_csv('Dataset_URL.csv')
data.head(10)

Unnamed: 0,url,phishing
0,https://www.powr.io/form-builder/i/27476566#page,1.0
1,http://mjaymu1hetezmtj0aa.filesusr.com/html/c6...,1.0
2,https://www.google.com/webhp?sxsrf=ACYBGNSlZQV...,0.0
3,https://sites.google.com/view/xcccjcdhasks/btc...,1.0
4,http://clouddoc-authorize.firebaseapp.com/...x...,1.0
5,https://rule.alibaba.com/rule/detail/2041.htm?...,0.0
6,http://157.245.101.68/wp-admin/cs/office2020/o...,1.0
7,https://siemik.github.io/lp_axa_34/,1.0
8,http://bit.do/fRb8y,1.0
9,https://itau.negocie-aqui.com/c/JWgn36iqE,0.0


In [8]:
data['phishing'].value_counts()

1.0    11107
0.0    10120
Name: phishing, dtype: int64

## Variable Scheme

Los 5000 urls obtenidos de alexa no tienen esta porción. Estos links podrían introducir sesgos en el análisis. Podemos analizar estos links que tienen 'spam' como scheme, pero en principio son observaciones que no se incluirían.

In [9]:
data['scheme'] = data['url'].apply(lambda x: urlparse(x).scheme)

In [10]:
data['scheme'].value_counts()

https    13051
http      8162
spams        8
spam         6
Name: scheme, dtype: int64

In [11]:
pd.crosstab(index = data['phishing'], columns = data['scheme'])

scheme,http,https,spam,spams
phishing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,3271,6835,6,8
1.0,4891,6216,0,0


## Variable Domain

Extraemos el dominio completo de los urls analizados.

In [12]:
data['domain_complete'] = data['url'].apply(lambda x: urlparse(x).netloc)

Hay dos urls que no parecerían tener dominio.

In [13]:
(data['domain_complete'] == '').sum()

2

In [26]:
data.head()

Unnamed: 0,url,phishing,scheme,domain_complete,domain,subdomain,suffix,domain_subdomain,path
0,https://www.powr.io/form-builder/i/27476566#page,1.0,https,www.powr.io,powr,,io,powr,/form-builder/i/27476566
1,http://mjaymu1hetezmtj0aa.filesusr.com/html/c6...,1.0,http,mjaymu1hetezmtj0aa.filesusr.com,filesusr,mjaymu1hetezmtj0aa.,com,mjaymu1hetezmtj0aa.filesusr,/html/c69417_3069841d505568614ed8bca153fc7adf....
2,https://www.google.com/webhp?sxsrf=ACYBGNSlZQV...,0.0,https,www.google.com,google,,com,google,/webhp
3,https://sites.google.com/view/xcccjcdhasks/btc...,1.0,https,sites.google.com,google,sites.,com,sites.google,/view/xcccjcdhasks/btconnect
4,http://clouddoc-authorize.firebaseapp.com/...x...,1.0,http,clouddoc-authorize.firebaseapp.com,firebaseapp,clouddoc-authorize.,com,clouddoc-authorize.firebaseapp,/...xxx........./


In [14]:
data['domain_complete']

0                               www.powr.io
1           mjaymu1hetezmtj0aa.filesusr.com
2                            www.google.com
3                          sites.google.com
4        clouddoc-authorize.firebaseapp.com
                        ...                
21222          grubbokepnew2021.duckdns.org
21223     xserver-ne-jp.omegapediatrics.com
21224                       games-box.do.am
21225                   underthedoormat.com
21226         www.beloanvi333.byethost7.com
Name: domain_complete, Length: 21227, dtype: object

Del dominio completo se puede extraer el sufijo, el dominio y el subdominio. 

* Con el sufijo se puede armar una variable categórica. 
* Revisar el dominio y subdominio (se podrían analizar juntos o separados).

In [16]:
data['domain'] = data['domain_complete'].apply(lambda x: tldextract.extract(x).domain)
data['subdomain'] = data['domain_complete'].apply(lambda x: tldextract.extract(x).subdomain)
data['suffix'] = data['domain_complete'].apply(lambda x: tldextract.extract(x).suffix)

Podemos eliminar los prefijos "www." de el subdomain y crear una variable de domain + subdomain. Esto se puede realizar porque "entiendo" que todos los urls se pueden escribir con o sin 'www' por lo que incluirlos en el análisis puede afectar innecesariamente.

In [18]:
data['subdomain'] = data['subdomain'].str.replace('www.', '')
data['subdomain'] = data['subdomain'].str.replace('www', '')

In [19]:
data['subdomain'] = data['subdomain'] + '.'
data['subdomain'] = data['subdomain'].replace('.', '')

In [20]:
data['domain_subdomain'] = data['subdomain'] + data['domain']

Para el sufijo se pueden generar variables categóricas incluyendo a todas las clases poco frecuentes en una categoría "Otros".

In [22]:
data['suffix'].value_counts()

com       11238
net        1073
ru          784
org         770
com.br      323
          ...  
media         1
ls            1
pub           1
nc            1
sydney        1
Name: suffix, Length: 386, dtype: int64

## Variable Path

In [23]:
data['path'] = data['url'].apply(lambda x: urlparse(x).path)

In [24]:
print((data['path'] == '').sum())
print((data['path'] == '/').sum())

899
5188


In [25]:
data['url'].str.count('\\=').value_counts()

0     16006
1      2438
2       731
3       530
4       510
6       257
5       256
7       145
8        72
10       61
9        60
11       48
12       42
13       19
20       13
14       12
17        6
25        5
15        5
18        4
19        2
16        2
21        1
23        1
24        1
Name: url, dtype: int64

In [249]:
# Variables del dominio
# Cuenta los puntos
data['domain_subdomain'].str.count('\\.')
data['domain_subdomain'].str.count('\\-')
# Cuenta el largo
data['domain_subdomain'].str.len()
# Cuenta vocales
data['domain_subdomain'].str.lower().str.count(r'[aeiou]')
# Cuenta consonantes
data['domain_subdomain'].str.lower().str.count(r'[a-z]') - data['domain_vocales']
# Cuenta números
data['domain_subdomain'].str.count('\d')

0     17232
1      1089
2       740
3       611
4       355
6       218
7       160
10      153
5       144
8       109
12      106
11      103
9        87
13       19
14       17
15       14
22       13
27       12
21        9
19        8
18        6
23        5
28        3
17        3
20        2
34        2
16        2
24        1
36        1
25        1
30        1
31        1
Name: domain_subdomain, dtype: int64

In [30]:
data['domain_subdomain'].str.count('\\.').value_counts()

0     12174
1      6790
2      1058
3       728
5       238
6       148
4        87
10        2
7         1
18        1
Name: domain_subdomain, dtype: int64

In [240]:
data.iloc[40:60]

Unnamed: 0,url,phishing,scheme,domain_complete,domain,subdomain,suffix,domain_subdomain,path,metric_amazon,metric_paypal,metric_whatsapp,metric_google,metric_instagram,metric_twitter,metric_facebook,metric_yahoo
40,https://vipchanger.com,0.0,https,vipchanger.com,vipchanger,,com,vipchanger,,0.511111,0.511111,0.447222,0.511111,0.614815,0.57619,0.316667,0.366667
41,http://myee-billing-info.com/,1.0,http,myee-billing-info.com,myee-billing-info,,com,myee-billing-info,/,0.48366,0.48366,0.0,0.316993,0.446623,0.30112,0.455882,0.419608
42,http://bupsdd.com/NEW/amazon/,1.0,http,bupsdd.com,bupsdd,,com,bupsdd,/NEW/amazon/,0.0,0.444444,0.430556,0.0,0.425926,0.0,0.0,0.0
43,http://servernuovaintesa.com/,1.0,http,servernuovaintesa.com,servernuovaintesa,,com,servernuovaintesa,/,0.316993,0.408497,0.455882,0.316993,0.393246,0.467787,0.455882,0.419608
44,https://www.disneyworld.eu/profile/,0.0,https,www.disneyworld.eu,disneyworld,,eu,disneyworld,/profile/,0.338384,0.505051,0.405303,0.419192,0.51936,0.5671,0.477273,0.430303
45,https://www.orange.jo/ar/pages/default.aspx,0.0,https,www.orange.jo,orange,,jo,orange,/ar/pages/default.aspx,0.555556,0.444444,0.430556,0.666667,0.5,0.436508,0.527778,0.455556
46,https://www.m.psservlces.runescape.com-m.ru/,1.0,https,www.m.psservlces.runescape.com-m.ru,com-m,m.psservlces.runescape.,ru,m.psservlces.runescape.com-m,/,0.468254,0.468254,0.382937,0.301587,0.362434,0.452381,0.27381,0.0
47,https://www.proprofs.com/survey/preview.php?ti...,1.0,https,www.proprofs.com,proprofs,,com,proprofs,/survey/preview.php,0.430556,0.527778,0.333333,0.527778,0.412037,0.422619,0.5,0.55
48,http://192.254.66.115/,0.0,http,192.254.66.115,192.254.66.115,,,192.254.66.115,/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49,https://support.arcticwolf.com/attachments/tok...,0.0,https,support.arcticwolf.com,arcticwolf,support.,com,support.arcticwolf,/attachments/token/vWE7IepA5YfH7mkeignKMyZTO/,0.314815,0.444444,0.500926,0.407407,0.472222,0.420635,0.402778,0.337037


# Prueba Distancia de Strings

In [51]:
import jellyfish

Link de la documentación de la libreria.

https://jellyfish.readthedocs.io/en/latest/comparison.html

Link de la explicación de Jaro Distance.

https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance


In [53]:
links_url = [
                'amazon.com', 
                'amazon.co.safeamazonsecure.icu', 
                'aoinamozm.servebeer.com', 
                'breakevents.de', 
                'spectralwirejewelry.com', 
                'eim.ae.iwc.static.royalgatetransport.ae',
                'www.amazonlogistics.eu',
                'www.amazon.fr'
            ]

for link in links_url:
    print(tldextract.extract(link))

ExtractResult(subdomain='', domain='amazon', suffix='com')
ExtractResult(subdomain='amazon.co', domain='safeamazonsecure', suffix='icu')
ExtractResult(subdomain='aoinamozm', domain='servebeer', suffix='com')
ExtractResult(subdomain='', domain='breakevents', suffix='de')
ExtractResult(subdomain='', domain='spectralwirejewelry', suffix='com')
ExtractResult(subdomain='eim.ae.iwc.static', domain='royalgatetransport', suffix='ae')
ExtractResult(subdomain='www', domain='amazonlogistics', suffix='eu')
ExtractResult(subdomain='www', domain='amazon', suffix='fr')


La mejor métrica de similitud entre strings parece ser la distancia Jaro - Winkler.

In [54]:
for link in links_url:
    print(f'Link: {link} - metric {jellyfish.jaro_distance(tldextract.extract(link).domain+tldextract.extract(link).subdomain, "amazon")}')

Link: amazon.com - metric 1.0
Link: amazon.co.safeamazonsecure.icu - metric 0.6911111111111111
Link: aoinamozm.servebeer.com - metric 0.5555555555555555
Link: breakevents.de - metric 0.5050505050505051
Link: spectralwirejewelry.com - metric 0.4064327485380117
Link: eim.ae.iwc.static.royalgatetransport.ae - metric 0.5103174603174603
Link: www.amazonlogistics.eu - metric 0.7777777777777777
Link: www.amazon.fr - metric 0.8888888888888888


In [55]:
for link in links_url:
    print(f'Link: {link} - metric {jellyfish.jaro_winkler(tldextract.extract(link).domain+tldextract.extract(link).subdomain, "amazon")}')

Link: amazon.com - metric 1.0
Link: amazon.co.safeamazonsecure.icu - metric 0.6911111111111111
Link: aoinamozm.servebeer.com - metric 0.5555555555555555
Link: breakevents.de - metric 0.5050505050505051
Link: spectralwirejewelry.com - metric 0.4064327485380117
Link: eim.ae.iwc.static.royalgatetransport.ae - metric 0.5103174603174603
Link: www.amazonlogistics.eu - metric 0.8666666666666666
Link: www.amazon.fr - metric 0.9333333333333333


In [91]:
for link in links_url:
    print(f'Link: {link} - metric {jellyfish.hamming_distance(link, "amazon.com")}')

Link: amazon.com - metric 0
Link: amazon.co.safeamazonsecure.icu - metric 21
Link: aoinamozm.servebeer.com - metric 22
Link: breakevents.de - metric 14
Link: spectralwirejewelry.com - metric 23
Link: eim.ae.iwc.static.royalgatetransport.ae - metric 38
Link: www.amazonlogistics.eu - metric 21
Link: www.amazon.fr - metric 12


In [88]:
for link in links_url:
    print(f'Link: {link} - metric {jellyfish.levenshtein_distance(link, "amazon.com")}')

Link: amazon.com - metric 0
Link: amazon.co.safeamazonsecure.icu - metric 20
Link: aoinamozm.servebeer.com - metric 16
Link: breakevents.de - metric 12
Link: spectralwirejewelry.com - metric 18
Link: eim.ae.iwc.static.royalgatetransport.ae - metric 34
Link: www.amazonlogistics.eu - metric 15
Link: www.amazon.fr - metric 7


In [89]:
for link in links_url:
    print(f'Link: {link} - metric {jellyfish.damerau_levenshtein_distance(link, "amazon.com")}')

Link: amazon.com - metric 0
Link: amazon.co.safeamazonsecure.icu - metric 20
Link: aoinamozm.servebeer.com - metric 16
Link: breakevents.de - metric 12
Link: spectralwirejewelry.com - metric 18
Link: eim.ae.iwc.static.royalgatetransport.ae - metric 34
Link: www.amazonlogistics.eu - metric 15
Link: www.amazon.fr - metric 7


In [71]:
print(jellyfish.jaro_distance('amazon.com', 'amazon.com'))
print(jellyfish.jaro_distance('amazon.co.safeamazonsecure.icu', 'amazon.com'))
print(jellyfish.jaro_distance('aoinamozm.servebeer.com', 'amazon.com'))
print(jellyfish.jaro_distance('breakevents.de', 'amazon.com'))
print(jellyfish.jaro_distance('spectralwirejewelry.com', 'amazon.com'))
print(jellyfish.jaro_distance('eim.ae.iwc.static.royalgatetransport.ae', 'amazon.com'))
print(jellyfish.jaro_distance('amazon.co.jp', 'amazon.com'))
print(jellyfish.jaro_distance('www.amazonlogistics.eu', 'amazon.com'))
print(jellyfish.jaro_distance('www.amazon.fr', 'amazon.com'))

1.0
0.7777777777777777
0.6156199677938808
0.5047619047619047
0.26231884057971017
0.47350427350427343
0.8833333333333333
0.6215488215488215
0.6032967032967033


In [186]:
row = 40
data.iloc[row:row+20]

Unnamed: 0,url,phishing,scheme,domain_complete,domain,subdomain,suffix,domain_subdomain,path,metric_amazon,metric_paypal,metric_whatsapp,metric_google,metric_instagram,metric_twitter,metric_facebook,metric_yahoo
40,https://vipchanger.com,0.0,https,vipchanger.com,vipchanger,,com,vipchanger,,0.511111,0.511111,0.447222,0.511111,0.614815,0.57619,0.316667,0.366667
41,http://myee-billing-info.com/,1.0,http,myee-billing-info.com,myee-billing-info,,com,myee-billing-info,/,0.48366,0.48366,0.0,0.316993,0.446623,0.30112,0.455882,0.419608
42,http://bupsdd.com/NEW/amazon/,1.0,http,bupsdd.com,bupsdd,,com,bupsdd,/NEW/amazon/,0.0,0.444444,0.430556,0.0,0.425926,0.0,0.0,0.0
43,http://servernuovaintesa.com/,1.0,http,servernuovaintesa.com,servernuovaintesa,,com,servernuovaintesa,/,0.316993,0.408497,0.455882,0.316993,0.393246,0.467787,0.455882,0.419608
44,https://www.disneyworld.eu/profile/,0.0,https,www.disneyworld.eu,disneyworld,,eu,disneyworld,/profile/,0.338384,0.505051,0.405303,0.419192,0.51936,0.5671,0.477273,0.430303
45,https://www.orange.jo/ar/pages/default.aspx,0.0,https,www.orange.jo,orange,,jo,orange,/ar/pages/default.aspx,0.555556,0.444444,0.430556,0.666667,0.5,0.436508,0.527778,0.455556
46,https://www.m.psservlces.runescape.com-m.ru/,1.0,https,www.m.psservlces.runescape.com-m.ru,com-m,m.psservlces.runescape.,ru,m.psservlces.runescape.com-m,/,0.468254,0.468254,0.382937,0.301587,0.362434,0.452381,0.27381,0.0
47,https://www.proprofs.com/survey/preview.php?ti...,1.0,https,www.proprofs.com,proprofs,,com,proprofs,/survey/preview.php,0.430556,0.527778,0.333333,0.527778,0.412037,0.422619,0.5,0.55
48,http://192.254.66.115/,0.0,http,192.254.66.115,192.254.66.115,,,192.254.66.115,/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49,https://support.arcticwolf.com/attachments/tok...,0.0,https,support.arcticwolf.com,arcticwolf,support.,com,support.arcticwolf,/attachments/token/vWE7IepA5YfH7mkeignKMyZTO/,0.314815,0.444444,0.500926,0.407407,0.472222,0.420635,0.402778,0.337037


In [58]:
data_top_urls.head(20)

Unnamed: 0,url,phishing,domain
0,google.com,0.0,google
1,youtube.com,0.0,youtube
2,facebook.com,0.0,facebook
3,baidu.com,0.0,baidu
4,wikipedia.org,0.0,wikipedia
5,yahoo.com,0.0,yahoo
6,google.co.in,0.0,google
7,reddit.com,0.0,reddit
8,qq.com,0.0,qq
9,amazon.com,0.0,amazon


In [49]:
data_top_urls['domain']

0             google
1            youtube
2           facebook
3              baidu
4          wikipedia
            ...     
4995           uspto
4996         elastic
4997           vkmag
4998           mtime
4999    mercadolibre
Name: domain, Length: 5000, dtype: object

In [59]:
metric_domains = ['amazon', 'google', 'whatsapp', 'instagram', 'twitter', 'facebook', 'yahoo']

for domain in metric_domains:
    data['metric_'+domain] = data['domain_subdomain'].apply(lambda x: jellyfish.jaro_distance(x, domain))

KeyError: 'domain_subdomain'

In [56]:
domain_check = 'whatsapp'
row = 20
data[['phishing', 'domain_subdomain', 'metric_'+domain_check, 'url']].sort_values('metric_'+domain_check, ascending = False).iloc[row:row+20]

Unnamed: 0,phishing,domain_subdomain,metric_whatsapp,url
7402,1.0,join-whatsappk8wh.xxuz,0.787879,http://join-whatsappk8wh.xxuz.com/
2788,1.0,whatsapp-grup93.duckdns,0.782609,https://whatsapp-grup93.duckdns.org/
21169,1.0,alsnapp,0.779762,http://www.alsnapp.com/AF/
15377,1.0,whatshapgrup2021.duckdns,0.777778,http://whatshapgrup2021.duckdns.org/
546,0.0,whatsapp.fr.downloadastro,0.773333,https://whatsapp.fr.downloadastro.com/
10555,1.0,join-whatapp.otzo,0.762255,http://www.join-whatapp.otzo.com/
10163,1.0,grubwhatsapp-hot2021.duckdns,0.761905,http://grubwhatsapp-hot2021.duckdns.org/
15486,1.0,grub-whatsapp18viral.duckdns,0.761905,http://grub-whatsapp18viral.duckdns.org/
5812,0.0,whatsapp-recovery.en.softonic,0.758621,https://whatsapp-recovery.en.softonic.com/
13636,0.0,whatsapp-messenger.id.uptodown,0.755556,https://whatsapp-messenger.id.uptodown.com/and...


In [245]:
data[['phishing', 'domain_subdomain', 'metric_whatsapp']].sort_values('metric_whatsapp', ascending = False).iloc[0:20]

Unnamed: 0,phishing,domain_subdomain,metric_whatsapp
16250,0.0,whatsapp,1.0
11044,0.0,whatsapp,1.0
3357,0.0,whatsappturk,0.888889
5737,0.0,web.whatsapp,0.888889
14747,0.0,web.whatsapp,0.888889
20315,0.0,web.whatsapp,0.888889
2153,0.0,web.whatsapp,0.888889
5935,0.0,whatsapp-free,0.871795
6409,0.0,whatsappstatus,0.857143
16750,1.0,whatsapp-18.ikwb,0.833333
