In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [36]:
data = pd.read_csv("./data/malicious_phish.csv")
data

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


In [37]:
import plotly.express as px

v_c = data.groupby("type").count()

fig = px.bar(v_c, x=v_c.index, y="url")
fig.show()


# Data exploration and formating

## Email length

In [38]:
data["length"] = data["url"].str.len()
data

Unnamed: 0,url,type,length
0,br-icloud.com.br,phishing,16
1,mp3raid.com/music/krizz_kaliko.html,benign,35
2,bopsecrets.org/rexroth/cr/1.htm,benign,31
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235
...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45


In [39]:
v_c = data.groupby("length").count()
v_c = v_c[v_c.url > 700]
fig = px.bar(v_c, x=v_c.index, y="url")
fig.show()

In [40]:
v_c = data.groupby(["length", "type"]).count()
v_c["type"] = [v_c.index.values[i][1] for i in range(len(v_c))]
v_c

Unnamed: 0_level_0,Unnamed: 1_level_0,url,type
length,type,Unnamed: 2_level_1,Unnamed: 3_level_1
1,phishing,1,phishing
2,phishing,2,phishing
4,phishing,1,phishing
5,phishing,4,phishing
6,benign,1,benign
...,...,...,...
1641,phishing,1,phishing
1696,benign,1,benign
1779,phishing,1,phishing
2081,benign,1,benign


In [41]:
temp = v_c.iloc[:600]
fig = px.bar(temp, x=[v_c.index.values[i][0] for i in range(len(temp))], y="url", color = "type")
fig.show()

# Letters

In [42]:
data["letters"] = data.url.str.count(r"\w")
data

Unnamed: 0,url,type,length,letters
0,br-icloud.com.br,phishing,16,13
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222
...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39


# Numbers

In [43]:
data["numbers"] = data.url.str.count(r"\d")
data

Unnamed: 0,url,type,length,letters,numbers
0,br-icloud.com.br,phishing,16,13,0
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22
...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0


## special char

In [44]:
import re
data["n_special_char"] = data.url.str.replace(r"\w", '', regex = True).str.len()
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char
0,br-icloud.com.br,phishing,16,13,0,3
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13
...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6


In [45]:
v_c = data.groupby(["n_special_char", "type"]).count()
v_c["type"] = [v_c.index.values[i][1] for i in range(len(v_c))]
temp = v_c.iloc[:100]
fig = px.bar(temp, x=[v_c.index.values[i][0] for i in range(len(temp))], y="url", color = "type")
fig.show()

In [46]:
from urllib.parse import urlparse

def is_https(url):
    scheme = urlparse(url).scheme
    if scheme == 'https':
        return 1
    else:
        return 0

In [47]:
data["https"] = data.url.apply(is_https)
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https
0,br-icloud.com.br,phishing,16,13,0,3,0
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0
...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0


In [48]:
def is_shortenend(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if match:
        return 1
    else:
        return 0

In [49]:
data["short"] = data.url.apply(is_shortenend)

In [50]:
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https,short
0,br-icloud.com.br,phishing,16,13,0,3,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0,0
...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0,1
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0,1
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0,0


In [51]:
v_c = data.groupby(["short", "type"]).count()
v_c["type"] = [v_c.index.values[i][1] for i in range(len(v_c))]
temp = v_c
fig = px.bar(temp, x=[v_c.index.values[i][0] for i in range(len(temp))], y="url", color = "type")
fig.show()

In [52]:
suspects_words = r"free|paid|bank|account|paypal|login|reset|password|bill|pay|card|assistance|service"

def has_suspect_word(url):
    if re.match(pattern=suspects_words, string=url):
        return 1
    else :
        return 0

In [53]:
data["suspects_words"] = data.url.apply(has_suspect_word)
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https,short,suspects_words
0,br-icloud.com.br,phishing,16,13,0,3,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0,0,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0,0,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0,0,0
...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0,0,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0,1,0
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0,1,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0,0,0


In [54]:
def use_ip(url):
    if urlparse(url).netloc:
        return 1 
    else:
        return 0

In [55]:
data["use_ip"] = data.url.apply(use_ip)
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https,short,suspects_words,use_ip
0,br-icloud.com.br,phishing,16,13,0,3,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0,0,0,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0,0,0,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0,0,0,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0,1,0,0
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0,1,0,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0,0,0,0


In [56]:
def url_has_port_in_string(url):
    has_port = urlparse(url).netloc.split(':')
    if len(has_port) > 1 and has_port[-1].isdigit():
        return 1
    else:
        return 0

In [57]:
data["has_port"] = data.url.apply(url_has_port_in_string)
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https,short,suspects_words,use_ip,has_port
0,br-icloud.com.br,phishing,16,13,0,3,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0,0,0,0,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0,0,0,1,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0,0,0,0,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0,1,0,0,0
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0,1,0,0,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0,0,0,0,0


In [58]:
def number_of_fragments(url):
    frags = urlparse(url).fragment
    return len(frags.split('#')) - 1 if frags == '' else 0

In [59]:
data["number_fragment"] = data.url.apply(number_of_fragments)
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https,short,suspects_words,use_ip,has_port,number_fragment
0,br-icloud.com.br,phishing,16,13,0,3,0,0,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0,0,0,0,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0,0,0,0,0,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0,0,0,1,0,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0,0,0,0,0,0
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0,1,0,0,0,0
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0,1,0,0,0,0
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0,0,0,0,0,0


In [60]:
from tld import get_tld, is_tld
def extract_pri_domain(url):
    try:
        res = get_tld(url, as_object = True, fail_silently=False,fix_protocol=True)
        pri_domain= res.parsed_url.netloc
    except :
        pri_domain= "None"
    return pri_domain

In [61]:
data["domain"] = data.url.apply(extract_pri_domain)
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https,short,suspects_words,use_ip,has_port,number_fragment,domain
0,br-icloud.com.br,phishing,16,13,0,3,0,0,0,0,0,0,br-icloud.com.br
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0,0,0,0,0,0,mp3raid.com
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0,0,0,0,0,0,bopsecrets.org
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0,0,0,1,0,0,www.garage-pirenne.be
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0,0,0,1,0,0,adventure-nicaragua.net
...,...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0,0,0,0,0,0,xbox360.ign.com
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0,1,0,0,0,0,games.teamxbox.com
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0,1,0,0,0,0,www.gamespot.com
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0,0,0,0,0,0,en.wikipedia.org


In [62]:
def get_url_region(primary_domain):
    ccTLD_to_region = {
    ".ac": "Ascension Island",
    ".ad": "Andorra",
    ".ae": "United Arab Emirates",
    ".af": "Afghanistan",
    ".ag": "Antigua and Barbuda",
    ".ai": "Anguilla",
    ".al": "Albania",
    ".am": "Armenia",
    ".an": "Netherlands Antilles",
    ".ao": "Angola",
    ".aq": "Antarctica",
    ".ar": "Argentina",
    ".as": "American Samoa",
    ".at": "Austria",
    ".au": "Australia",
    ".aw": "Aruba",
    ".ax": "Åland Islands",
    ".az": "Azerbaijan",
    ".ba": "Bosnia and Herzegovina",
    ".bb": "Barbados",
    ".bd": "Bangladesh",
    ".be": "Belgium",
    ".bf": "Burkina Faso",
    ".bg": "Bulgaria",
    ".bh": "Bahrain",
    ".bi": "Burundi",
    ".bj": "Benin",
    ".bm": "Bermuda",
    ".bn": "Brunei Darussalam",
    ".bo": "Bolivia",
    ".br": "Brazil",
    ".bs": "Bahamas",
    ".bt": "Bhutan",
    ".bv": "Bouvet Island",
    ".bw": "Botswana",
    ".by": "Belarus",
    ".bz": "Belize",
    ".ca": "Canada",
    ".cc": "Cocos Islands",
    ".cd": "Democratic Republic of the Congo",
    ".cf": "Central African Republic",
    ".cg": "Republic of the Congo",
    ".ch": "Switzerland",
    ".ci": "Côte d'Ivoire",
    ".ck": "Cook Islands",
    ".cl": "Chile",
    ".cm": "Cameroon",
    ".cn": "China",
    ".co": "Colombia",
    ".cr": "Costa Rica",
    ".cu": "Cuba",
    ".cv": "Cape Verde",
    ".cw": "Curaçao",
    ".cx": "Christmas Island",
    ".cy": "Cyprus",
    ".cz": "Czech Republic",
    ".de": "Germany",
    ".dj": "Djibouti",
    ".dk": "Denmark",
    ".dm": "Dominica",
    ".do": "Dominican Republic",
    ".dz": "Algeria",
    ".ec": "Ecuador",
    ".ee": "Estonia",
    ".eg": "Egypt",
    ".er": "Eritrea",
    ".es": "Spain",
    ".et": "Ethiopia",
    ".eu": "European Union",
    ".fi": "Finland",
    ".fj": "Fiji",
    ".fk": "Falkland Islands",
    ".fm": "Federated States of Micronesia",
    ".fo": "Faroe Islands",
    ".fr": "France",
    ".ga": "Gabon",
    ".gb": "United Kingdom",
    ".gd": "Grenada",
    ".ge": "Georgia",
    ".gf": "French Guiana",
    ".gg": "Guernsey",
    ".gh": "Ghana",
    ".gi": "Gibraltar",
    ".gl": "Greenland",
    ".gm": "Gambia",
    ".gn": "Guinea",
    ".gp": "Guadeloupe",
    ".gq": "Equatorial Guinea",
    ".gr": "Greece",
    ".gs": "South Georgia and the South Sandwich Islands",
    ".gt": "Guatemala",
    ".gu": "Guam",
    ".gw": "Guinea-Bissau",
    ".gy": "Guyana",
    ".hk": "Hong Kong",
    ".hm": "Heard Island and McDonald Islands",
    ".hn": "Honduras",
    ".hr": "Croatia",
    ".ht": "Haiti",
    ".hu": "Hungary",
    ".id": "Indonesia",
    ".ie": "Ireland",
    ".il": "Israel",
    ".im": "Isle of Man",
    ".in": "India",
    ".io": "British Indian Ocean Territory",
    ".iq": "Iraq",
    ".ir": "Iran",
    ".is": "Iceland",
    ".it": "Italy",
    ".je": "Jersey",
    ".jm": "Jamaica",
    ".jo": "Jordan",
    ".jp": "Japan",
    ".ke": "Kenya",
    ".kg": "Kyrgyzstan",
    ".kh": "Cambodia",
    ".ki": "Kiribati",
    ".km": "Comoros",
    ".kn": "Saint Kitts and Nevis",
    ".kp": "Democratic People's Republic of Korea (North Korea)",
    ".kr": "Republic of Korea (South Korea)",
    ".kw": "Kuwait",
    ".ky": "Cayman Islands",
    ".kz": "Kazakhstan",
    ".la": "Laos",
    ".lb": "Lebanon",
    ".lc": "Saint Lucia",
    ".li": "Liechtenstein",
    ".lk": "Sri Lanka",
    ".lr": "Liberia",
    ".ls": "Lesotho",
    ".lt": "Lithuania",
    ".lu": "Luxembourg",
    ".lv": "Latvia",
    ".ly": "Libya",
    ".ma": "Morocco",
    ".mc": "Monaco",
    ".md": "Moldova",
    ".me": "Montenegro",
    ".mf": "Saint Martin (French part)",
    ".mg": "Madagascar",
    ".mh": "Marshall Islands",
    ".mk": "North Macedonia",
    ".ml": "Mali",
    ".mm": "Myanmar",
    ".mn": "Mongolia",
    ".mo": "Macao",
    ".mp": "Northern Mariana Islands",
    ".mq": "Martinique",
    ".mr": "Mauritania",
    ".ms": "Montserrat",
    ".mt": "Malta",
    ".mu": "Mauritius",
    ".mv": "Maldives",
    ".mw": "Malawi",
    ".mx": "Mexico",
    ".my": "Malaysia",
    ".mz": "Mozambique",
    ".na": "Namibia",
    ".nc": "New Caledonia",
    ".ne": "Niger",
    ".nf": "Norfolk Island",
    ".ng": "Nigeria",
    ".ni": "Nicaragua",
    ".nl": "Netherlands",
    ".no": "Norway",
    ".np": "Nepal",
    ".nr": "Nauru",
    ".nu": "Niue",
    ".nz": "New Zealand",
    ".om": "Oman",
    ".pa": "Panama",
    ".pe": "Peru",
    ".pf": "French Polynesia",
    ".pg": "Papua New Guinea",
    ".ph": "Philippines",
    ".pk": "Pakistan",
    ".pl": "Poland",
    ".pm": "Saint Pierre and Miquelon",
    ".pn": "Pitcairn",
    ".pr": "Puerto Rico",
    ".ps": "Palestinian Territory",
    ".pt": "Portugal",
    ".pw": "Palau",
    ".py": "Paraguay",
    ".qa": "Qatar",
    ".re": "Réunion",
    ".ro": "Romania",
    ".rs": "Serbia",
    ".ru": "Russia",
    ".rw": "Rwanda",
    ".sa": "Saudi Arabia",
    ".sb": "Solomon Islands",
    ".sc": "Seychelles",
    ".sd": "Sudan",
    ".se": "Sweden",
    ".sg": "Singapore",
    ".sh": "Saint Helena",
    ".si": "Slovenia",
    ".sj": "Svalbard and Jan Mayen",
    ".sk": "Slovakia",
    ".sl": "Sierra Leone",
    ".sm": "San Marino",
    ".sn": "Senegal",
    ".so": "Somalia",
    ".sr": "Suriname",
    ".ss": "South Sudan",
    ".st": "São Tomé and Príncipe",
    ".sv": "El Salvador",
    ".sx": "Sint Maarten (Dutch part)",
    ".sy": "Syria",
    ".sz": "Eswatini",
    ".tc": "Turks and Caicos Islands",
    ".td": "Chad",
    ".tf": "French Southern Territories",
    ".tg": "Togo",
    ".th": "Thailand",
    ".tj": "Tajikistan",
    ".tk": "Tokelau",
    ".tl": "Timor-Leste",
    ".tm": "Turkmenistan",
    ".tn": "Tunisia",
    ".to": "Tonga",
    ".tr": "Turkey",
    ".tt": "Trinidad and Tobago",
    ".tv": "Tuvalu",
    ".tw": "Taiwan",
    ".tz": "Tanzania",
    ".ua": "Ukraine",
    ".ug": "Uganda",
    ".uk": "United Kingdom",
    ".us": "United States",
    ".uy": "Uruguay",
    ".uz": "Uzbekistan",
    ".va": "Vatican City",
    ".vc": "Saint Vincent and the Grenadines",
    ".ve": "Venezuela",
    ".vg": "British Virgin Islands",
    ".vi": "U.S. Virgin Islands",
    ".vn": "Vietnam",
    ".vu": "Vanuatu",
    ".wf": "Wallis and Futuna",
    ".ws": "Samoa",
    ".ye": "Yemen",
    ".yt": "Mayotte",
    ".za": "South Africa",
    ".zm": "Zambia",
    ".zw": "Zimbabwe"
    }
    
    for ccTLD in ccTLD_to_region:
        if primary_domain.endswith(ccTLD):
            return ccTLD_to_region[ccTLD]
    
    return "Global"

In [63]:
data["loc"] = data.domain.apply(get_url_region)
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https,short,suspects_words,use_ip,has_port,number_fragment,domain,loc
0,br-icloud.com.br,phishing,16,13,0,3,0,0,0,0,0,0,br-icloud.com.br,Brazil
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0,0,0,0,0,0,mp3raid.com,Global
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0,0,0,0,0,0,bopsecrets.org,Global
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0,0,0,1,0,0,www.garage-pirenne.be,Belgium
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0,0,0,1,0,0,adventure-nicaragua.net,Global
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0,0,0,0,0,0,xbox360.ign.com,Global
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0,1,0,0,0,0,games.teamxbox.com,Global
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0,1,0,0,0,0,www.gamespot.com,Global
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0,0,0,0,0,0,en.wikipedia.org,Global


In [64]:
data["loc"].value_counts()

Global            515626
Germany            13472
United Kingdom     11972
Canada             10851
Brazil              8758
                   ...  
Djibouti               1
Jordan                 1
Antarctica             1
Benin                  1
Cook Islands           1
Name: loc, Length: 201, dtype: int64

In [65]:
import plotly.graph_objects as go

# Count the occurrences of each region
region_counts = data["loc"].value_counts()

# Add a count for 'Global' if it doesn't exist in the DataFrame
region_counts['Global'] = 0

# Create a dataframe for the region counts
data2 = pd.DataFrame({'region': region_counts.index, 'count': region_counts.values})

# Define the colorscale for the choropleth
colorscale = 'YlOrRd'  # Change the colorscale to 'YlOrRd' for a more vibrant color scheme

# Create the Choropleth chart
fig = go.Figure(data=go.Choropleth(
    locations=data2['region'],
    z=data2['count'],
    locationmode='country names',
    colorscale=colorscale,
    autocolorscale=True,  # Set autocolorscale to True for vibrant colors
    marker_line_color='white',
    colorbar_title='Count'
))

# Set the chart title
fig.update_layout(title_text='Distribution of URL Regions')

# Display the chart
fig.show()

In [66]:
data["url_type"] = data["type"].replace({
    'benign':0,
    'defacement':1,
    'phishing':2,
    'malware':3
});
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https,short,suspects_words,use_ip,has_port,number_fragment,domain,loc,url_type
0,br-icloud.com.br,phishing,16,13,0,3,0,0,0,0,0,0,br-icloud.com.br,Brazil,2
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0,0,0,0,0,0,mp3raid.com,Global,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0,0,0,0,0,0,bopsecrets.org,Global,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0,0,0,1,0,0,www.garage-pirenne.be,Belgium,1
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0,0,0,1,0,0,adventure-nicaragua.net,Global,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0,0,0,0,0,0,xbox360.ign.com,Global,2
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0,1,0,0,0,0,games.teamxbox.com,Global,2
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0,1,0,0,0,0,www.gamespot.com,Global,2
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0,0,0,0,0,0,en.wikipedia.org,Global,2


In [67]:
import tldextract
def extract_root_domain(url):
    extracted = tldextract.extract(url)
    root_domain = extracted.domain
    return root_domain

In [68]:
data["base_root"] = data.domain.apply(extract_root_domain)
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https,short,suspects_words,use_ip,has_port,number_fragment,domain,loc,url_type,base_root
0,br-icloud.com.br,phishing,16,13,0,3,0,0,0,0,0,0,br-icloud.com.br,Brazil,2,br-icloud
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0,0,0,0,0,0,mp3raid.com,Global,0,mp3raid
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0,0,0,0,0,0,bopsecrets.org,Global,0,bopsecrets
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0,0,0,1,0,0,www.garage-pirenne.be,Belgium,1,garage-pirenne
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0,0,0,1,0,0,adventure-nicaragua.net,Global,1,adventure-nicaragua
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0,0,0,0,0,0,xbox360.ign.com,Global,2,ign
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0,1,0,0,0,0,games.teamxbox.com,Global,2,teamxbox
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0,1,0,0,0,0,www.gamespot.com,Global,2,gamespot
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0,0,0,0,0,0,en.wikipedia.org,Global,2,wikipedia


# Encoding

In [69]:
import hashlib
def encode_cat(cat):
    hash_object = hashlib.md5(cat.encode())
    return int(hash_object.hexdigest(), 16) % (10 ** 8)

In [70]:
data["loc_enc"] = data["loc"].apply(encode_cat)
data["base_root_enc"] = data.base_root.apply(encode_cat)
data

Unnamed: 0,url,type,length,letters,numbers,n_special_char,https,short,suspects_words,use_ip,has_port,number_fragment,domain,loc,url_type,base_root,loc_enc,base_root_enc
0,br-icloud.com.br,phishing,16,13,0,3,0,0,0,0,0,0,br-icloud.com.br,Brazil,2,br-icloud,27739261,1310791
1,mp3raid.com/music/krizz_kaliko.html,benign,35,31,1,4,0,0,0,0,0,0,mp3raid.com,Global,0,mp3raid,32604616,58335668
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,26,1,5,0,0,0,0,0,0,bopsecrets.org,Global,0,bopsecrets,32604616,28611805
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,72,7,16,0,0,0,1,0,0,www.garage-pirenne.be,Belgium,1,garage-pirenne,71484583,89045308
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,222,22,13,0,0,0,1,0,0,adventure-nicaragua.net,Global,1,adventure-nicaragua,32604616,76838614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,39,33,12,6,0,0,0,0,0,0,xbox360.ign.com,Global,2,ign,32604616,47632402
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,44,36,7,8,0,1,0,0,0,0,games.teamxbox.com,Global,2,teamxbox,32604616,33842502
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,42,36,3,6,0,1,0,0,0,0,www.gamespot.com,Global,2,gamespot,32604616,61429360
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,45,39,0,6,0,0,0,0,0,0,en.wikipedia.org,Global,2,wikipedia,32604616,41540124


In [71]:
data.number_fragment.value_counts()

0    651191
Name: number_fragment, dtype: int64

In [72]:
data_total = data.drop(columns=["domain", "type", "url", "number_fragment"])
data_total

Unnamed: 0,length,letters,numbers,n_special_char,https,short,suspects_words,use_ip,has_port,loc,url_type,base_root,loc_enc,base_root_enc
0,16,13,0,3,0,0,0,0,0,Brazil,2,br-icloud,27739261,1310791
1,35,31,1,4,0,0,0,0,0,Global,0,mp3raid,32604616,58335668
2,31,26,1,5,0,0,0,0,0,Global,0,bopsecrets,32604616,28611805
3,88,72,7,16,0,0,0,1,0,Belgium,1,garage-pirenne,71484583,89045308
4,235,222,22,13,0,0,0,1,0,Global,1,adventure-nicaragua,32604616,76838614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651186,39,33,12,6,0,0,0,0,0,Global,2,ign,32604616,47632402
651187,44,36,7,8,0,1,0,0,0,Global,2,teamxbox,32604616,33842502
651188,42,36,3,6,0,1,0,0,0,Global,2,gamespot,32604616,61429360
651189,45,39,0,6,0,0,0,0,0,Global,2,wikipedia,32604616,41540124


In [73]:
fig = px.imshow(data_total.corr(), text_auto=True)
fig.show()

# Models

In [74]:
X = data_total
y = data_total.pop("url_type")

In [75]:
X = X.drop(columns=["base_root", "loc"])

In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=42
    )

In [80]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix

from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

import plotly.express as px

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)


models = {
    "Dummy": DummyClassifier(),
    "KNN": KNeighborsClassifier(),
    # "SVM" : SVC(verbose=True),
    "Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(verbose=True),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(verbose=True)

}

for name, model in models.items():

    print(name)
    print("training")
    model.fit(X_train, y_train)
    print("predict")
    prediction = model.predict(X_test)
    print("Done")
    recall = recall_score(y_test, prediction, average="macro")
    accuracy = accuracy_score(y_test, prediction)
    f1 = f1_score(y_test, prediction, average="macro")

    print(
        "Model trained, test scores : \n recall = {} \n accuracy = {} \n f1 = {}".format(
            recall, accuracy, f1)
    )

    fig = px.imshow(confusion_matrix(y_test, prediction), text_auto=True)
    fig.show()

Dummy
training
predict
Done
Model trained, test scores : 
 recall = 0.25 
 accuracy = 0.657724098480864 
 f1 = 0.19838165442717562


KNN
training
predict
Done
Model trained, test scores : 
 recall = 0.8371560318310104 
 accuracy = 0.8826499076693911 
 f1 = 0.8484809070667956


Tree
training
predict
Done
Model trained, test scores : 
 recall = 0.868604035795646 
 accuracy = 0.8989584493064646 
 f1 = 0.8674289045550366


Random Forest
training


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


predict


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    9.2s finished


Done
Model trained, test scores : 
 recall = 0.8632509781340718 
 accuracy = 0.9044138254049302 
 f1 = 0.8718599048146195


Naive Bayes
training
predict
Done
Model trained, test scores : 
 recall = 0.3030887749806817 
 accuracy = 0.6569831501437747 
 f1 = 0.27887109607555916


MLP
training
Iteration 1, loss = 18.16071490
Iteration 2, loss = 18.11492600
Iteration 3, loss = 18.12840062
Iteration 4, loss = 18.13829117
Iteration 5, loss = 18.16110040
Iteration 6, loss = 18.11224142
Iteration 7, loss = 18.10656248
Iteration 8, loss = 18.11811126
Iteration 9, loss = 18.13327065
Iteration 10, loss = 18.10641476
Iteration 11, loss = 18.05122922
Iteration 12, loss = 18.01273508
Iteration 13, loss = 18.01846518
Iteration 14, loss = 17.97731770
Iteration 15, loss = 17.96784756
Iteration 16, loss = 17.90199419
Iteration 17, loss = 17.93558763
Iteration 18, loss = 17.93384534
Iteration 19, loss = 17.92213515
Iteration 20, loss = 17.87301998
Iteration 21, loss = 17.83479601
Iteration 22, loss = 17.86543450
Iteration 23, loss = 17.94378945
Iteration 24, loss = 17.94763727
Iteration 25, loss = 17.97661714
Iteration 26, loss = 17.94306290
Iteration 27, loss = 18.00300824
Iteration 28, loss = 18.04617698
Iteration 29, loss = 18.00783270
Iteration 30, loss = 18.01493511
Iterat