In [None]:
'''
Author: Kfir Cohen

An implemention of the proposed algorithm in the paper: 
"Malicious URLs Detection Using Decision Tree Classifiers and Majority Voting Technique", by D. Patil and J. Patil

I only used url features, without web-content features.The model I implemented is Random Forrest.


Link to the paper: https://www.researchgate.net/publication/324014302_Malicious_URLs_Detection_Using_Decision_Tree_Classifiers_and_Majority_Voting_Technique

Link to the Dataset: https://research.aalto.fi/en/datasets/phishstorm-phishing-legitimate-url-dataset
'''

In [1]:
from urllib.parse import urlparse
import pandas as pd
from collections import Counter
import ipaddress as ip

### Features list (url features + suspicious words) acording to the referenced paper above 

In [2]:
# Features list
url_features = ["url_len", "query_len", "tokens_num", "dots_num", "hyphens_num",\
                   "underscore_num", "equal_num", "f_slash_num", "q_mark_num", "semicolon_num", "o_parenthesis_num", \
                   "c_parenthesis_num", "mod_num", "amp_num", "at_num", "digit_num","domain_ip_addr"]

suspicious_strings = ["secure", "account", "webscr", "login", "ebayisapi", "signin", "banking",\
                     "confirm", "blog", "logon", "signon", "login.asp", "login.php", "login.htm", \
                     ".exe", ".zip", ".rar", ".jpg", ".gif", "viewer.php", "link=", "getImage.asp", \
                     "plugins", "paypal", "order", "dbsys.php", "config.bin", "download.php", ".js", \
                     "payment", "files", "css", "shopping", "mail.php", ".jar", ".swf", ".cgi", ".php",\
                     "abuse", "admin", ".bin", "personal", "update", "verification"]

dataset_file = 'urls_dataset.csv'

### Feature extraction functions

In [3]:
# Get url length.
def get_url_len(url):
    return len(url)

# Get query length.
def get_query_len(url):
    query = urlparse(url).query
    return len(query)

#Count How many tokens are in the url.
def count_tokens(url):
    return(url.count('token='))

# Get an array with the count of each charachter that was specified as suspicious.
def get_charachters_count(url):
    counter = Counter(url) # Count charachters so we won't need to read the string multiple times.
    char_list = ['.','-','_','=','/','?',';','(',')','%','&','@']
    return [counter[c] for c in char_list]

# Count how many digits are in the url.
def count_digits(url):
    return sum(c.isdigit() for c in url)

# Check if the domain is an IP Adress.
def is_domain_ip(url):
    domain = urlparse(url).netloc
    try:
        if ip.ip_address(domain):
            return True
    except:
        return False

# Get a boolean list of the presence of the suspicious words in the url. 
def find_suspicious_words(url, suspicious_words):
    existing_list = []
    for word in suspicious_words:
        if word in url:
            existing_list.append(True)
        else:
            existing_list.append(False)
    return existing_list
            

In [4]:
# Extracting all the wanted features from the given url and appendinf the label.
def get_features(url, label):
    features = []
    
    #append the url features
    features.append(get_url_len(url))
    features.append(get_query_len(url))
    features.append(count_tokens(url))
    features += get_charachters_count(url)
    features.append(count_digits(url))
    features.append(is_domain_ip(url))

    #append the suspicious words features
    features += find_suspicious_words(url, suspicious_strings)
    
    # append the label
    features.append(label)
    
    return features
    

### Loading of the data and extracting of the features

In [5]:
%%time
# Loading data and shuffle
data_set = pd.read_csv(dataset_file)
data_set = data_set.sample(frac=1).reset_index(drop=True)
data_set = data_set[data_set['label'].notna()]

# extracting featuers from all the examples in the data set
features_list = [] # Appending features to a list and convert the list to DataFrame is much fater than appending directly to the dataFrame.
for i in range(len(data_set)):
    features = get_features(data_set["url"].loc[i], data_set["label"].loc[i])
    features_list.append(features)

#Creates the features set
features_list_df = pd.DataFrame(features_list)
features_set = pd.DataFrame(features_list,columns = (url_features + ["presence of " + s for s in suspicious_strings] + ['label']))

Wall time: 10.9 s


### Initializing and training the Random Forest model

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [7]:
%%time

# Spliting data to samples and labels
X = features_set.drop(['label'],axis=1).values
y = features_set['label'].values.astype('int')

# Spliting the data to test and train data sets
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.25, random_state=0)

# Initializing the random forest model and training it.
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)


Wall time: 17.2 s


RandomForestClassifier()

### Checking accuracy 

In [8]:
score = model.score(X_test,y_test)
print(score)

0.8939541583696987
