In [2]:
from typing import List, Union, Tuple, Any
import os
import csv
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

from tqdm import tqdm
import numpy as np
import pandas as pd

In [3]:
%%capture
# %%capture suppresses everything this cell would print (including the
# directory echo produced by %cd).
# Move the working directory up one level so the project-local modules below
# can be imported.
# NOTE(review): `%cd ..` is relative to the current directory, so running this
# cell a second time in the same kernel moves up one more level. Relative paths
# used later (e.g. 'baselines/results/phishing.json') also resolve against the
# directory set here.
%cd ..

# Project-local helpers used by the cells below.
from read_data import read_phishing
from url_tokenizer import url_tokenizer, flatten_url_data
from featurizer import UrlFeaturizer, SAMPLE

In [4]:
# --- Reproducibility ---------------------------------------------------------
# One seed reused for row shuffling, the train/validation split and LinearSVC.
SEED = 42
np.random.seed(SEED)

# --- Train/validation split --------------------------------------------------
# Fraction of rows held out for validation.
VAL_PROP = 0.2

# --- Column names added to the `phishing` DataFrame --------------------------
# Column holding the space-joined URL tokens.
URL_CLEANED = 'UrlCleaned'
# Prefix of the numeric feature columns (UrlFeat_0, UrlFeat_1, ...).
URL_FEAT = 'UrlFeat'

In [5]:
# Build the featurizer once; its `featurize(url)` method is called for every
# URL in the tokenization loop further down.
# NOTE(review): what SAMPLE contains and what `verbose` controls is defined in
# the local `featurizer` module, which is not visible in this notebook.
feat = UrlFeaturizer(SAMPLE, verbose=False)

In [6]:
# Load the dataset and shuffle all rows (frac=1) with a fixed seed so the row
# order is reproducible. The original row labels are kept (the index is not
# reset), which is why the displayed index is out of order.
phishing = read_phishing().sample(frac=1, random_state=SEED)
phishing

Unnamed: 0,idx,url,label
9095,19060,http://codecanyon.net/item/responsive-flipbook...,benign
21993,31958,http://tobogo.net/cdsb/board.php?board=storyan...,benign
14851,24816,http://thenextweb.com/apps/2011/07/24/the-comp...,benign
6092,16057,http://1337x.to/torrent/1160230/Learn-Wordpres...,benign
28254,38219,http://torcache.net/torrent/EF4AECEE43C5FA26E8...,benign
...,...,...,...
1321,11286,http://hubpages.com/topics/business-and-employ...,benign
34769,44734,https://twitter.com/home?status=%E3%83%8C%E3%8...,benign
28195,38160,https://medium.com/keep-learning-keep-growing/...,benign
860,860,http://florimat.com/sphere3d/webscr.php?cmd=_l...,phishing


In [7]:
# Class balance: number of rows per label.
phishing.groupby(['label']).size()

label
benign      35378
phishing     9963
dtype: int64

In [8]:
# Raw URL strings as a NumPy array, in the (shuffled) row order of `phishing`.
urls = phishing['url'].to_numpy()
urls

array(['http://codecanyon.net/item/responsive-flipbook-wordpress-plugin/full_screen_preview/2372863',
       'http://tobogo.net/cdsb/board.php?board=storyani&bm=view&no=77&category=&auth=&page=1&search=&keyword=&recom=',
       'http://thenextweb.com/apps/2011/07/24/the-complete-list-of-top-instagram-apps/pictarine/gtm.start',
       ...,
       'https://medium.com/keep-learning-keep-growing/how-do-i-know-if-i-should-take-a-job-at-a-startup-9e81c7a182af?source=top-stories',
       'http://florimat.com/sphere3d/webscr.php?cmd=_login-run&dispatch=5885d80a13c0db1f1ff80d546411d7f84f1036d8f209d3d19ebb6f4eeec8bd0ea2c981a1ba041f14509654c0a1ca8fefa2c981a1ba041f14509654c0a1ca8fef',
       'http://sfglobe.com/2015/05/04/a-radio-show-surprises-this-mom-after-he-son-passed-away/'],
      dtype=object)

In [9]:
# Tokenize and featurize every URL.
#
# Both result lists must stay index-aligned with `urls` (and therefore with
# the rows of `phishing`), because they are attached to the DataFrame
# positionally in a later cell. Every iteration therefore appends exactly one
# entry to each list, whether or not processing succeeded.
urls_tokenized = []
urls_feat = []
failed_urls = []  # (url, exception) pairs, kept for later inspection

for url in tqdm(urls):
    # Fallbacks used when processing fails part-way through.
    vec = None
    tokens = 'error'
    try:
        vec, _ = feat.featurize(url)

        url_data = url_tokenizer(url)
        words = flatten_url_data(url_data)
        tokens = ' '.join(words)
    except Exception as exc:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still stop the loop, and remember what failed instead of hiding it.
        failed_urls.append((url, exc))

    urls_feat.append(vec)
    urls_tokenized.append(tokens)

# URLs whose featurization itself failed have no vector yet. Give them an
# all-zero vector shaped like a successful one so the list can still be
# stacked into a rectangular array with one row per URL.
missing_idx = [idx for idx, vec in enumerate(urls_feat) if vec is None]
if missing_idx:
    template = next((vec for vec in urls_feat if vec is not None), None)
    if template is None:
        raise RuntimeError('Featurization failed for every URL; nothing to train on.')
    placeholder = np.zeros_like(np.asarray(template))
    for idx in missing_idx:
        urls_feat[idx] = placeholder

if failed_urls:
    print(f'{len(failed_urls)} URL(s) could not be fully processed; see `failed_urls`.')

print(urls_tokenized[5])
print(len(urls_tokenized))

100%|██████████| 45341/45341 [00:43<00:00, 1034.69it/s]

http ki en th uc net vn diem thi diem chuan dai hoc cong ngh i ep ha noi nam 2014 482421 html
45341





In [10]:
# Stack the per-URL feature vectors into a single array (one row per URL, one
# column per feature) and show its shape. Note that this rebinds `urls_feat`
# from a list to an ndarray.
urls_feat = np.array(urls_feat)
urls_feat.shape

(45341, 11)

In [11]:
# Attach the derived data to `phishing` (the DataFrame is modified in place):
# one text column with the space-joined tokens, plus one numeric column per
# featurizer output, named UrlFeat_0, UrlFeat_1, ...
NUM_FEATS = urls_feat.shape[1]

phishing[URL_CLEANED] = urls_tokenized
# Iterating over the transpose yields the feature matrix column by column.
for col_idx, col_values in enumerate(urls_feat.T):
    phishing[f'{URL_FEAT}_{col_idx}'] = col_values
phishing

Unnamed: 0,idx,url,label,UrlCleaned,UrlFeat_0,UrlFeat_1,UrlFeat_2,UrlFeat_3,UrlFeat_4,UrlFeat_5,UrlFeat_6,UrlFeat_7,UrlFeat_8,UrlFeat_9,UrlFeat_10
9095,19060,http://codecanyon.net/item/responsive-flipbook...,benign,http code canyon net item responsive flip book...,0,2,0,0,0,10,1,0,7,0,7
21993,31958,http://tobogo.net/cdsb/board.php?board=storyan...,benign,http to bogo net cdsb board php board story an...,0,2,0,0,0,3,1,0,0,3,3
14851,24816,http://thenextweb.com/apps/2011/07/24/the-comp...,benign,http the next web com apps 2011 07 24 the comp...,0,3,0,0,0,19,1,0,8,0,8
6092,16057,http://1337x.to/torrent/1160230/Learn-Wordpres...,benign,http 1337 x to torrent 1160230 learn wordpress...,0,2,0,0,0,12,0,0,7,0,7
28254,38219,http://torcache.net/torrent/EF4AECEE43C5FA26E8...,benign,http tor cache net torrent ef 4 a e cee 43 c 5...,0,2,0,0,0,23,1,0,22,0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1321,11286,http://hubpages.com/topics/business-and-employ...,benign,http hub pages com topics business and employm...,0,2,0,0,0,8,1,0,3,0,3
34769,44734,https://twitter.com/home?status=%E3%83%8C%E3%8...,benign,https twitter com home status,1,1,0,0,0,1,1,0,0,0,0
28195,38160,https://medium.com/keep-learning-keep-growing/...,benign,https medium com keep learning keep growing ho...,1,1,0,0,0,25,1,0,7,0,7
860,860,http://florimat.com/sphere3d/webscr.php?cmd=_l...,phishing,http flor im at com sphere 3 d webs cr php cmd...,0,3,0,0,0,6,1,0,1,75,76


In [12]:
# Names of the numeric feature columns in `phishing`:
# UrlFeat_0 ... UrlFeat_{NUM_FEATS - 1}.
extra_features = [f'{URL_FEAT}_{i}' for i in range(NUM_FEATS)]

In [13]:
def transform_func(x):
    """Return ``log(x + 2)`` element-wise.

    The value is computed as ``log1p(x + 1)``, i.e. the input is shifted by
    one before ``log1p`` adds another one.

    NOTE(review): the double shift may be unintentional (plain ``log1p(x)``
    already maps 0 to 0) -- confirm before relying on it.
    """
    shifted = x + 1
    return np.log1p(shifted)


def fit_transform_nb(df):
    """Fit the Naive Bayes preprocessing on ``df`` and transform ``df`` with it.

    Only the tokenized-URL column (``URL_CLEANED``) is used: it is converted
    to word-level TF-IDF features with accents stripped. No other column is
    listed in the transformer, so the numeric feature columns do not reach
    the output matrix.

    Args:
        df: DataFrame containing at least the ``URL_CLEANED`` column.

    Returns:
        A ``(matrix, column_trans)`` tuple: the transformed feature matrix
        and the fitted ``ColumnTransformer`` (reuse it to transform
        validation data).
    """
    word_tfidf = TfidfVectorizer(analyzer='word', strip_accents='unicode')
    steps = [
        ('tfidf_word', word_tfidf, URL_CLEANED),
        # Currently disabled numeric-feature step:
        # ('feat', FunctionTransformer(transform_func), extra_features)
    ]
    column_trans = ColumnTransformer(steps)
    # fit() returns the transformer itself, so the calls can be chained.
    features = column_trans.fit(df).transform(df)
    return features, column_trans

In [14]:
# Model inputs (token text plus numeric feature columns) and the target label.
X = phishing[[URL_CLEANED, *extra_features]]
y = phishing['label']

# Hold out VAL_PROP of the rows for validation; seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_PROP, random_state=SEED
)

# Fit the preprocessing on the training split only; `trans` is reused later
# to transform the validation split.
X_train_pre, trans = fit_transform_nb(X_train)

In [15]:
# Inspect the preprocessed training matrix (type, shape, number of stored values).
X_train_pre

<36272x42148 sparse matrix of type '<class 'numpy.float64'>'
	with 584520 stored elements in Compressed Sparse Row format>

# Naive Bayes

### Train the model

In [16]:
# Multinomial Naive Bayes on the TF-IDF matrix.
alpha = 0.6  # additive smoothing strength

# fit_prior=False: use a uniform class prior instead of learning it from the
# (imbalanced) label frequencies.
model = MultinomialNB(alpha=alpha, fit_prior=False)
model = model.fit(X_train_pre, y_train)

### Test on train data

In [17]:
# Predict on the same matrix the model was fitted on. This is a sanity check /
# overfitting indicator, not an estimate of generalization.
preds_train = model.predict(X_train_pre)

# Macro F1 averages the per-class F1 scores with equal weight per class.
f1_train = f1_score(y_train, preds_train, average='macro')
acc_train = accuracy_score(y_train, preds_train)
print(f'F1  = {f1_train}\nAcc = {acc_train}')

F1  = 0.9888051096008137
Acc = 0.9922805469783855


### Test on val data

In [18]:
# Transform the validation split with the transformer fitted on the training
# split (transform only, no refit), then predict.
X_val_pre = trans.transform(X_val)
preds_val = model.predict(X_val_pre)

# Kept under model-specific names because the final cell writes them to JSON.
f1_val_nb = f1_score(y_val, preds_val, average='macro')
acc_val_nb = accuracy_score(y_val, preds_val)
print(f'F1  = {f1_val_nb}\nAcc = {acc_val_nb}')

F1  = 0.9821190745484847
Acc = 0.9877605028117764


# Logistic Regression

In [19]:
def id_func(x):
    """Identity function: return ``x`` unchanged.

    Passed to ``FunctionTransformer`` so the numeric feature columns go
    through the ``ColumnTransformer`` as they are.
    """
    return x


def fit_transform_lr(df):
    """Fit the Logistic Regression preprocessing on ``df`` and transform it.

    Two groups of columns are combined:

    * ``URL_CLEANED`` -> word-level TF-IDF features (accents stripped);
    * ``extra_features`` -> passed through unchanged via ``id_func``.

    NOTE(review): the numeric columns are not scaled here, so they can be on
    a very different scale from the TF-IDF values.

    Args:
        df: DataFrame with the ``URL_CLEANED`` and ``extra_features`` columns.

    Returns:
        A ``(matrix, column_trans)`` tuple: the transformed feature matrix
        and the fitted ``ColumnTransformer``.
    """
    numeric_passthrough = FunctionTransformer(id_func)
    word_tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word')

    column_trans = ColumnTransformer([
        ('tfidf_word', word_tfidf, URL_CLEANED),
        ('feat', numeric_passthrough, extra_features),
    ])
    # fit() returns the transformer itself, so the calls can be chained.
    mat = column_trans.fit(df).transform(df)
    return mat, column_trans

In [20]:
# Rebuild inputs/target and repeat the seeded split (same seed and proportion
# as the Naive Bayes section, so the row partition is the same).
X = phishing[[URL_CLEANED, *extra_features]]
y = phishing['label']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_PROP, random_state=SEED
)

# This time the preprocessing also includes the numeric feature columns.
X_train_pre, trans = fit_transform_lr(X_train)

### Train the model

In [21]:
C = 0.8  # inverse regularization strength

# class_weight='balanced' reweights the classes inversely to their frequency.
# NOTE(review): the recorded output of this cell shows the solver stopping at
# max_iter without converging; scaling the numeric feature columns (as the
# warning suggests) is worth considering.
model = LogisticRegression(C=C, class_weight='balanced', max_iter=1200)
model = model.fit(X_train_pre, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Test on train data

In [22]:
# Predict on the training matrix itself (sanity check / overfitting indicator).
preds_train = model.predict(X_train_pre)

# Macro F1 weights both classes equally regardless of their size.
f1_train = f1_score(y_train, preds_train, average='macro')
acc_train = accuracy_score(y_train, preds_train)
print(f'F1  = {f1_train}\nAcc = {acc_train}')

F1  = 0.9954476412182975
Acc = 0.9968846493162771


### Test on val data

In [23]:
# Transform the validation split with the already-fitted transformer
# (no refit), then predict.
X_val_pre = trans.transform(X_val)
preds_val = model.predict(X_val_pre)

# Kept under model-specific names because the final cell writes them to JSON.
f1_val_lr = f1_score(y_val, preds_val, average='macro')
acc_val_lr = accuracy_score(y_val, preds_val)
print(f'F1  = {f1_val_lr}\nAcc = {acc_val_lr}')

F1  = 0.9902692436998835
Acc = 0.9933840555739332


# Support Vector Machine

In [24]:
def id_func(x):
    """Identity function: return ``x`` unchanged.

    Re-declares the ``id_func`` already defined in the Logistic Regression
    section with the same body; used by ``FunctionTransformer`` to pass the
    numeric feature columns through as they are.
    """
    return x


def fit_transform_svm(df):
    """Fit the SVM preprocessing on ``df`` and return the transformed data.

    The pipeline joins word-level TF-IDF features of the ``URL_CLEANED``
    column (accents stripped) with the ``extra_features`` columns, which are
    forwarded unchanged through an identity ``FunctionTransformer``.

    Args:
        df: DataFrame with the ``URL_CLEANED`` and ``extra_features`` columns.

    Returns:
        A ``(matrix, column_trans)`` tuple: the transformed feature matrix
        and the fitted ``ColumnTransformer`` (reuse it for validation data).
    """
    tfidf_step = (
        'tfidf_word',
        TfidfVectorizer(analyzer='word', strip_accents='unicode'),
        URL_CLEANED,
    )
    numeric_step = ('feat', FunctionTransformer(id_func), extra_features)

    column_trans = ColumnTransformer([tfidf_step, numeric_step])
    fitted = column_trans.fit(df)  # fit() returns the same transformer object
    mat = fitted.transform(df)
    return mat, column_trans

In [25]:
# Rebuild inputs/target and repeat the seeded split (same seed and proportion
# as the previous sections, so the row partition is the same).
X = phishing[[URL_CLEANED, *extra_features]]
y = phishing['label']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_PROP, random_state=SEED
)

# TF-IDF text features plus the unchanged numeric feature columns.
X_train_pre, trans = fit_transform_svm(X_train)

### Train the model

In [26]:
# Linear SVM; seeded for reproducibility, with a raised iteration cap.
model = svm.LinearSVC(max_iter=5000, random_state=SEED)
model = model.fit(X_train_pre, y_train)



### Test on train data

In [27]:
# Predict on the training matrix itself (sanity check / overfitting indicator).
preds_train = model.predict(X_train_pre)

# Macro F1 weights both classes equally regardless of their size.
f1_train = f1_score(y_train, preds_train, average='macro')
acc_train = accuracy_score(y_train, preds_train)
print(f'F1  = {f1_train}\nAcc = {acc_train}')

F1  = 0.9997589681796709
Acc = 0.9998345831495369


### Test on val data

In [28]:
# Transform the validation split with the already-fitted transformer
# (no refit), then predict.
X_val_pre = trans.transform(X_val)
preds_val = model.predict(X_val_pre)

# Kept under model-specific names because the final cell writes them to JSON.
f1_val_svm = f1_score(y_val, preds_val, average='macro')
acc_val_svm = accuracy_score(y_val, preds_val)
print(f'F1  = {f1_val_svm}\nAcc = {acc_val_svm}')

F1  = 0.994986223836815
Acc = 0.9965817620465321


# Write the validation metrics to disk

In [42]:
# Validation metrics (macro F1 and accuracy) of the three baselines, keyed by
# model abbreviation.
accs = {
    'NB': {'F1': f1_val_nb, 'Val Acc': acc_val_nb},
    'LR': {'F1': f1_val_lr, 'Val Acc': acc_val_lr},
    'SVM': {'F1': f1_val_svm, 'Val Acc': acc_val_svm}
}

# The path is relative to the current working directory. Create the target
# directory first so a fresh checkout does not fail with FileNotFoundError.
results_path = 'baselines/results/phishing.json'
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, 'w') as f:
    # json.dump streams straight into the file; the content is the same as
    # writing json.dumps(accs).
    json.dump(accs, f)