In [1]:
from typing import List, Union, Tuple, Any
import os
import csv
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
import pickle

with open('phishing_extra.pkl', 'rb') as f:
    extra = pickle.load(f)

In [3]:
%%capture
# 'capture' silences output.
# The below cd is necessary to import local functions in parent directory
%cd ..

from read_data import read_phishing
from url_tokenizer import url_tokenizer, flatten_url_data
from featurizer import UrlFeaturizer, SAMPLE

In [4]:
URL_CLEANED = 'UrlCleaned'
URL_FEAT = 'UrlFeat'

VAL_PROP = 0.2

SEED = 42  # for reproducibility
np.random.seed(SEED)

In [5]:
feat = UrlFeaturizer(SAMPLE, verbose=False)

In [10]:
phishing = read_phishing().sample(frac=1, random_state=SEED)
phishing

Unnamed: 0,idx,url,label
9095,19060,http://codecanyon.net/item/responsive-flipbook...,benign
21993,31958,http://tobogo.net/cdsb/board.php?board=storyan...,benign
14851,24816,http://thenextweb.com/apps/2011/07/24/the-comp...,benign
6092,16057,http://1337x.to/torrent/1160230/Learn-Wordpres...,benign
28254,38219,http://torcache.net/torrent/EF4AECEE43C5FA26E8...,benign
...,...,...,...
1321,11286,http://hubpages.com/topics/business-and-employ...,benign
34769,44734,https://twitter.com/home?status=%E3%83%8C%E3%8...,benign
28195,38160,https://medium.com/keep-learning-keep-growing/...,benign
860,860,http://florimat.com/sphere3d/webscr.php?cmd=_l...,phishing


In [11]:
extra

Unnamed: 0,idx,url,label
0,0,https://68d1afd79d3880800.temporary.link/DZFE6...,phishing
1,1,http://bit.do//track-package,phishing
2,2,https://www.amazon-check-co-jp.n2k.top/?ord8hf...,phishing
3,3,http://docomonjo.com,phishing
4,4,http://stripcoach.ml/ncv/login.php?cmd=login_s...,phishing
...,...,...,...
11696,11696,http://gkjx168.com/images,phishing
11697,11697,http://www.habbocreditosparati.blogspot.com/,phishing
11698,11698,http://creditiperhabbogratissicuro100.blogspot...,phishing
11699,11699,http://mundovirtualhabbo.blogspot.com/2009_01_...,phishing


In [12]:
phishing = pd.concat([phishing, extra])
phishing

Unnamed: 0,idx,url,label
9095,19060,http://codecanyon.net/item/responsive-flipbook...,benign
21993,31958,http://tobogo.net/cdsb/board.php?board=storyan...,benign
14851,24816,http://thenextweb.com/apps/2011/07/24/the-comp...,benign
6092,16057,http://1337x.to/torrent/1160230/Learn-Wordpres...,benign
28254,38219,http://torcache.net/torrent/EF4AECEE43C5FA26E8...,benign
...,...,...,...
11696,11696,http://gkjx168.com/images,phishing
11697,11697,http://www.habbocreditosparati.blogspot.com/,phishing
11698,11698,http://creditiperhabbogratissicuro100.blogspot...,phishing
11699,11699,http://mundovirtualhabbo.blogspot.com/2009_01_...,phishing


In [13]:
phishing.groupby(['label']).size()

label
benign      35378
phishing    21664
dtype: int64

In [14]:
urls = phishing['url'].to_numpy()
urls

array(['http://codecanyon.net/item/responsive-flipbook-wordpress-plugin/full_screen_preview/2372863',
       'http://tobogo.net/cdsb/board.php?board=storyani&bm=view&no=77&category=&auth=&page=1&search=&keyword=&recom=',
       'http://thenextweb.com/apps/2011/07/24/the-complete-list-of-top-instagram-apps/pictarine/gtm.start',
       ...,
       'http://creditiperhabbogratissicuro100.blogspot.com/2011/02/habbo-crediti-gratis-sicuro-100.html',
       'http://mundovirtualhabbo.blogspot.com/2009_01_01_archive.html',
       'http://aijcs.blogspot.com/2005/03/colourful-life-of-aij.html'],
      dtype=object)

In [15]:
urls_tokenized = []
urls_feat = []

for url in tqdm(urls):
    try:
        vec, _ = feat.featurize(url)
        urls_feat.append(vec)
        
        url_data = url_tokenizer(url)
        words = flatten_url_data(url_data)
        urls_tokenized.append(' '.join(words))
    except:
        urls_tokenized.append('error')
        
print(urls_tokenized[5])
print(len(urls_tokenized))

100%|██████████| 57042/57042 [00:53<00:00, 1065.98it/s]

http ki en th uc net vn diem thi diem chuan dai hoc cong ngh i ep ha noi nam 2014 482421 html
57042





In [16]:
urls_feat = np.array(urls_feat)
urls_feat.shape

(57042, 20)

In [17]:
NUM_FEATS = urls_feat.shape[1]

phishing[URL_CLEANED] = urls_tokenized
for i in range(NUM_FEATS):
    phishing[f'{URL_FEAT}_{i}'] = urls_feat[:, i]
phishing

Unnamed: 0,idx,url,label,UrlCleaned,UrlFeat_0,UrlFeat_1,UrlFeat_2,UrlFeat_3,UrlFeat_4,UrlFeat_5,...,UrlFeat_10,UrlFeat_11,UrlFeat_12,UrlFeat_13,UrlFeat_14,UrlFeat_15,UrlFeat_16,UrlFeat_17,UrlFeat_18,UrlFeat_19
9095,19060,http://codecanyon.net/item/responsive-flipbook...,benign,http code canyon net item responsive flip book...,0,2,0,0,0,10,...,7,0,14,14,70,0,0,0,0,0
21993,31958,http://tobogo.net/cdsb/board.php?board=storyan...,benign,http to bogo net cdsb board php board story an...,0,2,0,0,0,3,...,3,0,22,10,15,75,1,0,0,0
14851,24816,http://thenextweb.com/apps/2011/07/24/the-comp...,benign,http the next web com apps 2011 07 24 the comp...,0,3,0,0,0,19,...,8,0,24,14,76,0,1,0,0,0
6092,16057,http://1337x.to/torrent/1160230/Learn-Wordpres...,benign,http 1337 x to torrent 1160230 learn wordpress...,0,2,0,0,0,12,...,7,0,16,8,73,0,0,12,0,0
28254,38219,http://torcache.net/torrent/EF4AECEE43C5FA26E8...,benign,http tor cache net torrent ef 4 a e cee 43 c 5...,0,2,0,0,0,23,...,22,0,28,12,57,6,1,18,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11696,11696,http://gkjx168.com/images,phishing,http g kj x 168 com images,0,4,0,0,0,1,...,0,0,7,11,7,0,0,0,0,0
11697,11697,http://www.habbocreditosparati.blogspot.com/,phishing,http www habbo credit os para ti blogspot com,0,1,6,1,0,0,...,0,0,9,36,1,0,0,0,0,0
11698,11698,http://creditiperhabbogratissicuro100.blogspot...,phishing,http credit i per habbo gratis sic uro 100 blo...,0,1,8,0,0,10,...,12,0,21,43,45,0,1,0,0,0
11699,11699,http://mundovirtualhabbo.blogspot.com/2009_01_...,phishing,http mundo virtual habbo blogspot com 2009 01 ...,0,1,3,0,0,5,...,8,0,11,30,24,0,1,0,0,0


In [18]:
extra_features = [f'{URL_FEAT}_{i}' for i in range(NUM_FEATS)]

In [19]:
def transform_func(x):
    return np.log1p(x + 1)


def fit_transform_nb(df):
    vectorizer_0 = TfidfVectorizer(
        analyzer='word',
        strip_accents='unicode'
    )
    column_trans = ColumnTransformer([
        ('tfidf_word', vectorizer_0, URL_CLEANED),
        #('feat', FunctionTransformer(transform_func), extra_features)
    ])
    column_trans.fit(df)
    mat = column_trans.transform(df)
    return mat, column_trans

In [20]:
X, y = phishing[[URL_CLEANED, *extra_features]], phishing['label']

X_train, X_val, y_train, y_val = \
        train_test_split(X, y, test_size=VAL_PROP, random_state=SEED)


X_train_pre, trans = fit_transform_nb(X_train)

In [21]:
X_train_pre

<45633x46911 sparse matrix of type '<class 'numpy.float64'>'
	with 723265 stored elements in Compressed Sparse Row format>

In [22]:
def error_score(y_val, preds_val):
    n_wrong = np.sum(y_val != preds_val)
    return n_wrong / len(y_val)

# Logistic Regression

In [23]:
def id_func(x):
    return x


def fit_transform_lr(df):
    vectorizer_0 = TfidfVectorizer(
        analyzer='word',
        strip_accents='unicode'
    )
    column_trans = ColumnTransformer([
        ('tfidf_word', vectorizer_0, URL_CLEANED),
        ('feat', FunctionTransformer(id_func), extra_features)
    ])
    column_trans.fit(df)
    mat = column_trans.transform(df)
    return mat, column_trans

In [24]:
X, y = phishing[[URL_CLEANED, *extra_features]], phishing['label']

X_train, X_val, y_train, y_val = \
        train_test_split(X, y, test_size=VAL_PROP, random_state=SEED)


X_train_pre, trans = fit_transform_lr(X_train)

### Train the model

In [28]:
C = 0.8

model = LogisticRegression(
    class_weight='balanced', C=C, max_iter=5000
).fit(X_train_pre, y_train)

### Test on train data

In [29]:
preds_train = model.predict(X_train_pre)

f1_train = f1_score(y_train, preds_train, average='macro')
acc_train = accuracy_score(y_train, preds_train)
err_score_train = error_score(y_train, preds_train)
print(f'F1  = {f1_train}\nAcc = {acc_train}\nErr = {err_score_train}')

F1  = 0.9929674525397099
Acc = 0.9933819823373435
Err = 0.006618017662656411


# Write the data to disk

In [29]:
accs = {
    'LR': {'F1': f1_val_lr, 'Val Acc': acc_val_lr, 'Error': err_score_lr}
}

with open('baselines/results/phishing.json', 'w') as f:
    f.write(json.dumps(accs))