# Baselines


# Constants

Here, you can alter the dataset, maximum number of samples to use, etc.

In [1]:
DMOZ, ILP, PHISHING = 'dmoz', 'ilp', 'phishing'

SEED = 42

# Setup


In [2]:
GITHUB_TOKEN = 'fe2e680f071553cddb5f698cc58373a5106380d4'
command = f'git clone --depth 1 https://{GITHUB_TOKEN}@github.com/shmulvad/nlp-project.git'
!{command}

%cd nlp-project/src

fatal: destination path 'nlp-project' already exists and is not an empty directory.
/content/nlp-project/src


In [3]:
!pip install -r requirements.txt



In [4]:
# DMOZ, ILP and original phishing dataset - datasets.pkl
!gdown --id 1WV1JSevCnaWY0-mqQMmtOEFSC3Y_Qdg_

Downloading...
From: https://drive.google.com/uc?id=1WV1JSevCnaWY0-mqQMmtOEFSC3Y_Qdg_
To: /content/nlp-project/src/datasets.pkl
99.4MB [00:00, 240MB/s]


In [5]:
import os
import shutil
import random
import pickle
import gc
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,classification_report
from sklearn.model_selection import train_test_split

from pprint import pprint
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from url_tokenizer import url_tokenizer, flatten_url_data
np.random.seed(SEED)



In [6]:
with open('datasets.pkl', 'rb') as f:
  datasets = pickle.load(f)

# Preparing the Dataframe

In [9]:
def get_clean_url(url):
  return ' '.join(flatten_url_data(url_tokenizer(url)))


url_regex = re.compile(r'''
        (https?):\/\/                                   # http s
        ([-a-zA-Z0-9@:%._\+~#=]+\.[a-zA-Z0-9()]{1,12})  # domains
        \b
        ([-a-zA-Z0-9()@:%_\+;.~#&//=]*)                 # path
        \??
        ([-a-zA-Z0-9()@:%_\+;.~#&//=?]*)                # args
    ''', re.DOTALL | re.VERBOSE)

def valid_url(url):
  match = url_regex.match(url.lower())
  if match:
    return True
  else:
    return False


def prepare_df(dataset,DATASET,PHISHING_EXTRA = False):
  if DATASET == DMOZ:
    df = dataset.sample(n = 100000)
  else:
    df = dataset.copy()

  if DATASET == PHISHING and PHISHING_EXTRA:
    phishing_extra = pd.read_csv("http://data.phishtank.com/data/online-valid.csv")
    phishing_extra = phishing_extra[['phish_id','url']]
    phishing_extra['label'] = phishing_extra.phish_id.apply(lambda x:'phishing')
    phishing_extra.columns = df.columns
    df = pd.concat([df,phishing_extra])

  df.reset_index(drop = True, inplace = True)
  df['valid'] = df.url.apply(lambda x:valid_url(x))
  df = df[df['valid']==True]
  df.drop(['valid'],axis=1,inplace=True)
  
  le = LabelEncoder()
  labels = le.fit_transform(df.label)
  df['clean_url'] = df.url.apply(lambda x:get_clean_url(x))

  vectorizer = CountVectorizer(max_features = 5000)
  X =vectorizer.fit_transform(df['clean_url'])
  df = None
  X = X.toarray()

  return X,labels

# Testing the generated Features

## Phishing

In [10]:
DATASET = PHISHING
PHISHING_EXTRA = False   #Set to true if you want to include phishtank data 

dataset = datasets[DATASET]
X,labels = prepare_df(dataset,DATASET)

In [11]:
X_train,X_test,Y_train,Y_test=train_test_split(X,labels,test_size=0.2)

### Random Forest

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)
Y_RF_pred=rf.predict(X_test)
print(classification_report(Y_test,Y_RF_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      7051
           1       0.99      0.97      0.98      2018

    accuracy                           0.99      9069
   macro avg       0.99      0.98      0.99      9069
weighted avg       0.99      0.99      0.99      9069



### Logistic Regression

In [None]:
lr = LogisticRegression()
lr.fit(X_train,Y_train)
Y_LR_pred=lr.predict(X_test)
print(classification_report(Y_test,Y_LR_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7051
           1       0.99      0.98      0.98      2018

    accuracy                           0.99      9069
   macro avg       0.99      0.99      0.99      9069
weighted avg       0.99      0.99      0.99      9069



### Naive Bayes

In [None]:
nb = MultinomialNB()
nb.fit(X_train,Y_train)
Y_NB_pred=nb.predict(X_test)
print(classification_report(Y_test,Y_NB_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7051
           1       0.97      0.96      0.96      2018

    accuracy                           0.98      9069
   macro avg       0.98      0.97      0.98      9069
weighted avg       0.98      0.98      0.98      9069



## Phishing with Phistank

In [18]:
DATASET = PHISHING
PHISHING_EXTRA = True   #Set to true if you want to include phishtank data 

dataset = datasets[DATASET]
X,labels = prepare_df(dataset,DATASET,PHISHING_EXTRA)

In [37]:
X_train,X_test,Y_train,Y_test=train_test_split(X,labels,test_size=0.2)

### Random Forest

In [38]:
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)
Y_RF_pred=rf.predict(X_test)
print(classification_report(Y_test,Y_RF_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      7073
           1       0.99      0.99      0.99      4021

    accuracy                           0.99     11094
   macro avg       0.99      0.99      0.99     11094
weighted avg       0.99      0.99      0.99     11094



In [51]:
ph_e_rf_f1 = f1_score(Y_test,Y_RF_pred,average='weighted')

In [52]:
ph_e_f1

0.9942332711033935

### Logistic Regression

In [39]:
lr = LogisticRegression()
lr.fit(X_train,Y_train)
Y_LR_pred=lr.predict(X_test)
print(classification_report(Y_test,Y_LR_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      7073
           1       0.99      0.99      0.99      4021

    accuracy                           0.99     11094
   macro avg       0.99      0.99      0.99     11094
weighted avg       0.99      0.99      0.99     11094



In [57]:
ph_e_lr_f1 = f1_score(Y_test,Y_LR_pred,average='weighted')

In [58]:
ph_e_lr_f1

0.9926050234223613

### Naive Bayes

In [40]:
nb = MultinomialNB()
nb.fit(X_train,Y_train)
Y_NB_pred=nb.predict(X_test)
print(classification_report(Y_test,Y_NB_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      7073
           1       0.98      0.94      0.96      4021

    accuracy                           0.97     11094
   macro avg       0.97      0.97      0.97     11094
weighted avg       0.97      0.97      0.97     11094



In [59]:
ph_e_nb_f1 = f1_score(Y_test,Y_NB_pred,average='weighted')

In [60]:
ph_e_nb_f1

0.9708690055984058

## DMOZ

In [20]:
DATASET = DMOZ

dataset = datasets[DATASET]
X,labels = prepare_df(dataset,DATASET)

In [13]:
X_train,X_test,Y_train,Y_test=train_test_split(X,labels,test_size=0.2)

### Random Forest

In [15]:
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)
Y_RF_pred=rf.predict(X_test)
print(classification_report(Y_test,Y_RF_pred))

              precision    recall  f1-score   support

           0       0.62      0.46      0.53       456
           1       0.48      0.54      0.51      3228
           2       0.41      0.60      0.49      3032
           3       0.40      0.32      0.36      1461
           4       0.59      0.47      0.52       735
           5       0.42      0.27      0.33       773
           6       0.58      0.40      0.47       358
           7       0.37      0.21      0.27       564
           8       0.35      0.18      0.23       114
           9       0.35      0.25      0.29      1377
          10       0.45      0.43      0.44       749
          11       0.47      0.42      0.44      1459
          12       0.30      0.22      0.26      1296
          13       0.45      0.54      0.49      3118
          14       0.63      0.50      0.56      1280

    accuracy                           0.45     20000
   macro avg       0.46      0.39      0.41     20000
weighted avg       0.45   

In [16]:
dmoz_rf_f1 = f1_score(Y_test,Y_RF_pred,average='weighted')

In [17]:
dmoz_rf_f1

0.44085661314455726

In [18]:
gc.collect()

195

### Logistic Regression

In [18]:
lr = LogisticRegression()
lr.fit(X_train,Y_train)
Y_LR_pred=lr.predict(X_test)
print(classification_report(Y_test,Y_LR_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.79      0.47      0.59       456
           1       0.52      0.60      0.56      3228
           2       0.40      0.74      0.52      3032
           3       0.49      0.35      0.41      1461
           4       0.75      0.48      0.58       735
           5       0.63      0.30      0.41       773
           6       0.88      0.40      0.55       358
           7       0.45      0.24      0.31       564
           8       0.50      0.14      0.22       114
           9       0.48      0.32      0.39      1377
          10       0.49      0.45      0.47       749
          11       0.60      0.43      0.50      1459
          12       0.43      0.25      0.31      1296
          13       0.51      0.60      0.55      3118
          14       0.74      0.54      0.62      1280

    accuracy                           0.50     20000
   macro avg       0.58      0.42      0.47     20000
weighted avg       0.53   

In [19]:
dmoz_lr_f1 = f1_score(Y_test,Y_LR_pred,average='weighted')

In [20]:
dmoz_lr_f1

0.49568321349876643

In [21]:
gc.collect()

176

### Naive Bayes

In [14]:
nb = MultinomialNB()
nb.fit(X_train,Y_train)
Y_NB_pred=nb.predict(X_test)
print(classification_report(Y_test,Y_NB_pred))

              precision    recall  f1-score   support

           0       0.79      0.46      0.58       456
           1       0.51      0.56      0.53      3228
           2       0.40      0.74      0.51      3032
           3       0.49      0.33      0.39      1461
           4       0.72      0.50      0.59       735
           5       0.61      0.30      0.41       773
           6       0.86      0.39      0.54       358
           7       0.50      0.19      0.27       564
           8       0.33      0.03      0.05       114
           9       0.42      0.35      0.38      1377
          10       0.43      0.46      0.45       749
          11       0.54      0.42      0.47      1459
          12       0.38      0.27      0.32      1296
          13       0.51      0.55      0.53      3118
          14       0.70      0.49      0.58      1280

    accuracy                           0.49     20000
   macro avg       0.55      0.40      0.44     20000
weighted avg       0.51   

In [15]:
dmoz_nb_f1 = f1_score(Y_test,Y_NB_pred,average='weighted')

In [16]:
dmoz_nb_f1

0.47854398557642014

In [17]:
gc.collect()

53

## ILP

In [22]:
DATASET = ILP

dataset = datasets[DATASET]
X,labels = prepare_df(dataset,DATASET)

In [36]:
X_train,X_test,Y_train,Y_test=train_test_split(X,labels,test_size=0.2)

### Random Forest

In [37]:
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)
Y_RF_pred=rf.predict(X_test)
print(classification_report(Y_test,Y_RF_pred))

              precision    recall  f1-score   support

           0       0.75      0.75      0.75       186
           1       0.29      0.49      0.37        39
           2       0.72      0.61      0.66       213
           3       0.85      0.82      0.83       737
           4       0.55      0.46      0.51        99
           5       0.40      0.15      0.22        39
           6       0.70      0.86      0.77       333

    accuracy                           0.75      1646
   macro avg       0.61      0.59      0.59      1646
weighted avg       0.75      0.75      0.74      1646



In [38]:
ilp_rf_f1 = f1_score(Y_test,Y_RF_pred,average='weighted')

In [43]:
ilp_rf_f1

0.7427429488024888

### Logistic Regression

In [39]:
lr = LogisticRegression()
lr.fit(X_train,Y_train)
Y_LR_pred=lr.predict(X_test)
print(classification_report(Y_test,Y_LR_pred))

              precision    recall  f1-score   support

           0       0.82      0.75      0.78       186
           1       0.54      0.18      0.27        39
           2       0.67      0.64      0.65       213
           3       0.79      0.88      0.83       737
           4       0.58      0.38      0.46        99
           5       0.83      0.13      0.22        39
           6       0.69      0.77      0.72       333

    accuracy                           0.75      1646
   macro avg       0.70      0.53      0.56      1646
weighted avg       0.74      0.75      0.73      1646



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [40]:
ilp_lr_f1 = f1_score(Y_test,Y_LR_pred,average='weighted')

In [44]:
ilp_lr_f1

0.7327202128782889

### Naive Bayes

In [41]:
nb = MultinomialNB()
nb.fit(X_train,Y_train)
Y_NB_pred=nb.predict(X_test)
print(classification_report(Y_test,Y_NB_pred))

              precision    recall  f1-score   support

           0       0.68      0.56      0.62       186
           1       1.00      0.05      0.10        39
           2       0.65      0.59      0.62       213
           3       0.68      0.86      0.76       737
           4       0.69      0.25      0.37        99
           5       0.00      0.00      0.00        39
           6       0.61      0.60      0.60       333

    accuracy                           0.66      1646
   macro avg       0.62      0.42      0.44      1646
weighted avg       0.65      0.66      0.64      1646



  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
ilp_nb_f1 = f1_score(Y_test,Y_NB_pred,average='weighted')

In [45]:
ilp_nb_f1

0.6374377151945539