# Encoding Word Embeddings



URLS

# Constants

Here, you can alter the dataset, maximum number of samples to use, etc.

In [None]:
# Keys of the datasets stored in datasets.pkl.
DMOZ, ILP, PHISHING = 'dmoz', 'ilp', 'phishing'

DATASET = ILP  # one of the above datasets
# Upper bound on the number of samples to use.
MAX_NUM_SAMPLES = 1_000_000

# NOTE(review): TEST_PROP, VAL_PROP and EPOCHS are not referenced by the cells
# visible in this notebook - presumably test/validation split fractions and a
# training epoch count; confirm before relying on them.
TEST_PROP = 0.2
VAL_PROP = 0.2
EPOCHS = 8

# Seed passed to np.random.seed in the import cell.
SEED = 42

## Setup


You will use the AdamW optimizer from [tensorflow/models](https://github.com/tensorflow/models).

In [None]:
GITHUB_TOKEN = 'fe2e680f071553cddb5f698cc58373a5106380d4'
command = f'git clone --depth 1 https://{GITHUB_TOKEN}@github.com/shmulvad/nlp-project.git'
!{command}

%cd nlp-project/src

Cloning into 'nlp-project'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 58 (delta 6), reused 33 (delta 3), pack-reused 0[K
Unpacking objects: 100% (58/58), done.
/content/nlp-project/src


In [None]:
# gdown is used below to download files from Google Drive by file id.
!pip install gdown

# DMOZ, ILP and original phishing dataset - datasets.pkl
!gdown --id 1WV1JSevCnaWY0-mqQMmtOEFSC3Y_Qdg_

# Print a blank separator, then list the working directory so the downloaded
# datasets.pkl can be seen next to the project sources.
print('\n')
!ls

Downloading...
From: https://drive.google.com/uc?id=1WV1JSevCnaWY0-mqQMmtOEFSC3Y_Qdg_
To: /content/nlp-project/src/datasets.pkl
99.4MB [00:01, 97.8MB/s]


 baselines		    demo.ipynb	       self_trained_embeddings.py
'Colab Notebooks'	    featurizer.py      tests
 create_fasttext_embed.py   pickle_data.py     url_tokenizer.py
 data			    read_data.py       util.py
 datasets.pkl		    requirements.txt   word_embed


In [None]:
# A dependency of the preprocessing for BERT inputs
!pip install -q tensorflow-text
# Needed for the `official.nlp.optimization` import in the next cell.
!pip install -q tf-models-official
# Project requirements from the cloned repository.
# NOTE(review): tensorflow-text and tf-models-official are installed unpinned,
# so a re-run may resolve different versions than the run shown here.
!pip install -r requirements.txt

[K     |████████████████████████████████| 3.4MB 7.8MB/s 
[K     |████████████████████████████████| 1.1MB 8.3MB/s 
[K     |████████████████████████████████| 174kB 24.4MB/s 
[K     |████████████████████████████████| 37.6MB 1.4MB/s 
[K     |████████████████████████████████| 358kB 40.7MB/s 
[K     |████████████████████████████████| 102kB 11.0MB/s 
[K     |████████████████████████████████| 51kB 5.5MB/s 
[K     |████████████████████████████████| 1.2MB 40.9MB/s 
[K     |████████████████████████████████| 706kB 34.3MB/s 
[K     |████████████████████████████████| 645kB 41.0MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
Collecting wordninja
[?25l  Downloading https://files.pythonhosted.org/packages/30/15/abe4af50f4be92b60c25e43c1c64d08453b51e46c32981d80b3aebec0260/wordninja-2.0.0.tar.gz (541kB)
[K     |████████████████████████████████| 542kB 8.4MB/s 
Collecting gensim==4.0.1
[?25l  Downloading h

In [None]:
import os
import shutil
import random
import pickle

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

from pprint import pprint
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer

import gensim
from gensim.models.keyedvectors import FastTextKeyedVectors

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
tf.get_logger().setLevel('ERROR')

# Seed every random number generator this notebook relies on, not only NumPy:
# Python's `random`, NumPy's global state (which scikit-learn falls back to
# when no random_state is given) and TensorFlow (which initialises the LSTM
# encoder weights randomly).
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

from read_data import read_all_datasets
from url_tokenizer import url_tokenizer, flatten_url_data
from featurizer import UrlFeaturizer, GLOVE, CONCEPTNET, SAMPLE



In [None]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# datasets.pkl was downloaded from Google Drive above - only load it if that
# source is trusted.
with open('datasets.pkl', 'rb') as f:
  datasets = pickle.load(f)

# Fail early, listing the available keys, if the configured dataset is missing.
assert DATASET in datasets, \
  f'You tried to load {DATASET} but only {list(datasets.keys())} available.'

# Select the configured dataset and display it as the cell output.
dataset = datasets[DATASET]
dataset

Unnamed: 0,idx,url,label,uni
6476,6476,http://www.cs.bu.edu/students/grads/dm/Home.html,student,misc
7615,7615,http://www.cs.umn.edu/Research/cpc/,project,misc
4962,4962,http://www.cs.wisc.edu/~dyer/cs540/courses.html,other,wisconsin
5089,5089,http://www.cs.wisc.edu/~greg/cs302/late.html,other,wisconsin
7102,7102,http://www.cs.washington.edu/homes/sds/,student,washington
...,...,...,...,...
2225,2225,http://www.tc.cornell.edu:80/DX/,other,cornell
8217,8217,http://www.cs.wisc.edu/~pdevries/pdevries.html,staff,wisconsin
6336,6336,http://www.cs.ucr.edu/~sparekh/,student,misc
3904,3904,http://www.cs.washington.edu/homes/marclang/re...,other,washington


In [None]:
# Download respectively model and ngrams
# (for the ILP run shown below the files are saved as embed-ilp.model and
# embed-ilp.model.vectors_ngrams.npy).
if DATASET == DMOZ:
  !gdown --id 1V8_EWQTF_JhgEVbXIvAiHViTHyIO9CBS
  !gdown --id 15EirC5KybMrG33RvYUXIfnBM2GWSBdh5
elif DATASET == ILP:
  !gdown --id 1_QafULaXKmq0H0fwY3dkR-WOKwsyUkHS
  !gdown --id 1kn4EbllSLdAX-8Ca5db6_BewLugCAWp5
else:  # Phishing
  !gdown --id 1JEsUG4eGqbIItBH468xph-AsS3iQ5Fif
  !gdown --id 10p0XPRsuTsXblQNNlUricEmd8if0B2P6

# Load the FastText keyed vectors for the selected dataset from the model
# file downloaded above.
fast_text_embedding = FastTextKeyedVectors.load(f'embed-{DATASET}.model')

Downloading...
From: https://drive.google.com/uc?id=1_QafULaXKmq0H0fwY3dkR-WOKwsyUkHS
To: /content/nlp-project/src/embed-ilp.model
100% 528k/528k [00:00<00:00, 3.22MB/s]
Downloading...
From: https://drive.google.com/uc?id=1kn4EbllSLdAX-8Ca5db6_BewLugCAWp5
To: /content/nlp-project/src/embed-ilp.model.vectors_ngrams.npy
800MB [00:17, 45.3MB/s]


# Testing different Embeddings

In [None]:
import re
import pandas as pd

from tensorflow.keras.layers import LSTM,Input
from tensorflow.keras.models import Model

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

from tqdm import tqdm

## Different approaches to Encode

#### Method 1 - Flatten

Here we simply flatten the word embeddings for each URL and append the hand-picked features.

Thus, the (31,100) embeddings are flattened to (3100,) vector.
Further appending the handpicked features makes the vector shape: (3120,)


In [None]:
# Verbose-mode pattern that splits an http(s) URL into four capture groups:
# scheme, domain, path and args. The code below only uses it as a validity
# check, and always matches against the lower-cased URL.
url_regex = re.compile(r'''
        (https?):\/\/                                   # http s
        ([-a-zA-Z0-9@:%._\+~#=]+\.[a-zA-Z0-9()]{1,12})  # domains
        \b
        ([-a-zA-Z0-9()@:%_\+;.~#&//=]*)                 # path
        \??
        ([-a-zA-Z0-9()@:%_\+;.~#&//=?]*)                # args
    ''', re.DOTALL | re.VERBOSE)

def GenerateFeatureVector_Flatten(df, UF):
    """Build flat feature vectors and one-hot labels for every valid URL.

    For each row whose lower-cased URL matches ``url_regex``, the URL is
    passed to ``UF.featurize``; element 1 of the result is flattened and
    element 0 is appended to it. Rows whose URL does not match are skipped,
    so the output may have fewer rows than ``df``.

    Args:
        df: DataFrame with the columns ``idx``, ``url``, ``uni`` and
            ``label``. It is not modified.
        UF: Featurizer exposing ``featurize(url)``.
            NOTE(review): presumably returns (hand-picked features,
            embedding matrix) - confirm against UrlFeaturizer.

    Returns:
        Tuple ``(url_features, labels)`` of NumPy arrays: one flattened
        feature vector and one one-hot label row per valid URL.
    """
    # Re-index a new frame instead of mutating the caller's DataFrame in place.
    df = df.reset_index(drop=True)
    one_hot = pd.get_dummies(data=df, columns=['label'])
    label = one_hot.drop(['idx', 'url', 'uni'], axis=1).values

    url_features = []
    labels = []

    # Iterate over the URL column directly rather than doing a row lookup
    # with df.iloc[i] on every iteration.
    for i, url in enumerate(tqdm(df.url.tolist())):
        # Validity is checked on the lower-cased URL; featurization uses the
        # original spelling.
        if not url_regex.match(url.lower()):
            continue
        features = UF.featurize(url)
        flat_embedding = features[1].flatten()
        url_features.append(np.concatenate((flat_embedding, features[0])))
        labels.append(label[i])

    return np.array(url_features), np.array(labels)

#### Encoder 2 --> Piecewise

In [None]:
# Same pattern as the one compiled in the flatten-encoder cell; it is compiled
# again here, so this cell does not depend on that one having been run.
url_regex = re.compile(r'''
        (https?):\/\/                                   # http s
        ([-a-zA-Z0-9@:%._\+~#=]+\.[a-zA-Z0-9()]{1,12})  # domains
        \b
        ([-a-zA-Z0-9()@:%_\+;.~#&//=]*)                 # path
        \??
        ([-a-zA-Z0-9()@:%_\+;.~#&//=?]*)                # args
    ''', re.DOTALL | re.VERBOSE)


def valid_url(url):
  """Return True when the lower-cased `url` matches `url_regex`, else False."""
  return url_regex.match(url.lower()) is not None

In [None]:
def _lstm_encoder(timesteps, embed_dim):
    """Build a single-layer LSTM model mapping (timesteps, embed_dim) to a 100-d vector."""
    inputs = Input(shape=(timesteps, embed_dim))
    return Model(inputs=inputs, outputs=LSTM(100)(inputs))


def GenerateFeatureVector_Enc2(dataset, UF, glove = False):
    """Encode each valid URL piecewise and return features with one-hot labels.

    The per-token embeddings of each URL are split along the token axis into
    sub-domain (tokens 0-4), main domain (5-9), domain ending (token 10),
    path (11-20) and args (21 onwards). Every multi-token part is reduced to
    a 100-d vector by an LSTM; the domain-ending vector is used as is. All
    parts are concatenated into one feature vector per URL.

    The LSTMs are created inside this function and are never trained, so
    they act as randomly initialised encoders and the features are only
    repeatable if TensorFlow's random seed is fixed beforehand.

    Args:
        dataset: DataFrame with the columns ``idx``, ``url``, ``uni`` and
            ``label``. It is not modified.
        UF: Featurizer whose ``featurize`` accepts an array of URLs and
            yields one item per URL, with the embedding matrix at index 1.
            NOTE(review): assumes 31 tokens per URL, so that the args part
            has the 10 timesteps the second encoder expects - confirm
            against UrlFeaturizer.
        glove: True when the embeddings are 300-dimensional (e.g. GloVe),
            False for 100-dimensional embeddings.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays, with one row per URL
        that passes ``valid_url``.
    """
    # Keep only rows with a valid URL; the caller's frame is left untouched.
    df = dataset.reset_index(drop=True)
    df = df[df.url.apply(valid_url)]
    one_hot = pd.get_dummies(data=df, columns=['label'])
    labels = one_hot.drop(['idx', 'url', 'uni'], axis=1).values

    # Use the featurizer passed in as `UF`; the function previously read the
    # notebook-global `feat` and silently ignored this parameter.
    raw_features = UF.featurize(np.array(df.url))

    # NOTE(review): the hand-picked features (index 0 of each item) are not
    # part of the returned vector, matching the existing behaviour - confirm
    # this is intended.
    embeddings = np.array([feature[1] for feature in raw_features])

    sub_domain = embeddings[:, :5, :]
    main_domain = embeddings[:, 5:10, :]
    domain_end_vec = embeddings[:, 10, :]
    path = embeddings[:, 11:21, :]
    arg = embeddings[:, 21:, :]

    # The two encoders differ only in sequence length; the embedding width
    # follows the `glove` flag.
    embed_dim = 300 if glove else 100
    model1 = _lstm_encoder(5, embed_dim)
    model2 = _lstm_encoder(10, embed_dim)

    # Parts of equal length share one encoder, and therefore its weights.
    sub_domain_encoded = model1.predict(sub_domain)
    main_domain_encoded = model1.predict(main_domain)
    path_encoded = model2.predict(path)
    arg_encoded = model2.predict(arg)

    features = np.concatenate(
        [sub_domain_encoded, main_domain_encoded, domain_end_vec,
         path_encoded, arg_encoded],
        axis=1)

    return features, labels

## FastText Embeddings

In [None]:
# Apply MAX_NUM_SAMPLES: draw a seeded random sample only when the dataset is
# larger than the limit, otherwise keep every row (as a copy, so later cells
# never touch `dataset` itself).
if len(dataset) > MAX_NUM_SAMPLES:
    sampled_dataset = dataset.sample(n=MAX_NUM_SAMPLES, random_state=SEED)
else:
    sampled_dataset = dataset.copy()

In [None]:
# Featurizer backed by the FastText vectors loaded for the selected dataset.
feat = UrlFeaturizer(embedding=fast_text_embedding)

Creating the average vector of all the word vectors...
Created FastText UrlFeaturizer in 0.0 s


### Testing Features by Encoding

In [None]:
# glove is left at its default (False), which selects the 100-d LSTM inputs.
url_features,labels = GenerateFeatureVector_Enc2(sampled_dataset,feat)

In [None]:
# Fix the randomness of both the split and the forest so the reported scores
# do not depend on how often, or in which order, cells were executed. The
# held-out fraction comes from the TEST_PROP constant instead of a literal.
X_train, X_test, Y_train, Y_test = train_test_split(
    url_features, labels, test_size=TEST_PROP, random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)
rf.fit(X_train, Y_train)
Y_RF_pred = rf.predict(X_test)
print(classification_report(Y_test, Y_RF_pred))

              precision    recall  f1-score   support

           0       0.89      0.72      0.79       186
           1       0.92      0.59      0.72        39
           2       0.82      0.54      0.65       213
           3       0.88      0.81      0.84       737
           4       0.82      0.36      0.50        99
           5       0.00      0.00      0.00        39
           6       0.76      0.65      0.70       333

   micro avg       0.85      0.68      0.75      1646
   macro avg       0.73      0.52      0.60      1646
weighted avg       0.82      0.68      0.74      1646
 samples avg       0.68      0.68      0.68      1646



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Support-weighted F1 over all label columns for the FastText-based features.
print(f1_score(Y_test,Y_RF_pred,average = 'weighted'))

0.7399557043784316


## GloVe Embeddings

In [None]:
# Replace the featurizer with one backed by GloVe vectors; constructing it
# took about 280 s in the run shown below.
feat = UrlFeaturizer(embedding='GloVe')

Reading the glove-wiki-gigaword-300 word vector file...
Creating the average vector of all the word vectors...
Created GloVe UrlFeaturizer in 280.0 s


### Testing features by Encoding

In [None]:
# glove=True selects the 300-d LSTM inputs needed for the
# glove-wiki-gigaword-300 vectors.
url_features, labels = GenerateFeatureVector_Enc2(
    sampled_dataset, feat, glove=True)

In [None]:
# Fix the randomness of both the split and the forest so the reported scores
# do not depend on how often, or in which order, cells were executed. The
# held-out fraction comes from the TEST_PROP constant instead of a literal.
X_train, X_test, Y_train, Y_test = train_test_split(
    url_features, labels, test_size=TEST_PROP, random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)
rf.fit(X_train, Y_train)
Y_RF_pred = rf.predict(X_test)
print(classification_report(Y_test, Y_RF_pred))

              precision    recall  f1-score   support

           0       0.92      0.55      0.69       190
           1       1.00      0.57      0.72        37
           2       0.90      0.35      0.51       232
           3       0.87      0.84      0.85       747
           4       0.92      0.25      0.39        97
           5       0.00      0.00      0.00        27
           6       0.78      0.62      0.69       316

   micro avg       0.86      0.64      0.73      1646
   macro avg       0.77      0.45      0.55      1646
weighted avg       0.86      0.64      0.71      1646
 samples avg       0.64      0.64      0.64      1646



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Support-weighted F1 over all label columns for the GloVe-based features.
print(f1_score(Y_test,Y_RF_pred,average = 'weighted'))

0.7104947291378031


## Word2Vec Embeddings

In [None]:
# Replace the featurizer with one backed by Word2Vec vectors; constructing it
# took about 622 s in the run shown below.
feat = UrlFeaturizer(embedding='Word2Vec')

Reading the word2vec-google-news-300 word vector file...
Creating the average vector of all the word vectors...
Created Word2Vec UrlFeaturizer in 622.0 s


### Testing features by Encoding

In [None]:
# Drop the references to the raw dataset objects and force a collection -
# presumably to free memory before encoding with the large Word2Vec model.
# `sampled_dataset`, a separate copy, is what the cells below use.
# NOTE(review): after this cell `dataset` and `datasets` are None, so earlier
# cells that read them cannot be re-run without reloading datasets.pkl.
import gc
dataset = None
datasets = None
gc.collect()

4450

In [None]:
# glove=True selects the 300-d LSTM inputs; the word2vec-google-news-300
# vectors need the same width as GloVe, hence the reuse of the flag.
url_features, labels = GenerateFeatureVector_Enc2(
    sampled_dataset, feat, glove=True)

In [None]:
# Fix the randomness of both the split and the forest so the reported scores
# do not depend on how often, or in which order, cells were executed. The
# held-out fraction comes from the TEST_PROP constant instead of a literal.
X_train, X_test, Y_train, Y_test = train_test_split(
    url_features, labels, test_size=TEST_PROP, random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)
rf.fit(X_train, Y_train)
Y_RF_pred = rf.predict(X_test)
print(classification_report(Y_test, Y_RF_pred))

              precision    recall  f1-score   support

           0       0.91      0.71      0.80       174
           1       0.96      0.59      0.73        44
           2       0.81      0.27      0.41       234
           3       0.87      0.79      0.83       766
           4       0.79      0.20      0.32        94
           5       0.00      0.00      0.00        20
           6       0.72      0.55      0.62       314

   micro avg       0.84      0.61      0.71      1646
   macro avg       0.72      0.44      0.53      1646
weighted avg       0.82      0.61      0.68      1646
 samples avg       0.61      0.61      0.61      1646



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Support-weighted F1 over all label columns for the Word2Vec-based features.
print(f1_score(Y_test,Y_RF_pred,average = 'weighted'))

0.6839293796123764
