In [1]:
import os
import csv

import numpy as np
import pandas as pd

from read_data import read_all_datasets
from url_tokenizer import url_tokenizer
from featurizer import UrlFeaturizer, GLOVE, CONCEPTNET

In [2]:
dmoz, phishing, ilp = read_all_datasets(use_sample=False)

In [3]:
urls = list(dmoz['url'].to_numpy())

for url in urls[:-30:-1]:
    tokens = url_tokenizer(url)
    print(url, tokens)

http://www.usatoday.com/sports/preps/ ('http', (['www'], ['usa', 'today'], 'com'), ['sports', 'preps'], [])
http://rss.cnn.com/rss/si_highschool?format=xml ('http', (['rss'], ['cnn'], 'com'), ['rss', 'si', 'highschool'], [(['format'], ['xml'])])
http://sportsillustrated.cnn.com/highschool ('http', (['sports', 'illustrated'], ['cnn'], 'com'), ['highschool'], [])
http://www.myscore.com/ ('http', (['www'], ['my', 'score'], 'com'), [], [])
http://www.maxpreps.com/ ('http', (['www'], ['max', 'preps'], 'com'), [], [])
http://www.hssp.cc/ ('http', (['www'], ['hssp'], 'cc'), [], [])
http://www.infosports.com/ ('http', (['www'], ['info', 'sports'], 'com'), [], [])
http://www.fciac.net/ ('http', (['www'], ['fc', 'iac'], 'net'), [], [])
http://aenwebsites.com/tsl/ ('http', ([], ['a', 'en', 'websites'], 'com'), ['tsl'], [])
http://www.aodonline.org/chsl/chsl.htm ('http', (['www'], ['a', 'odon', 'line'], 'org'), ['chsl', 'ch', 'sl', 'htm'], [])
http://www.gisastats.com ('http', (['www'], ['g', 'is'

In [4]:
test_idx = 44

In [5]:
url = urls[test_idx]
url

'http://www.three-musketeers.net/mike/animeopinions.html'

In [6]:
feat = UrlFeaturizer(GLOVE)

Reading the glove-wiki-gigaword-300 word vector file...
Creating the average vector of all the word vectors...
Created GloVe UrlFeaturizer in 96.3 s


In [7]:
url_feats = feat.featurize(urls[:50])
len(url_feats)

50

In [8]:
vec, mat = url_feats[test_idx]
print(vec)
print(mat.shape)
print(mat[:2][:15])

[0 2 1 1 0 4 1 0 0 0 0 0 9]
(31, 300)
[[ 0.11432    -0.14889     0.18291    -0.26842999  0.11861     0.66561002
  -0.59749001 -0.10577    -0.014548    0.28941    -0.12970001  0.074891
  -0.43320999 -0.057939    0.29025999  0.086225   -0.14473    -0.11384
  -0.38073     1.01419997  0.056362    0.63062    -0.16997001 -0.25104001
  -0.18337999  0.31681001  0.78151    -0.27998999  0.088714    0.1657
   0.57279998 -0.07964     0.50918001  0.14729001  0.40996999 -0.2088
  -0.23492    -0.19464    -0.26890999 -0.33414     0.09558    -0.20982
  -0.29330999  0.36688    -0.041555   -0.72365999  0.62620002 -0.15138
  -0.43889999 -0.14891     0.405       0.0096935   0.68958998 -0.74760002
  -0.49936     0.097788    0.077734   -0.47279    -0.32644001 -0.11835
  -0.23736    -0.28724    -0.89468002 -0.027208    0.29578     0.0068149
   0.59254003  0.18092     0.32188001 -0.20669     0.20333     0.41376999
   0.42392001 -0.15727     0.17524999  0.01274     0.40242001 -0.1406
  -0.40309     0.042242   -