In [1]:
from typing import List, Union, Tuple, Any
import os
import csv
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
%%capture
# 'capture' silences output.
# The `%cd ..` below is needed so that the local modules living in the parent
# directory can be imported.
# NOTE(review): the cd is relative, so re-running this cell without restarting
# the kernel moves up one more directory each time.
%cd ..

# Project-local helpers used throughout the notebook.
from read_data import read_dmoz
from url_tokenizer import url_tokenizer, flatten_url_data
from featurizer import UrlFeaturizer, SAMPLE

In [3]:
URL_CLEANED = 'UrlCleaned'  # column that holds the space-joined URL tokens
URL_FEAT = 'UrlFeat'  # prefix of the numeric feature columns (UrlFeat_0, UrlFeat_1, ...)

NUM_URLS = 200_000  # number of rows sampled from the full DMOZ frame
VAL_PROP = 0.2  # fraction of the sample held out for validation

SEED = 42  # shared by the sampling, the train/val splits and LinearSVC
np.random.seed(SEED)  # also seeds NumPy's legacy global RNG

In [4]:
# Featurizer whose `featurize(url)` is called once per sampled URL further down.
# NOTE(review): the meaning of SAMPLE and `verbose` is defined in featurizer.py, not visible here.
feat = UrlFeaturizer(SAMPLE, verbose=False)

In [5]:
# Load the DMOZ URL table; the displayed frame has `idx`, `url` and `label` columns.
dmoz = read_dmoz()
dmoz

Unnamed: 0,idx,url,label
0,1,http://www.liquidgeneration.com/,Adult
1,2,http://www.onlineanime.org/,Adult
2,3,http://www.ceres.dti.ne.jp/~nekoi/senno/senfir...,Adult
3,4,http://www.galeon.com/kmh/,Adult
4,5,http://www.fanworkrecs.com/,Adult
...,...,...,...
1562973,1562974,http://www.maxpreps.com/,Sports
1562974,1562975,http://www.myscore.com/,Sports
1562975,1562976,http://sportsillustrated.cnn.com/highschool,Sports
1562976,1562977,http://rss.cnn.com/rss/si_highschool?format=xml,Sports


In [6]:
# Class balance of the full DMOZ table: number of URLs per label.
dmoz.groupby('label').size()

label
Adult          35322
Arts          253811
Business      240173
Computers     117670
Games          56453
Health         60096
Home           28267
Kids           46182
News            8989
Recreation    106579
Reference      58241
Science       110255
Shopping       95270
Society       243871
Sports        101327
dtype: int64

In [7]:
# Draw a reproducible random subset of NUM_URLS rows (seeded with SEED).
dmoz_sampled = dmoz.sample(n=NUM_URLS, random_state=SEED)
dmoz_sampled

Unnamed: 0,idx,url,label
543210,543211,http://www.boogaholler.com/boogaframeset.html,Computers
302083,302084,http://www.hornershearing.com,Business
934647,934648,http://www.c-and-e-museum.org/te_tp9.htm,Recreation
44167,44168,http://www.stomptokyo.com/tvdiary/columns99/fg...,Arts
249269,249270,http://www.theatrehistory.com/british/musical0...,Arts
...,...,...,...
153270,153271,http://www.angelfire.com/me2/luxxrocks/,Arts
227236,227237,http://movies.yahoo.com/shop?d=hc&amp;cf=gen&a...,Arts
1190141,1190142,http://www.angelfire.com/pq/prophits/theprophe...,Shopping
837828,837829,http://www.freewebs.com/godlygirls/,Kids


In [8]:
# Raw URL strings of the sampled rows as a NumPy object array, in sample order.
urls = dmoz_sampled['url'].to_numpy()
urls

array(['http://www.boogaholler.com/boogaframeset.html',
       'http://www.hornershearing.com',
       'http://www.c-and-e-museum.org/te_tp9.htm', ...,
       'http://www.angelfire.com/pq/prophits/theprophetparody.html',
       'http://www.freewebs.com/godlygirls/',
       'http://www.arkanoidgames4pc.com/'], dtype=object)

In [9]:
# Featurize and tokenize every sampled URL.  Both output lists receive exactly
# one entry per URL, so they stay row-aligned with `urls` (and therefore with
# `dmoz_sampled`) even when a URL cannot be processed.
urls_tokenized = []
urls_feat = []
failed_rows = []  # positions in `urls` for which featurizing/tokenizing raised

for row, url in enumerate(tqdm(urls)):
    vec = None
    try:
        vec, _ = feat.featurize(url)
        url_data = url_tokenizer(url)
        words = flatten_url_data(url_data)
        tokens = ' '.join(words)
    except Exception:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the loop.
        # Any failure keeps the original 'error' placeholder as the token string.
        tokens = 'error'
        failed_rows.append(row)
    urls_feat.append(vec)
    urls_tokenized.append(tokens)

# URLs whose featurizer failed have no vector; give them an all-zero vector of
# the same shape as a successful one so the feature matrix stays rectangular.
if any(vec is None for vec in urls_feat):
    template = next((vec for vec in urls_feat if vec is not None), None)
    if template is None:
        raise RuntimeError('feat.featurize failed for every URL')
    filler = np.zeros_like(np.asarray(template))
    urls_feat = [filler if vec is None else vec for vec in urls_feat]

assert len(urls_feat) == len(urls_tokenized) == len(urls)
if failed_rows:
    print(f'{len(failed_rows)} URLs could not be processed; first rows: {failed_rows[:5]}')

print(urls_tokenized[5])
print(len(urls_tokenized))

100%|██████████| 200000/200000 [00:45<00:00, 4414.63it/s]

http members iinet net au bush index html
200000





In [10]:
# Stack the per-URL feature vectors into one 2-D array (rows = URLs, columns = features).
urls_feat = np.array(urls_feat)
urls_feat.shape

(200000, 11)

In [11]:
# Width of the feature matrix, i.e. how many UrlFeat_<i> columns get created.
NUM_FEATS = urls_feat.shape[1]

# Token strings go into one column; each feature-matrix column becomes its own
# DataFrame column named '<URL_FEAT>_<index>'.
dmoz_sampled[URL_CLEANED] = urls_tokenized
for feat_idx, feat_column in enumerate(urls_feat.T):
    dmoz_sampled[f'{URL_FEAT}_{feat_idx}'] = feat_column
dmoz_sampled

Unnamed: 0,idx,url,label,UrlCleaned,UrlFeat_0,UrlFeat_1,UrlFeat_2,UrlFeat_3,UrlFeat_4,UrlFeat_5,UrlFeat_6,UrlFeat_7,UrlFeat_8,UrlFeat_9,UrlFeat_10
543210,543211,http://www.boogaholler.com/boogaframeset.html,Computers,http www boog a holler com boog a frame set html,0,3,1,1,0,5,1,0,0,0,0
302083,302084,http://www.hornershearing.com,Business,http www horner shearing com,0,2,1,1,0,0,1,0,0,0,0
934647,934648,http://www.c-and-e-museum.org/te_tp9.htm,Recreation,http www c and e museum org te tp 9 htm,0,4,1,1,0,4,1,0,1,0,1
44167,44168,http://www.stomptokyo.com/tvdiary/columns99/fg...,Arts,http www stomp tokyo com tv diary columns 99 f...,0,2,1,1,0,9,1,0,2,0,2
249269,249270,http://www.theatrehistory.com/british/musical0...,Arts,http www theatre history com british musical 0...,0,2,1,1,0,4,1,0,3,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153270,153271,http://www.angelfire.com/me2/luxxrocks/,Arts,http www angel fire com me2 lux x rocks,0,2,1,1,0,4,1,0,1,0,1
227236,227237,http://movies.yahoo.com/shop?d=hc&amp;cf=gen&a...,Arts,http movies yahoo com shop d hc cf gen id 1800...,0,1,1,0,0,1,1,0,0,10,10
1190141,1190142,http://www.angelfire.com/pq/prophits/theprophe...,Shopping,http www angel fire com pq prop hits the proph...,0,2,1,1,0,7,1,0,0,0,0
837828,837829,http://www.freewebs.com/godlygirls/,Kids,http www free webs com godly girls,0,2,1,1,0,2,1,0,0,0,0


In [12]:
# Number of sampled URLs per label, to compare against the full-table counts above.
dmoz_sampled.groupby(['label']).size()

label
Adult          4483
Arts          32321
Business      30494
Computers     15098
Games          7216
Health         7751
Home           3613
Kids           5965
News           1155
Recreation    13730
Reference      7458
Science       13966
Shopping      12220
Society       31447
Sports        13083
dtype: int64

In [13]:
# Names of the numeric feature columns added to dmoz_sampled (UrlFeat_0 ... UrlFeat_{NUM_FEATS-1}).
extra_features = [f'{URL_FEAT}_{i}' for i in range(NUM_FEATS)]

In [14]:
def transform_func(x):
    """Return ``log1p(x + 1)`` element-wise.

    Because ``log1p(v)`` is ``log(1 + v)``, the result equals ``log(x + 2)``;
    an input of 0 therefore maps to ``log(2)``, not to 0.
    """
    shifted = x + 1
    return np.log1p(shifted)


def fit_transform_nb(df):
    """Fit the Naive Bayes preprocessing on ``df`` and return its transformed form.

    Only the ``URL_CLEANED`` text column is used: it is turned into word-level
    TF-IDF weights with ``strip_accents='unicode'``.  The ``'feat'`` step over
    ``extra_features`` is currently disabled (commented out below).

    Returns:
        ``(mat, column_trans)`` -- the transformed ``df`` and the fitted
        ``ColumnTransformer``, which callers reuse on held-out data.
    """
    word_tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word')
    steps = [
        ('tfidf_word', word_tfidf, URL_CLEANED),
        # ('feat', FunctionTransformer(transform_func), extra_features)
    ]
    fitted = ColumnTransformer(steps).fit(df)
    return fitted.transform(df), fitted

In [15]:
# Model inputs: the token string plus every numeric feature column; target: the label.
X = dmoz_sampled[[URL_CLEANED, *extra_features]]
y = dmoz_sampled['label']

# Seeded hold-out split, VAL_PROP of the rows go to validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_PROP, random_state=SEED
)

# Fit the preprocessing on the training rows only; `trans` is reused for validation.
X_train_pre, trans = fit_transform_nb(X_train)

In [16]:
# Inspect the preprocessed training matrix (its repr shows shape and sparsity).
X_train_pre

<160000x60923 sparse matrix of type '<class 'numpy.float64'>'
	with 1090310 stored elements in Compressed Sparse Row format>

# Naive Bayes

### Train the model

In [17]:
# Additive smoothing strength passed to the classifier.
alpha = 0.6

# fit_prior=False makes the model use a uniform class prior instead of the
# label frequencies observed in the training data.
model = MultinomialNB(fit_prior=False, alpha=alpha).fit(X_train_pre, y_train)

### Test on train data

In [18]:
# Score the fitted Naive Bayes model on the rows it was trained on.
preds_train = model.predict(X_train_pre)

acc_train = accuracy_score(y_train, preds_train)
# 'macro' averages the per-label F1 scores without weighting by label frequency.
f1_train = f1_score(y_train, preds_train, average='macro')
print(f'F1  = {f1_train}\nAcc = {acc_train}')

F1  = 0.6556236346889878
Acc = 0.6927875


### Test on val data

In [19]:
# Apply the preprocessing fitted on the training rows to the held-out rows,
# then score the Naive Bayes model on them.
X_val_pre = trans.transform(X_val)
preds_val = model.predict(X_val_pre)

acc_val_nb = accuracy_score(y_val, preds_val)
f1_val_nb = f1_score(y_val, preds_val, average='macro')  # unweighted mean over labels
print(f'F1  = {f1_val_nb}\nAcc = {acc_val_nb}')

F1  = 0.47014416139700704
Acc = 0.5188


# Logistic Regression

In [20]:
def id_func(x):
    """Identity function: return ``x`` unchanged (used as a pass-through transformer)."""
    return x


def fit_transform_lr(df):
    """Fit the Logistic Regression preprocessing on ``df`` and return its transformed form.

    Two blocks are produced side by side: word-level TF-IDF weights of the
    ``URL_CLEANED`` text column (``strip_accents='unicode'``) and the
    ``extra_features`` columns passed through unchanged via ``id_func``.

    Returns:
        ``(mat, column_trans)`` -- the transformed ``df`` and the fitted
        ``ColumnTransformer``, which callers reuse on held-out data.
    """
    word_tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word')
    passthrough = FunctionTransformer(id_func)
    steps = [
        ('tfidf_word', word_tfidf, URL_CLEANED),
        ('feat', passthrough, extra_features),
    ]
    fitted = ColumnTransformer(steps).fit(df)
    return fitted.transform(df), fitted

In [21]:
# Model inputs: the token string plus every numeric feature column; target: the label.
X = dmoz_sampled[[URL_CLEANED, *extra_features]]
y = dmoz_sampled['label']

# Same seeded hold-out split as in the other model sections.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_PROP, random_state=SEED
)

# Fit the preprocessing on the training rows only; `trans` is reused for validation.
X_train_pre, trans = fit_transform_lr(X_train)

### Train the model

In [22]:
# Inverse regularisation strength passed to LogisticRegression.
C = 0.8

# class_weight='balanced' reweights labels inversely to their frequency.
# NOTE(review): the recorded run of this cell hit the iteration limit before
# converging (see the solver warning in its output); the numeric feature
# columns are fed in unscaled, which the warning suggests addressing.
model = LogisticRegression(
    class_weight='balanced', C=C, max_iter=1200
).fit(X_train_pre, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Test on train data

In [23]:
# Score the fitted Logistic Regression model on the rows it was trained on.
preds_train = model.predict(X_train_pre)

acc_train = accuracy_score(y_train, preds_train)
# 'macro' averages the per-label F1 scores without weighting by label frequency.
f1_train = f1_score(y_train, preds_train, average='macro')
print(f'F1  = {f1_train}\nAcc = {acc_train}')

F1  = 0.6234236157275748
Acc = 0.6258875


### Test on val data

In [24]:
# Apply the preprocessing fitted on the training rows to the held-out rows,
# then score the Logistic Regression model on them.
X_val_pre = trans.transform(X_val)
preds_val = model.predict(X_val_pre)

acc_val_lr = accuracy_score(y_val, preds_val)
f1_val_lr = f1_score(y_val, preds_val, average='macro')  # unweighted mean over labels
print(f'F1  = {f1_val_lr}\nAcc = {acc_val_lr}')

F1  = 0.4761802655757706
Acc = 0.503225


# Support Vector Machine

In [25]:
# NOTE(review): this repeats the identical id_func defined in the Logistic
# Regression section; the later definition silently replaces the earlier one.
def id_func(x):
    """Identity function: return ``x`` unchanged (used as a pass-through transformer)."""
    return x


def fit_transform_svm(df):
    """Fit the SVM preprocessing on ``df`` and return its transformed form.

    Two blocks are produced side by side: word-level TF-IDF weights of the
    ``URL_CLEANED`` text column (``strip_accents='unicode'``) and the
    ``extra_features`` columns passed through unchanged via ``id_func``.

    Returns:
        ``(mat, column_trans)`` -- the transformed ``df`` and the fitted
        ``ColumnTransformer``, which callers reuse on held-out data.
    """
    word_tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word')
    passthrough = FunctionTransformer(id_func)
    steps = [
        ('tfidf_word', word_tfidf, URL_CLEANED),
        ('feat', passthrough, extra_features),
    ]
    fitted = ColumnTransformer(steps).fit(df)
    return fitted.transform(df), fitted

In [26]:
# Model inputs: the token string plus every numeric feature column; target: the label.
X, y = dmoz_sampled[[URL_CLEANED, *extra_features]], dmoz_sampled['label']

# Same seeded hold-out split as in the other model sections.
X_train, X_val, y_train, y_val = \
        train_test_split(X, y, test_size=VAL_PROP, random_state=SEED)


# Use the SVM section's own preprocessing helper (this previously called
# fit_transform_lr, leaving fit_transform_svm unused).  Fitted on the training
# rows only; `trans` is reused for validation.
X_train_pre, trans = fit_transform_svm(X_train)

### Train the model

In [27]:
# Linear SVM on the TF-IDF + numeric-feature matrix; seeded with SEED so
# repeated runs give the same model.
model = svm.LinearSVC(
    random_state=SEED, max_iter=1000
).fit(X_train_pre, y_train)



### Test on train data

In [28]:
# Score the fitted SVM on the rows it was trained on.
preds_train = model.predict(X_train_pre)

acc_train = accuracy_score(y_train, preds_train)
# 'macro' averages the per-label F1 scores without weighting by label frequency.
f1_train = f1_score(y_train, preds_train, average='macro')
print(f'F1  = {f1_train}\nAcc = {acc_train}')

F1  = 0.8074260141975116
Acc = 0.8063


### Test on val data

In [29]:
# Apply the preprocessing fitted on the training rows to the held-out rows,
# then score the SVM on them.
X_val_pre = trans.transform(X_val)
preds_val = model.predict(X_val_pre)

acc_val_svm = accuracy_score(y_val, preds_val)
f1_val_svm = f1_score(y_val, preds_val, average='macro')  # unweighted mean over labels
print(f'F1  = {f1_val_svm}\nAcc = {acc_val_svm}')

F1  = 0.5131819049474287
Acc = 0.54285


# Write the data to disk

In [30]:
# Validation metrics of the three baselines, keyed by model name.
accs = {
    'NB': {'F1': f1_val_nb, 'Val Acc': acc_val_nb},
    'LR': {'F1': f1_val_lr, 'Val Acc': acc_val_lr},
    'SVM': {'F1': f1_val_svm, 'Val Acc': acc_val_svm}
}

# Path is relative to the working directory set by the `%cd ..` at the top.
results_path = 'baselines/results/dmoz.json'
# Create the target directory if needed so open() cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, 'w') as f:
    # json.dump writes the same text as f.write(json.dumps(accs)).
    json.dump(accs, f)