In [1]:
from typing import List, Union, Tuple, Any
import os
import csv
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
%%capture
# 'capture' silences output.
# The below cd is necessary to import local functions in parent directory
%cd ..

from read_data import read_ilp
from url_tokenizer import url_tokenizer, flatten_url_data
from featurizer import UrlFeaturizer, SAMPLE

In [3]:
# Names of the derived DataFrame columns: URL_CLEANED holds the space-joined
# URL tokens, URL_FEAT is the prefix of the numeric feature columns
# (`UrlFeat_0`, `UrlFeat_1`, ...).
URL_CLEANED = 'UrlCleaned'
URL_FEAT = 'UrlFeat'

# Passed as `test_size` to train_test_split: the fraction of rows held out
# for validation.
VAL_PROP = 0.2

SEED = 42  # for reproducibility
# Seeds NumPy's global RNG; SEED is also passed explicitly as `random_state`
# to the shuffle, the splits and LinearSVC further down.
np.random.seed(SEED)

In [23]:
# Featurizer whose `featurize(url)` method is called once per URL in the loop
# below to obtain the numeric URL feature vector.
feat = UrlFeaturizer(SAMPLE, verbose=False)

In [5]:
# Read the dataset via the project helper `read_ilp` and shuffle every row
# (frac=1) with a fixed seed so the row order is reproducible.
ilp = read_ilp().sample(frac=1, random_state=SEED)
ilp

Unnamed: 0,idx,url,label,uni
3078,3078,http://www.cs.tufts.edu/ad.html,other,misc
1172,1172,http://www.cs.uoregon.edu/classes/cis211/,course,misc
8214,8214,http://www.cs.washington.edu/homes/fisher/,staff,washington
496,496,http://swarm.wustl.edu/~roman/,faculty,misc
2132,2132,http://www.cs.cornell.edu/Info/People/ahuja/re...,other,cornell
...,...,...,...,...
5734,5734,http://www.cs.utexas.edu/users/nl-acq/prop-pap...,other,texas
5191,5191,http://www.cs.wisc.edu/coral/coral.bugreport.html,other,wisconsin
5390,5390,http://www.cs.utexas.edu/users/yufeng/cs378/wi...,other,texas
860,860,http://www.cs.umd.edu/~keleher/,faculty,misc


In [6]:
# Class balance: number of rows for each value of the `label` column.
ilp.groupby(['label']).size()

label
course         928
department     182
faculty       1123
other         3717
project        502
staff          137
student       1641
dtype: int64

In [7]:
# The `url` column as a NumPy array, in the (shuffled) row order of `ilp`.
urls = ilp['url'].to_numpy()
urls

array(['http://www.cs.tufts.edu/ad.html',
       'http://www.cs.uoregon.edu/classes/cis211/',
       'http://www.cs.washington.edu/homes/fisher/', ...,
       'http://www.cs.utexas.edu/users/yufeng/cs378/winhw1.html',
       'http://www.cs.umd.edu/~keleher/',
       'http://www.cs.utexas.edu/users/madhukar/'], dtype=object)

In [8]:
# Featurize and tokenize every URL.
#
# Both result lists stay index-aligned with `urls`: every URL contributes
# exactly one entry to each list, even when a processing step fails. A URL
# whose tokenization fails is recorded as the text 'error'; a URL whose
# featurization fails gets an all-zero feature vector.
urls_tokenized = []
urls_feat = []
failed_urls = []  # (url, repr(exception)) pairs, kept for inspection

for url in tqdm(urls):
    # Placeholders that survive if one of the steps below raises.
    vec, text = None, 'error'
    try:
        vec, _ = feat.featurize(url)

        url_data = url_tokenizer(url)
        words = flatten_url_data(url_data)
        text = ' '.join(words)
    except Exception as exc:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still stop the loop.
        failed_urls.append((url, repr(exc)))
    urls_feat.append(vec)
    urls_tokenized.append(text)

# Fill missing feature vectors with zeros shaped like a successful one, so
# the list can still be stacked into a rectangular array afterwards.
template = next((v for v in urls_feat if v is not None), None)
if template is not None:
    urls_feat = [np.zeros_like(template) if v is None else v for v in urls_feat]

if failed_urls:
    print(f'{len(failed_urls)} URL(s) could not be fully processed; see failed_urls')

print(urls_tokenized[5])
print(len(urls_tokenized))

100%|██████████| 8230/8230 [00:02<00:00, 3946.83it/s]

http www cs cornell edu info people chandra do on html
8230





In [9]:
# Stack the per-URL feature vectors into a single array; the displayed shape
# is (number of URLs, number of features).
# NOTE(review): this rebinds `urls_feat` from a list to an ndarray in place.
urls_feat = np.array(urls_feat)
urls_feat.shape

(8230, 11)

In [10]:
# Number of numeric features per URL (second axis of the feature array).
NUM_FEATS = urls_feat.shape[1]

# Attach the tokenized URL text, then one column per numeric feature, to `ilp`.
ilp[URL_CLEANED] = urls_tokenized
for feat_idx in range(NUM_FEATS):
    column_name = f'{URL_FEAT}_{feat_idx}'
    ilp[column_name] = urls_feat[:, feat_idx]
ilp

Unnamed: 0,idx,url,label,uni,UrlCleaned,UrlFeat_0,UrlFeat_1,UrlFeat_2,UrlFeat_3,UrlFeat_4,UrlFeat_5,UrlFeat_6,UrlFeat_7,UrlFeat_8,UrlFeat_9,UrlFeat_10
3078,3078,http://www.cs.tufts.edu/ad.html,other,misc,http www cs tufts edu ad html,0,1,2,1,0,2,1,0,0,0,0
1172,1172,http://www.cs.uoregon.edu/classes/cis211/,course,misc,http www cs u oregon edu classes c is 211,0,2,2,1,0,4,1,0,3,0,3
8214,8214,http://www.cs.washington.edu/homes/fisher/,staff,washington,http www cs washington edu homes fisher,0,1,2,1,0,2,1,0,0,0,0
496,496,http://swarm.wustl.edu/~roman/,faculty,misc,http swarm wu st l edu roman,0,3,1,0,0,1,1,0,0,0,0
2132,2132,http://www.cs.cornell.edu/Info/People/ahuja/re...,other,cornell,http www cs cornell edu info people ahu j a re...,0,1,2,1,0,7,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,5734,http://www.cs.utexas.edu/users/nl-acq/prop-pap...,other,texas,http www cs u texas edu users nl acq prop pape...,0,2,2,1,0,6,1,0,0,0,0
5191,5191,http://www.cs.wisc.edu/coral/coral.bugreport.html,other,wisconsin,http www cs wisc edu coral coral bug report html,0,1,2,1,0,5,1,0,0,0,0
5390,5390,http://www.cs.utexas.edu/users/yufeng/cs378/wi...,other,texas,http www cs u texas edu users yu feng cs 378 w...,0,2,2,1,0,10,1,0,4,0,4
860,860,http://www.cs.umd.edu/~keleher/,faculty,misc,http www cs umd edu kele her,0,1,2,1,0,2,1,0,0,0,0


In [11]:
# Names of the numeric feature columns added to `ilp`
# (`UrlFeat_0` ... `UrlFeat_{NUM_FEATS - 1}`).
extra_features = [f'{URL_FEAT}_{i}' for i in range(NUM_FEATS)]

In [12]:
def transform_func(x):
    """Compress non-negative, count-like features with ``log(1 + x)``.

    ``np.log1p`` already adds one to its argument, so ``x`` is passed as-is;
    a zero input therefore stays zero.

    Args:
        x: Array-like of numeric features (anything ``np.log1p`` accepts).

    Returns:
        The element-wise ``log(1 + x)`` of ``x``.
    """
    return np.log1p(x)


def fit_transform_nb(df):
    """Fit the Naive Bayes preprocessing on ``df`` and transform ``df`` with it.

    Only the ``URL_CLEANED`` text column is used: it is vectorized with a
    word-level TF-IDF (``strip_accents='unicode'``).

    Args:
        df: DataFrame containing at least the ``URL_CLEANED`` column.

    Returns:
        tuple: ``(mat, column_trans)`` - the transformed ``df`` and the fitted
        ``ColumnTransformer``, which the caller reuses on held-out data.
    """
    tfidf = TfidfVectorizer(analyzer='word', strip_accents='unicode')
    branches = [
        ('tfidf_word', tfidf, URL_CLEANED),
        # The numeric-feature branch is currently disabled for this model:
        # ('feat', FunctionTransformer(transform_func), extra_features)
    ]
    column_trans = ColumnTransformer(branches)
    mat = column_trans.fit_transform(df)
    return mat, column_trans

In [13]:
# Inputs: tokenized URL text plus the numeric feature columns. Target: label.
X = ilp[[URL_CLEANED, *extra_features]]
y = ilp['label']

# Hold out VAL_PROP of the rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_PROP, random_state=SEED
)

# Fit the Naive Bayes preprocessing on the training split only.
X_train_pre, trans = fit_transform_nb(X_train)

In [14]:
X_train_pre

<6584x5219 sparse matrix of type '<class 'numpy.float64'>'
	with 58334 stored elements in Compressed Sparse Row format>

# Naive Bayes

### Train the model

In [15]:
# Smoothing parameter handed to MultinomialNB.
alpha = 0.6

# fit_prior=False: do not learn class priors from the training labels.
model = MultinomialNB(alpha=alpha, fit_prior=False)
model = model.fit(X_train_pre, y_train)

### Test on train data

In [16]:
# Predictions of the Naive Bayes model on the data it was trained on.
preds_train = model.predict(X_train_pre)

# Macro-averaged F1 and plain accuracy on the training split.
f1_train, acc_train = (
    f1_score(y_train, preds_train, average='macro'),
    accuracy_score(y_train, preds_train),
)
print(f'F1  = {f1_train}\nAcc = {acc_train}')

F1  = 0.7254152539498947
Acc = 0.818955042527339


### Test on val data

In [17]:
# Transform the validation split with the transformer fitted on the training
# split, then predict.
X_val_pre = trans.transform(X_val)
preds_val = model.predict(X_val_pre)

# Naive Bayes validation metrics; these names are read again when the results
# are written to disk.
f1_val_nb, acc_val_nb = (
    f1_score(y_val, preds_val, average='macro'),
    accuracy_score(y_val, preds_val),
)
print(f'F1  = {f1_val_nb}\nAcc = {acc_val_nb}')

F1  = 0.48199705726025466
Acc = 0.6670716889428918


# Logistic Regression

In [18]:
def id_func(x):
    """Identity function: return ``x`` unchanged.

    Wrapped in a ``FunctionTransformer`` so the numeric feature columns pass
    through the ``ColumnTransformer`` as they are.
    """
    return x


def fit_transform_lr(df):
    """Fit the Logistic Regression preprocessing on ``df`` and transform it.

    Two branches are combined: a word-level TF-IDF of the ``URL_CLEANED``
    column, and the ``extra_features`` columns passed through unchanged.

    Args:
        df: DataFrame containing ``URL_CLEANED`` and the ``extra_features``
            columns.

    Returns:
        tuple: ``(mat, column_trans)`` - the transformed ``df`` and the fitted
        ``ColumnTransformer``.
    """
    text_branch = (
        'tfidf_word',
        TfidfVectorizer(analyzer='word', strip_accents='unicode'),
        URL_CLEANED,
    )
    numeric_branch = ('feat', FunctionTransformer(id_func), extra_features)

    column_trans = ColumnTransformer([text_branch, numeric_branch])
    mat = column_trans.fit(df).transform(df)
    return mat, column_trans

In [19]:
# Same inputs and target as before: URL text + numeric features -> label.
y = ilp['label']
X = ilp[[URL_CLEANED, *extra_features]]

# Identical seed and proportion, so this reproduces the same train/val split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=SEED, test_size=VAL_PROP
)

# Fit the Logistic Regression preprocessing on the training split only.
X_train_pre, trans = fit_transform_lr(X_train)

### Train the model

In [20]:
# Value passed as LogisticRegression's `C` parameter.
C = 0.8

# class_weight='balanced' is used because the label counts shown earlier are
# heavily skewed; max_iter is raised to 1200.
model = LogisticRegression(C=C, class_weight='balanced', max_iter=1200)
model = model.fit(X_train_pre, y_train)

### Test on train data

In [21]:
# Predictions of the Logistic Regression model on its own training data.
preds_train = model.predict(X_train_pre)

# Accuracy and macro-averaged F1 on the training split.
acc_train = accuracy_score(y_train, preds_train)
f1_train = f1_score(y_train, preds_train, average='macro')
print(f'F1  = {f1_train}\nAcc = {acc_train}')

F1  = 0.7771079304763951
Acc = 0.8165249088699879


### Test on val data

In [22]:
# Apply the already-fitted transformer to the validation split and predict.
X_val_pre = trans.transform(X_val)
preds_val = model.predict(X_val_pre)

# Logistic Regression validation metrics; read again when the results are
# written to disk.
acc_val_lr = accuracy_score(y_val, preds_val)
f1_val_lr = f1_score(y_val, preds_val, average='macro')
print(f'F1  = {f1_val_lr}\nAcc = {acc_val_lr}')

F1  = 0.5781321808415699
Acc = 0.6907654921020656


# Support Vector Machine

In [73]:
def id_func(x):
    """Return the input as-is.

    NOTE(review): this notebook defines ``id_func`` twice with the same body;
    the later definition simply rebinds the name.
    """
    return x


def fit_transform_svm(df):
    """Fit the SVM preprocessing on ``df`` and return ``df`` transformed.

    Combines a word-level TF-IDF of ``URL_CLEANED`` with the ``extra_features``
    columns, which go through ``id_func`` and are therefore left unscaled.

    Args:
        df: DataFrame containing ``URL_CLEANED`` and the ``extra_features``
            columns.

    Returns:
        tuple: ``(mat, column_trans)`` - the transformed ``df`` and the fitted
        ``ColumnTransformer``.
    """
    column_trans = ColumnTransformer(
        transformers=[
            (
                'tfidf_word',
                TfidfVectorizer(analyzer='word', strip_accents='unicode'),
                URL_CLEANED,
            ),
            ('feat', FunctionTransformer(id_func), extra_features),
        ]
    )
    return column_trans.fit_transform(df), column_trans

In [74]:
# Feature frame (URL text + numeric columns) and label vector for the SVM.
model_columns = [URL_CLEANED, *extra_features]
X, y = ilp[model_columns], ilp['label']
del model_columns  # keep the notebook namespace unchanged

# Same seed and validation proportion as the other baselines.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_PROP,
    random_state=SEED,
)

# Fit the SVM preprocessing on the training split only.
X_train_pre, trans = fit_transform_svm(X_train)

### Train the model

In [79]:
# Linear SVM, seeded for reproducibility, with max_iter raised to 5000.
model = svm.LinearSVC(max_iter=5000, random_state=SEED)
model = model.fit(X_train_pre, y_train)



### Test on train data

In [80]:
# Predictions of the linear SVM on its own training data.
preds_train = model.predict(X_train_pre)

# Macro-averaged F1 and accuracy on the training split.
train_scores = {
    'f1': f1_score(y_train, preds_train, average='macro'),
    'acc': accuracy_score(y_train, preds_train),
}
f1_train, acc_train = train_scores['f1'], train_scores['acc']
del train_scores  # keep the notebook namespace unchanged
print(f'F1  = {f1_train}\nAcc = {acc_train}')

F1  = 0.9526877568588937
Acc = 0.9647630619684082


### Test on val data

In [81]:
# Transform the validation split with the fitted transformer, then predict.
X_val_pre = trans.transform(X_val)
preds_val = model.predict(X_val_pre)

# SVM validation metrics; read again when the results are written to disk.
f1_val_svm, acc_val_svm = (
    f1_score(y_val, preds_val, average='macro'),
    accuracy_score(y_val, preds_val),
)
print(f'F1  = {f1_val_svm}\nAcc = {acc_val_svm}')

F1  = 0.5874028949464528
Acc = 0.7339003645200486


# Write the validation results to disk

In [87]:
# Validation metrics of the three baselines, keyed by model name.
accs = {
    'NB': {'F1': f1_val_nb, 'Val Acc': acc_val_nb},
    'LR': {'F1': f1_val_lr, 'Val Acc': acc_val_lr},
    'SVM': {'F1': f1_val_svm, 'Val Acc': acc_val_svm}
}

# The path is relative to the current working directory, which the `%cd ..`
# setup cell changed to the parent directory.
results_path = 'baselines/results/ilp.json'

# Create the results directory if it is missing, so the write does not fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, 'w') as f:
    json.dump(accs, f)