In [13]:
import os
os.getcwd()

'/Users/izapreev/Projects/SimpDOM'

In [1]:
# Make sure the source code auto reloads into the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# To help preventing shared maemory errors
!ulimit -n 500000
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

In [3]:
import torch
import random
import pickle

import numpy as np
import pandas as pd
import torch.nn as nn
import pytorch_lightning as pl

from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint

from Utils.logger import logger
from train_and_eval import load_dict
from Model.SimpDOM_model import SeqModel
from Prediction.test_step import main as get_predictions
from Utils.pretrainedGloVe import pretrainedWordEmeddings
from DataLoader.swde_dataLoader import swde_data_test, collate_fn_test

In [4]:
# Model Configurations

datapath = './data'
random.seed(7)
device = 'cpu'

n_workers=0 # Important to keep this at zero as otherwise we get a shared memory error
n_gpus=0
char_emb_dim = 16
char_hid_dim = 100
char_emb_dropout = 0.1

tag_emb_dim = 16
tag_hid_dim = 30

leaf_emb_dim = 30
pos_emb_dim = 20
word_emb_filename= f'{datapath}/glove.6B.100d.txt'

train_websites = ['auto-aol','auto-yahoo','auto-motortrend','auto-autobytel', 'auto-msn', ]
val_websites = ['auto-aol','auto-yahoo']
attributes = ['model', 'price', 'engine', 'fuel_economy']
n_classes = len(attributes)+1
class_weights = [1,100,100,100,100]

In [5]:
# Load the pre-trained embeddings
charDict = load_dict(f'{datapath}/English_charDict.pkl')
tagDict = load_dict(f'{datapath}/HTMLTagDict.pkl')
WordEmeddings = pretrainedWordEmeddings(word_emb_filename)

11:28:35 INFO (train_and_eval:36): Loading ./data/English_charDict.pkl
11:28:35 INFO (train_and_eval:38): Dictionary ./data/English_charDict.pkl length: 85
11:28:35 INFO (train_and_eval:36): Loading ./data/HTMLTagDict.pkl
11:28:35 INFO (train_and_eval:38): Dictionary ./data/HTMLTagDict.pkl length: 39
11:28:35 INFO (pretrainedGloVe:9): Loading pretrained word emeddings from: ./data/glove.6B.100d.txt
11:28:39 INFO (pretrainedGloVe:22): Loaded 400000 pretrained word vectors


In [6]:
logger.info(f'Start generating test dataset')
test_dataset = DataLoader(dataset = swde_data_test(val_websites, datapath, charDict, \
                                  tagDict, n_gpus, WordEmeddings), num_workers=n_workers, \
                                  batch_size=32, shuffle=False, pin_memory = True, collate_fn = collate_fn_test)
logger.info(f'Finished creating the test dataset!')

11:28:39 INFO (981812523:1): Start generating test dataset
11:28:39 INFO (swde_dataLoader:261): Start loading data set for websites: ['auto-aol', 'auto-yahoo']


Web site:   0%|          | 0/2 [00:00<?, ?it/s]

Web pages:   0%|          | 0/2000 [00:00<?, ?it/s]

Web pages:   0%|          | 0/2000 [00:00<?, ?it/s]

11:28:54 INFO (981812523:5): Finished creating the test dataset!


 397233 - nodes are loaded in swde_dataLoader_test


In [7]:
logger.info('Instantiating the Model checkpoint')
checkpoint_callback = ModelCheckpoint(
    filename='./data/weights',
    save_top_k=1,
    save_last = True,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

config = {
    'out_dim': n_classes,
    'train_websites': train_websites,
    'val_websites': val_websites,
    'datapath': datapath,
    'n_workers': n_workers,
    'charDict' : charDict,
    'char_emb_dim' : char_emb_dim,
    'char_hid_dim' : char_hid_dim,
    'char_emb_dropout' : char_emb_dropout,
    'tagDict': tagDict,
    'tag_emb_dim': tag_emb_dim,
    'tag_hid_dim': tag_hid_dim,
    'leaf_emb_dim': leaf_emb_dim,
    'pos_emb_dim': pos_emb_dim,
    'attributes': attributes,
    'n_gpus' : n_gpus,
    'class_weights':class_weights,
    'word_emb_filename': word_emb_filename
}
logger.info(f'Model config: {config}')

11:28:54 INFO (2635199416:1): Instantiating the Model checkpoint
11:28:54 INFO (2635199416:31): Model config: {'out_dim': 5, 'train_websites': ['auto-aol', 'auto-yahoo', 'auto-motortrend', 'auto-autobytel', 'auto-msn'], 'val_websites': ['auto-aol', 'auto-yahoo'], 'datapath': './data', 'n_workers': 0, 'charDict': {'o': 1, 'g': 2, 'n': 3, 'e': 4, '\n': 5, 'c': 6, 'k': 7, '0': 8, '1': 9, 'b': 10, 'm': 11, '-': 12, '2': 13, 'y': 14, 'l': 15, ' ': 16, 'u': 17, 'h': 18, 'a': 19, 's': 20, 'r': 21, 'w': 22, 'p': 23, 't': 24, 'i': 25, 'f': 26, 'd': 27, '&': 28, 'v': 29, 'q': 30, '.': 31, '4': 32, ':': 33, '3': 34, '°': 35, '6': 36, 'x': 37, '$': 38, '5': 39, 'j': 40, 'z': 41, '9': 42, ',': 43, '8': 44, '?': 45, '7': 46, "'": 47, ')': 48, '(': 49, '\t': 50, '@': 51, '/': 52, '%': 53, '*': 54, '=': 55, '®': 56, '©': 57, '–': 58, '"': 59, ';': 60, '!': 61, '+': 62, '{': 63, '|': 64, '»': 65, '’': 66, '>': 67, '#': 68, '<': 69, '—': 70, '[': 71, ']': 72, 'ü': 73, '“': 74, '”': 75, '≈': 76, '_': 77,

In [19]:
logger.info('Loading the Sequential model from Checkpoint')
pre_trained_model_weights = f'{datapath}/weights.ckpt'
#pre_trained_model_weights = 'weights_wpix_manual_ckpt.ckpt' # This one is to test the re-trianed model
model = SeqModel.load_from_checkpoint(pre_trained_model_weights, config=config)
model = model.eval()
model = model.to(device)

15:25:03 INFO (1667893443:1): Loading the Sequential model from Checkpoint
15:25:03 INFO (pretrainedGloVe:9): Loading pretrained word emeddings from: ./data/glove.6B.100d.txt
15:25:07 INFO (pretrainedGloVe:22): Loaded 400000 pretrained word vectors


In [20]:
logger.info('Generating model predictions')
df = get_predictions(test_dataset, model, device, 0.6)

15:25:07 INFO (743827731:1): Generating model predictions


Testing batches:   0%|          | 0/12414 [00:00<?, ?it/s]

In [21]:
dump_file_name = 'test_predictions.csv'
logger.info(f'Dumping predictions dataframe into: {dump_file_name}')
df.to_csv(dump_file_name)

15:38:38 INFO (1285270816:2): Dumping predictions dataframe into: test_predictions.csv


In [22]:
from Prediction.PRSummary import cal_PR_summary
avg_prf1_dict = cal_PR_summary(df, n_classes)
logger.info(f'Prediction summary:\n{avg_prf1_dict}')

15:38:44 INFO (3452662240:3): Prediction summary:
{1: (0.5870069605568445, 0.5694618272841051, 0.5781013026927674), 2: (0.6041169451073986, 0.50625, 0.5508705114254625), 3: (0.9382561535252398, 0.1265, 0.22294194408359308), 4: (0.9994998749687422, 0.999479843953186, 0.9994898593606025)}


class - 1: precision = 0.5870069605568445, recall = 0.5694618272841051, F1 = 0.5781013026927674
class - 2: precision = 0.6041169451073986, recall = 0.50625, F1 = 0.5508705114254625
class - 3: precision = 0.9382561535252398, recall = 0.1265, F1 = 0.22294194408359308
class - 4: precision = 0.9994998749687422, recall = 0.999479843953186, F1 = 0.9994898593606025


In [23]:
from Prediction.WebsiteLevel_PR_Generator import cal_PR_summary as websiteLevel_cal_PR_summary
pr_summary_df, pr_results_df = websiteLevel_cal_PR_summary(df, n_classes)
logger.info(f'Website-level prediction summary:\n{pr_results_df}')

15:52:26 INFO (1702364033:3): Website-level prediction summary:
      website  attribute  precision    recall  retrieved_relevant  retrieved  \
0    auto-aol          1   0.359759  0.359398                 717       1993   
1    auto-aol          2   1.000000  1.000000                2000       2000   
2    auto-aol          3   0.000000  0.000000                   0          4   
3    auto-aol          4   0.998916  0.998916                1843       1845   
4  auto-yahoo          1   0.826964  0.779000                1558       1884   
5  auto-yahoo          2   0.018491  0.012500                  25       1352   
6  auto-yahoo          3   0.637280  0.126500                 253        397   
7  auto-yahoo          4   1.000000  1.000000                2000       2000   

   relevant  
0      1995  
1      2000  
2         0  
3      1845  
4      2000  
5      2000  
6      2000  
7      2000  
