In [1]:
import json
import os
import random
import numpy as np
import networkx as nx
import pandas as pd
import pickle
from sklearn.metrics import roc_auc_score, f1_score
from tqdm import tqdm, trange

import yaml
import ast

from joblib import Parallel, delayed
from multiprocessing import Queue, Process, Pool

import torch
import torch.nn as nn
from torch_geometric.utils.convert import from_networkx
from torch_geometric.loader import DataLoader

import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, GATv2Conv, SAGEConv

torch.cuda.is_available()

True

# Pipeline

In [2]:
filepath = './data'

## сайты в графы

In [31]:
# полезные функции
class Point:
    def __init__(self, x, y):
        self.x = x
        self.y = y


def rgba2rgb(rgba: tuple[int, int, int, float], background: tuple[int, int, int] = (255, 255, 255)):
    return [
        round(((1 - rgba[3]) * background[0]) + (rgba[3] * rgba[0])),
        round(((1 - rgba[3]) * background[1]) + (rgba[3] * rgba[1])),
        round(((1 - rgba[3]) * background[2]) + (rgba[3] * rgba[2])),
    ]


def check_overlap(l1, r1, l2, r2):
    rect1_corners = [(l1.x, l1.y), (r1.x, r1.y)]
    rect2_corners = [(l2.x, l2.y), (r2.x, r2.y)]
    
    for corner in rect1_corners:
        if (corner[0] >= rect2_corners[0][0] and corner[0] <= rect2_corners[1][0] and
            corner[1] >= rect2_corners[0][1] and corner[1] <= rect2_corners[1][1]):
            return True
    
    for corner in rect2_corners:
        if (corner[0] >= rect1_corners[0][0] and corner[0] <= rect1_corners[1][0] and
            corner[1] >= rect1_corners[0][1] and corner[1] <= rect1_corners[1][1]):
            return True

    return False

In [33]:
%%time

graphs_folder = './graphs'

def func(file):
    df = pd.read_csv(f'{filepath}/{file}')
    
    fl = file[:-4]
    if not os.path.exists(graphs_folder):
        os.makedirs(graphs_folder)
    
    for site in range(len(df)):
        blocks = df['Blocks'][site]
        blocks = ast.literal_eval(blocks)
        #print(blocks)
        
        label_one = df['MatchingBlocks'][site]
        
        good_blocks = []
        for b in blocks:
            if (blocks[b]['p_tag'] != 'script' and blocks[b]['p_tag'] != 'link' and
                blocks[b]['p_tag'] != 'style' and 
                (blocks[b]['size']['width'] * blocks[b]['size']['height']) != 0):
                
                good_blocks.append(b)

        G = nx.Graph()
        for i in range(len(good_blocks)):
            cur_dict = blocks[good_blocks[i]]
            
            location = cur_dict['location']
    
            size_w = cur_dict['size']['width']
            size_h = cur_dict['size']['height']

            if size_w * size_h == 0:
                fullness = 0
            else:
                fullness = round(float(cur_dict['font-size'][:-2]) * cur_dict['length'] / (size_w * size_h), 2)

            font_color = cur_dict['font-color']
            if font_color.find('rgba') != -1:
                font_color = font_color[5:-1].split(',')
                font_color = list(map(float, font_color))
                font_color = rgba2rgb(font_color)
            else:
                font_color = font_color[4:-1].split(',')
                font_color = list(map(float, font_color))

            bg_color = cur_dict['bg-color']
            if bg_color.find('rgba') != -1:
                bg_color = bg_color[5:-1].split(',')
                bg_color = list(map(float, bg_color))
                bg_color = rgba2rgb(bg_color)
            else:
                bg_color = bg_color[4:-1].split(',')
                bg_color = list(map(float, bg_color))

            color = [el1 - el2 for (el1, el2) in zip(font_color, bg_color)]

            has_img = int(cur_dict['hasImg'])

            #y = cur_dict['label']
            if good_blocks[i] in label_one:
                y = 1;
            else:
                y = 0;

            G.add_node(i,
                       x = [location['x'], location['y'], size_w, size_h, fullness,
                            color[0], color[1], color[2], has_img],
                       y = y)

        for i in range(len(good_blocks)):
            l1 = Point(blocks[good_blocks[i]]['location']['x'],
                        blocks[good_blocks[i]]['location']['y'])

            r1 = Point(blocks[good_blocks[i]]['location']['x'] + blocks[good_blocks[i]]['size']['width'],
                        blocks[good_blocks[i]]['location']['y'] + blocks[good_blocks[i]]['size']['height'])

            for j in range(i+1, len(good_blocks)):
                l2 = Point(blocks[good_blocks[j]]['location']['x'],
                            blocks[good_blocks[j]]['location']['y'])

                r2 = Point(blocks[good_blocks[j]]['location']['x'] + blocks[good_blocks[j]]['size']['width'],
                            blocks[good_blocks[j]]['location']['y'] + blocks[good_blocks[j]]['size']['height'])

                if check_overlap(l1, r1, l2, r2):
                    G.add_edge(i, j)

        #G = G.subgraph(nx.node_connected_component(G, 0))
        if nx.number_connected_components(G) > 0:
            G_sorted = sorted(nx.connected_components(G), key=len, reverse=True)
            G = G.subgraph(G_sorted[0])
            pickle.dump(G, open(f'./{graphs_folder}/{fl}_{site}.pkl', 'wb'))


result = Parallel(n_jobs=-1)(delayed(func)(file) for file in os.listdir(filepath))

CPU times: user 105 ms, sys: 225 ms, total: 331 ms
Wall time: 10min 5s


## проверка качества

In [34]:
comps = 0
comp_size = 0
nodes = 0
edges = 0
label = 0
    
for grph in tqdm(os.listdir(graphs_folder)):
    G = pickle.load(open(f'./{graphs_folder}/{grph}', 'rb'))
        
    for component in nx.connected_components(G):
        comp_size += len(component)
        
    comps += nx.number_connected_components(G)
    nodes += G.number_of_nodes()
    edges += G.number_of_edges()
        
    for n in G.nodes:
        label += G.nodes[n]['y']


num = len(os.listdir(graphs_folder))
    
print('comp num average:\t', comps/num)
print('comp size average:\t', comp_size/num)
print('node num average:\t',nodes/num)
print('edge num average:\t', edges/num)
print('labels average:\t\t', label/num)
print('labels total:\t\t', label)
#print()

100%|███████████████████████████████████████| 1824/1824 [00:45<00:00, 40.38it/s]

comp num average:	 1.0
comp size average:	 785.2708333333334
node num average:	 785.2708333333334
edge num average:	 13104.327302631578
labels average:		 47.0515350877193
labels total:		 85822





# DL магия

### данные

In [43]:
graphs_folder = './graphs'

In [44]:
data_list = []
for grph in tqdm(os.listdir(graphs_folder)):
    G = pickle.load(open(f'{graphs_folder}/{grph}', 'rb'))
    
    lbl = 0
    for n in G.nodes:
        lbl += G.nodes[n]['y']
    
    if lbl > 0:
        data_list.append(from_networkx(G))


random.Random(42).shuffle(data_list)
print(len(data_list))

100%|███████████████████████████████████████| 1824/1824 [02:07<00:00, 14.27it/s]

1500





In [45]:
train_loader = DataLoader(data_list[:1125], batch_size=125, shuffle=True)
test_loader = DataLoader(data_list[1125:], batch_size=125, shuffle=True)

### Модели

In [8]:
class GCN(torch.nn.Module):
    def __init__(self, hidden):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(9, hidden)
        self.conv2 = GCNConv(hidden, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [9]:
class GCN_1(torch.nn.Module):
    def __init__(self):
        super(GCN_1, self).__init__()
        self.conv1 = GCNConv(9, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)

        return F.log_softmax(x, dim=1)

In [10]:
class SAGE(torch.nn.Module):
    def __init__(self, hidden):
        super(SAGE, self).__init__()
        self.conv1 = SAGEConv(9, hidden)
        self.conv2 = SAGEConv(hidden, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [11]:
class SAGE_1(torch.nn.Module):
    def __init__(self):
        super(SAGE_1, self).__init__()
        self.conv1 = SAGEConv(9, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)

        return F.log_softmax(x, dim=1)

In [12]:
class GAT(torch.nn.Module):
    def __init__(self, hidden):
        super(GAT, self).__init__()
        self.conv1 = GATConv(9, hidden)
        self.conv2 = GATConv(hidden, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [13]:
class GAT_1(torch.nn.Module):
    def __init__(self):
        super(GAT_1, self).__init__()
        self.conv1 = GATConv(9, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)

        return F.log_softmax(x, dim=1)

In [14]:
class GATv2(torch.nn.Module):
    def __init__(self, hidden):
        super(GATv2, self).__init__()
        self.conv1 = GATv2Conv(9, hidden)
        self.conv2 = GATv2Conv(hidden, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [15]:
class GATv2_1(torch.nn.Module):
    def __init__(self):
        super(GATv2_1, self).__init__()
        self.conv1 = GATv2Conv(9, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)

        return F.log_softmax(x, dim=1)

### Обучение

In [46]:
def loss_func(output, y_batch):
    return torch.nn.NLLLoss()(output, y_batch)

In [54]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def best_model_creation(hidden, epochs_num, f1_best, path, name):
    flag = 0
    f1 = round(sum(f1_best) / len(f1_best), 2)
    
    for i in range(100):
        model = SAGE(hidden).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

        for epoch in range(epochs_num):
            # train
            for dt in train_loader:
                data = dt.to(device)

                optimizer.zero_grad()
                out = model(data)
                loss = loss_func(out, data.y)
                loss.backward()
                optimizer.step()
        
            # test
            f1_log, acc_log, roc_auc_log = [], [], []
            model.eval()

            for dt in test_loader:
                data = dt.to(device)

                output = model(data)
                pred = pred = torch.argmax(output, dim=1)

                f1_log.append(f1_score(data.y.cpu().numpy().flatten(), pred.cpu().numpy().flatten()))

                if np.std(data.y.cpu().numpy().flatten()) == 0:
                    roc_auc = 0
                else:
                    roc_auc = roc_auc_score(data.y.cpu().numpy().flatten(),
                                                nn.Softmax(dim=1)(output)[:, 1].detach().cpu().numpy().flatten())

                roc_auc_log.append(roc_auc)

                acc = torch.mean((pred == data.y).float())
                acc_log.append(acc.cpu().numpy())

            if ((sum(f1_log) / len(f1_log)) > (sum(f1_best) / len(f1_best))):
                flag = 1
                f1_best = f1_log
                f1 = round(sum(f1_best) / len(f1_best), 3)
                roc_auc_best = round(sum(roc_auc_log) / len(roc_auc_log), 3)
                epoch_best = epoch
                best_model = model
    
    if flag == 1:
        print('hidden:\t', hidden)
        print('epoch:\t', epoch_best)
        print('f1:\t', f1)
        print('roc-auc:', roc_auc_best)
        print()
        
        torch.save(best_model.state_dict(), f'{path}/{name}_{hidden}_{epoch_best}_{f1}_{roc_auc_best}.graph.state')
        torch.save(best_model, f'{path}/{name}_{hidden}_{epoch_best}_{f1}_{roc_auc_best}.graph')
    
    return f1_best


f1_best = [0.2]
for h in trange(4, 8):
    f1_best = best_model_creation(h, 10, f1_best, './best_models/SAGE', 'SAGE')

 25%|███████████                                 | 1/4 [02:06<06:18, 126.04s/it]

hidden:	 4
epoch:	 2
f1:	 0.558
roc-auc: 0.741



 50%|██████████████████████                      | 2/4 [04:10<04:10, 125.12s/it]

hidden:	 5
epoch:	 3
f1:	 0.614
roc-auc: 0.76



 75%|█████████████████████████████████           | 3/4 [06:17<02:06, 126.00s/it]

hidden:	 6
epoch:	 6
f1:	 0.621
roc-auc: 0.841



100%|████████████████████████████████████████████| 4/4 [08:24<00:00, 126.15s/it]

hidden:	 7
epoch:	 6
f1:	 0.621
roc-auc: 0.846






In [112]:
torch.cuda.empty_cache()

In [86]:
loader = DataLoader(data_list, batch_size=147, shuffle=True)
model = torch.load('./best_models/SAGE/SAGE_4_5_0.653_0.916.graph')
model.eval()

roc_auc_log, f1_log, acc_log = [], [], []

for dt in loader:
    data = dt.to(device)

    output = model(data)
    pred = pred = torch.argmax(output, dim=1)

    f1_log.append(f1_score(data.y.cpu().numpy().flatten(), pred.cpu().numpy().flatten()))

    if np.std(data.y.cpu().numpy().flatten()) == 0:
        roc_auc = 0
    else:
        roc_auc = roc_auc_score(data.y.cpu().numpy().flatten(),
                                    nn.Softmax(dim=1)(output)[:, 1].detach().cpu().numpy().flatten())

    roc_auc_log.append(roc_auc)

    acc = torch.mean((pred == data.y).float())
    acc_log.append(acc.cpu().numpy())


print('SAGE')
print('acc:\t', round(sum(acc_log) / len(acc_log), 2))
print('f1:\t', round(sum(f1_log) / len(f1_log), 2))
print('roc_auc:', round(sum(roc_auc_log) / len(roc_auc_log), 2))

acc:	 0.95
f1:	 0.67
roc_auc: 0.91


# DOM дерево

In [49]:
graphs_folder = './graphs_dom'

In [35]:
%%time

graphs_folder = './graphs_dom'

def func(file):
    df = pd.read_csv(f'{filepath}/{file}')
    
    fl = file[:-4]
    if not os.path.exists(graphs_folder):
        os.makedirs(graphs_folder)
    
    for site in range(len(df)):
        blocks = df['Blocks'][site]
        blocks = ast.literal_eval(blocks)
        #print(blocks)
        
        label_one = df['MatchingBlocks'][site]
        
        good_blocks = []
        for b in blocks:
            if (blocks[b]['p_tag'] != 'script' and blocks[b]['p_tag'] != 'link' and
                blocks[b]['p_tag'] != 'style' and 
                (blocks[b]['size']['width'] * blocks[b]['size']['height']) != 0):
                
                good_blocks.append(b)

        G = nx.Graph()
        node_dict = {}
        for i in range(len(good_blocks)):
            cur_dict = blocks[good_blocks[i]]
            
            node_dict[good_blocks[i]] = i
            
            location = cur_dict['location']
    
            size_w = cur_dict['size']['width']
            size_h = cur_dict['size']['height']

            if size_w * size_h == 0:
                fullness = 0
            else:
                fullness = round(float(cur_dict['font-size'][:-2]) * cur_dict['length'] / (size_w * size_h), 2)

            font_color = cur_dict['font-color']
            if font_color.find('rgba') != -1:
                font_color = font_color[5:-1].split(',')
                font_color = list(map(float, font_color))
                font_color = rgba2rgb(font_color)
            else:
                font_color = font_color[4:-1].split(',')
                font_color = list(map(float, font_color))

            bg_color = cur_dict['bg-color']
            if bg_color.find('rgba') != -1:
                bg_color = bg_color[5:-1].split(',')
                bg_color = list(map(float, bg_color))
                bg_color = rgba2rgb(bg_color)
            else:
                bg_color = bg_color[4:-1].split(',')
                bg_color = list(map(float, bg_color))

            color = [el1 - el2 for (el1, el2) in zip(font_color, bg_color)]

            has_img = int(cur_dict['hasImg'])

            #y = cur_dict['label']
            if good_blocks[i] in label_one:
                y = 1;
            else:
                y = 0;

            G.add_node(i,
                       x = [location['x'], location['y'], size_w, size_h, fullness,
                            color[0], color[1], color[2], has_img],
                       y = y)

        for i in range(len(good_blocks)):
            parent = good_blocks[i][:-(len(good_blocks[i].split('/')[-1]) + 1)]
            if parent in node_dict:
                G.add_edge(node_dict[good_blocks[i]], node_dict[parent])

        #G = G.subgraph(nx.node_connected_component(G, 0))
        if nx.number_connected_components(G) > 0:
            G_sorted = sorted(nx.connected_components(G), key=len, reverse=True)
            G = G.subgraph(G_sorted[0])
            pickle.dump(G, open(f'./{graphs_folder}/{fl}_{site}.pkl', 'wb'))

        
result = Parallel(n_jobs=-1)(delayed(func)(file) for file in os.listdir(filepath))

CPU times: user 39.8 ms, sys: 196 ms, total: 236 ms
Wall time: 2min 26s


### проверка качества

In [36]:
comps = 0
comp_size = 0
nodes = 0
edges = 0
label = 0
    
for grph in tqdm(os.listdir(graphs_folder)):
    G = pickle.load(open(f'./{graphs_folder}/{grph}', 'rb'))
        
    for component in nx.connected_components(G):
        comp_size += len(component)
        
    comps += nx.number_connected_components(G)
    nodes += G.number_of_nodes()
    edges += G.number_of_edges()
        
    for n in G.nodes:
        label += G.nodes[n]['y']


num = len(os.listdir(graphs_folder))
    
print('comp num average:\t', comps/num)
print('comp size average:\t', comp_size/num)
print('node num average:\t',nodes/num)
print('edge num average:\t', edges/num)
print('labels average:\t\t', label/num)
print('labels total:\t\t', label)
#print()

100%|██████████████████████████████████████| 1824/1824 [00:15<00:00, 120.85it/s]

comp num average:	 1.0
comp size average:	 775.2549342105264
node num average:	 775.2549342105264
edge num average:	 774.2549342105264
labels average:		 48.50109649122807
labels total:		 88466





### Данные

In [50]:
data_list = []
for grph in tqdm(os.listdir(graphs_folder)):
    G = pickle.load(open(f'{graphs_folder}/{grph}', 'rb'))
    
    lbl = 0
    for n in G.nodes:
        lbl += G.nodes[n]['y']
    
    if lbl > 0:
        data_list.append(from_networkx(G))


random.Random(42).shuffle(data_list)
print(len(data_list))

100%|███████████████████████████████████████| 1824/1824 [00:29<00:00, 61.76it/s]

1602





In [51]:
train_loader = DataLoader(data_list[:1202], batch_size=121, shuffle=True)
test_loader = DataLoader(data_list[1202:], batch_size=80, shuffle=True)

### Обучение

In [39]:
def loss_func(output, y_batch):
    return torch.nn.NLLLoss()(output, y_batch)

In [53]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def best_model_creation(hidden, epochs_num, f1_best, path, name):
    flag = 0
    f1 = round(sum(f1_best) / len(f1_best), 2)
    
    for i in range(100):
        model = SAGE(hidden).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

        for epoch in range(epochs_num):
            # train
            for dt in train_loader:
                data = dt.to(device)

                optimizer.zero_grad()
                out = model(data)
                loss = loss_func(out, data.y)
                loss.backward()
                optimizer.step()
        
            # test
            f1_log, acc_log, roc_auc_log = [], [], []
            model.eval()

            for dt in test_loader:
                data = dt.to(device)

                output = model(data)
                pred = pred = torch.argmax(output, dim=1)

                f1_log.append(f1_score(data.y.cpu().numpy().flatten(), pred.cpu().numpy().flatten()))

                if np.std(data.y.cpu().numpy().flatten()) == 0:
                    roc_auc = 0
                else:
                    roc_auc = roc_auc_score(data.y.cpu().numpy().flatten(),
                                                nn.Softmax(dim=1)(output)[:, 1].detach().cpu().numpy().flatten())

                roc_auc_log.append(roc_auc)

                acc = torch.mean((pred == data.y).float())
                acc_log.append(acc.cpu().numpy())

            if ((sum(f1_log) / len(f1_log)) > (sum(f1_best) / len(f1_best))):
                flag = 1
                f1_best = f1_log
                f1 = round(sum(f1_best) / len(f1_best), 3)
                roc_auc_best = round(sum(roc_auc_log) / len(roc_auc_log), 3)
                epoch_best = epoch
                best_model = model
    
    if flag == 1:
        print('hidden:\t', hidden)
        print('epoch:\t', epoch_best)
        print('f1:\t', f1)
        print('roc-auc:', roc_auc_best)
        print()
        
        torch.save(best_model.state_dict(), f'{path}/{name}_{hidden}_{epoch_best}_{f1}_{roc_auc_best}.graph.state')
        torch.save(best_model, f'{path}/{name}_{hidden}_{epoch_best}_{f1}_{roc_auc_best}.graph')
    
    return f1_best


f1_best = [0.2]
for h in trange(4, 8):
    f1_best = best_model_creation(h, 10, f1_best, './best_models_dom/SAGE', 'SAGE')

 25%|███████████▎                                 | 1/4 [01:00<03:01, 60.44s/it]

hidden:	 4
epoch:	 1
f1:	 0.582
roc-auc: 0.85



 75%|█████████████████████████████████▊           | 3/4 [03:04<01:01, 61.58s/it]

hidden:	 6
epoch:	 3
f1:	 0.585
roc-auc: 0.778



100%|█████████████████████████████████████████████| 4/4 [04:07<00:00, 61.94s/it]

hidden:	 7
epoch:	 7
f1:	 0.614
roc-auc: 0.847




