## In this notebook, all of the data preprocessing will be done.

In [11]:
 
# importing libraries

import csv
csv.field_size_limit(5000000)
import ast
import operator
import re
import threading
import random


import sys

import requests
from tqdm import tqdm 
from bs4 import BeautifulSoup

from concurrent.futures import ThreadPoolExecutor
import concurrent

from urllib.parse import urlparse
from urllib.parse import urljoin

import spacy # we use this for word similarity

from collections import defaultdict
import time


### The number of distinct websites used for the set + opening the dataset made in web_scraping_2.ipynb:


In [28]:
data = []
distinct_websites = set()

BAD_TEXT_PATTERNS_IN_TITLE = ['releases', 'products', 'collections', 'collection', 'item', 'personalization', 'personalize', 'personalized', 'customize', 'customized', 'customise', 'customised', 'shop', 'store', 'stores', 'home', 'page', 'pages', 'about', 'contact', 'contact us', 'contact me', 'contact info']

def clean_text(s):
    # Define the pattern to allow only "normal" characters and keep relevant punctuation
    allowed_pattern = r'[^a-zA-Z0-9\-_,./:&#%+=()\[\]\'\"| ]'
    # Replace irrelevant characters with empty string (i.e., remove them)
    return re.sub(allowed_pattern, '', s)

def get_base_url(url):
    try:
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        return base_url
    except Exception as e:
        # print(f"Error parsing URL {url}: {e}")
        return None

def literal_eval(item):
    try:
        # Check if the string looks like a tuple or list
        if item.startswith('(') and item.endswith(')'):
            return ast.literal_eval(item)
        return None
    except (ValueError, SyntaxError):
        # Return the original string if evaluation fails
        return None


with open('../data/preprocessed_data_from_all_sitemaps_100000.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        row[1] = literal_eval(row[1])
        if row[1] is None or 'G Plan Chloe' in row[1]: # this website haunts me in my data with G Plan Chloe
            continue
            
        ok = True
        for word in row[1][0]:
            if word in BAD_TEXT_PATTERNS_IN_TITLE:
                print(row[1])
                ok = False
                break
        if not ok:
            continue
    
        row[1] = (clean_text(row[1][0]), row[1][1], row[1][2])
        # finally we remove anything that has a len < 2
        if len(row[1][0].split()) < 2:
            continue
        print(row[1])
        distinct_websites.add(get_base_url(row[0]))
        for i, element in enumerate(row[2:], start=2):
            if element is not None:
                element = clean_text(element)
                row[i] = element

                
        data.append(row)
        
print(len(distinct_websites))
print(data[:10])

('New releases', 7, 19)
('Product Categories', 467, 485)
('HANGING SIGN-HAND CRAFTED HAND PAINTED-3FT WIDE BY 4FT TALL', 5, 64)
('Bed Collection', 1112, 1126)
('Pippy Oak Stool', 0, 15)
('Inns Bed', 1866, 1874)
('MARTINI TABLE IN GOLD', 12, 33)
('Middleham County Dining Table', 714, 743)
('Larkinhurst Rocker Recliner', 1392, 1419)
('BED | Four Poster in black by Uniqwa', 868, 904)
('60s Bed', 432, 439)
('Hexagon Corner Showcase', 1426, 1449)
('DDEN CONTAINER', 1212, 1227)
('Archie Shower Stool with rubber, padded feet', 1061, 1107)
('Winton Wicker Outdoor Chair', 344, 371)
('Dunlin Campaign Safari Chair American Oak', 56, 97)
('Furniture Cover - Medium (10.2ft x 6.0ft x 2.3ft)', 555, 604)
('Adeline Side Table', 36, 54)
('Smile pillow', 289, 301)
('AVATAR - Functional and modern corner sofa bed with FOOTSTOOL, Drawer and pull out bed', 450, 536)
('Hawthorne Sectional', 1310, 1329)
('Old Door Buffet', 1931, 1946)
("60'' ROUND COPPER TOP NO HOLE", 1042, 1071)
('24e Airplane Door Coffee Ta

In [17]:

for i in range(2):
    print(data[i])
    
for row in data[:10]:
    url, h1_tag_position, title, url_last_path, page_text = row
    print(h1_tag_position, type(h1_tag_position))
    

['https://www.skovby.com/en-gb/products/new-releases', ('New releases', 7, 19), 'New releases', 'new releases', 'Search New releases Here you can find the latest additions to the Skovby collection. Explore our new releases! SKOVBY #140 DINING TABLE Fixed top dining table for 6 people Read more SKOVBY #141 DINING TABLE Fixed top dining table for 8 people Read more SKOVBY #142 DINING TABLE Dining table for 6-10 people Read more SKOVBY #143 DINING TABLE Dining table for 8-12 people Read more SKOVBY #844 DINING CHAIR Harmonious dining chair Read more SKOVBY #842 DINING CHAIR Stackable dining chair Read more SKOVBY #107 DINING TABLE Rectangular dining table for 6-14 people Read more SKOVBY #108 DINING TABLE Rectangular dining table for 10-22 people Read more SKOVBY #308 CABINET Cabinet with impressive spaciousness Read more SKOVBY #810 DINING CHAIR Redefining Elegance with Modern Flair Read more SKOVBY #816 DINING CHAIR Ergonomic dining chair with classic wooden legs Read more SKOVBY #815 D

### A little bit of data preprocessing (i.e. manually removing data that is likely wrong by removing entries with less that 1 words in the title, with strange symbols, etc.)

By looking at the data manually I noticed what the wrong titles contain. The dataset will still be a bit noisy but I consider we have enough data to work with.

### This method will be used for labeling the data automatically

In [None]:
def tokenize_and_label(text, h1_tag_position, title, url_last_path, token_window=30):
    tokens = text.split()
    labels = ['O'] * len(tokens)  # Default all tokens to 'O'
    
    # Create character-to-token index mapping
    char_to_token_idx = []
    current_pos = 0
    for token in tokens:
        char_to_token_idx.append(current_pos)
        current_pos += len(token) + 1  # Adding 1 for the space separator
    
    # We only care about the first h1_tag position
    h1_text, start_idx, end_idx = h1_tag_position

    # Find the token indices corresponding to the h1_tag
    start_token_idx = None
    end_token_idx = None
    for i, char_idx in enumerate(char_to_token_idx):
        if start_token_idx is None and char_idx >= start_idx:
            start_token_idx = i
        if char_idx > end_idx:
            end_token_idx = i
            break

    if start_token_idx is not None:
        # Label the h1 tag tokens
        labels[start_token_idx] = 'B-PRODUCT'
        if end_token_idx is None:  # If the tag is the last part of the text
            end_token_idx = len(tokens)
        for j in range(start_token_idx + 1, end_token_idx):
            labels[j] = 'I-PRODUCT'

        # Trim the tokens to keep a window around the product
        start_window = max(0, start_token_idx - token_window)
        end_window = min(len(tokens), end_token_idx + token_window)
        
        # Trim the tokens and labels to the window
        tokens = tokens[start_window:end_window]
        labels = labels[start_window:end_window]
        
        # [URL] oak chair [URL] [TITLE] Oak Chair made by XYZ [TITLE] [TEXT] Oak Chair made by XYZ, Price: 40 euro etc [TEXT]
        
        if url_last_path:
            url_last_path = url_last_path.split()
        else: url_last_path = ['<NO_URL>']
        
        if title:
            title = title.split()
        else: title = ['<NO_TITLE>']
        
        # Add special tokens for the URL, title, and text
        url_last_path = url_last_path.split()
        title = title.split()
        
        
        tokens = ['[URL]'] + url_last_path + ['[URL]', '[TITLE]'] + title + ['[TITLE]', '[TEXT]'] + tokens + ['[TEXT]']
        
        # Adjust the labels to match the new tokens
        labels = ['O'] + ['0'] * len(url_last_path) + ['O', 'O'] + ['0'] * len(url_last_path) + ['O', 'O'] + labels + ['O'] 

    return tokens, labels