## In this notebook, all of the data preprocessing will be done.

In [59]:
 
# importing libraries

import csv
csv.field_size_limit(5000000)
import ast
import operator
import re
import threading
import random
from rapidfuzz import fuzz


import sys

import requests
from tqdm import tqdm 
from bs4 import BeautifulSoup

from concurrent.futures import ThreadPoolExecutor
import concurrent

from urllib.parse import urlparse
from urllib.parse import urljoin

import spacy # we use this for word similarity

from collections import defaultdict
import time


### A little bit of data preprocessing (i.e. manually removing data that is likely wrong by removing entries with fewer than 2 words in the title, with strange symbols, etc.)
By looking at the data manually I noticed what the wrong titles contain. The dataset will still be a bit noisy but I consider we have enough data to work with.
At the bottom of the code cell you can also see the number of distinct websites in the dataset just so you can get an idea of the diversity of the dataset. When I wrote the code it was 288 / 705 (the initial dataset). This should be enough diversity for the model to learn from.


In [107]:
# Rows that survive the filtering below, and the distinct site roots they came from.
data = []
distinct_websites = set()

# All generic names that would indicate that the h1 tag does not contain a product -
# we can afford to lose a few products in the dataset.
# NOTE: every entry needs its own comma. Adjacent string literals are concatenated by
# Python, so a missing comma silently merges two words into one that never matches
# (previously 'furniture' 'gift' became 'furnituregift' and 'gift' was never filtered).
BAD_TEXT_PATTERNS_IN_TITLE = [
    'releases', 'products', 'product', 'collections', 'collection', 'item',
    'personalization', 'personalize', 'personalized',
    'customize', 'customized', 'customise', 'customised',
    'shop', 'store', 'stores', 'home', 'page', 'pages', 'about',
    'contact', 'contact us', 'contact me', 'contact info',
    'furniture', 'sofas', 'chairs', 'armchairs', 'ottomans',
    'gift', 'card',
]

def clean_text(s):
    """Return ``s`` with every character outside the allowed set removed.

    Kept characters: ASCII letters and digits, whitespace, and the punctuation
    ``, . : ; ' " ! ? ( ) - & +``. Everything else is dropped (not replaced by a space).
    """
    disallowed = re.compile(r"[^a-zA-Z0-9\s,.:;\'\"!?()\-&+]")
    return disallowed.sub('', s)

def get_base_url(url):
    """Return the ``scheme://netloc`` part of ``url``.

    Returns None when ``urlparse`` (or formatting its result) raises; the error is
    deliberately swallowed so that one bad URL does not stop the caller.
    """
    try:
        parts = urlparse(url)
        return f"{parts.scheme}://{parts.netloc}"
    except Exception:
        return None

def literal_eval(item):
    """Safely parse a string that looks like a parenthesised Python literal.

    Args:
        item: The raw cell value. Only strings that start with ``(`` and end with
            ``)`` are evaluated.

    Returns:
        The value produced by ``ast.literal_eval``, or None when ``item`` is not a
        string, is not wrapped in parentheses, or cannot be evaluated as a literal.
    """
    # Non-string cells (e.g. None) cannot be parsed; treat them like any other bad value.
    if not isinstance(item, str):
        return None
    # Check if the string looks like a tuple
    if not (item.startswith('(') and item.endswith(')')):
        return None
    try:
        return ast.literal_eval(item)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # ast.literal_eval documents all of these for malformed input (e.g. TypeError
        # for an unhashable dict key). Return None so the caller can skip the row.
        return None
    
def find_all_h1_positions(text, h1_tag):
    """Locate every non-overlapping occurrence of ``h1_tag`` inside ``text``.

    Args:
        text: The string to search in.
        h1_tag: The exact (case-sensitive) substring to look for.

    Returns:
        A tuple ``(h1_tag, positions)`` where ``positions`` is a list of
        ``(start, end)`` character offsets with ``end`` exclusive, in order of
        appearance. ``positions`` is empty when ``h1_tag`` is empty or not found.
    """
    positions = []
    # An empty needle "matches" at every index; that is never a useful position.
    if not h1_tag:
        return h1_tag, positions

    tag_len = len(h1_tag)
    start_idx = text.find(h1_tag)
    while start_idx != -1:
        end_idx = start_idx + tag_len
        positions.append((start_idx, end_idx))
        # Resume right after this match (not one character later), so that
        # back-to-back occurrences are not skipped.
        start_idx = text.find(h1_tag, end_idx)
    return h1_tag, positions


# Load the scraped rows, normalise their text fields and drop rows whose h1 tag is
# unlikely to be a product name. Surviving rows are appended to `data` with
# row[1] replaced by (cleaned h1 text, list of (start, end) offsets in the cleaned page text).
with open('../data/preprocessed_data_from_all_sitemaps_100000.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        # Expected layout: url, h1_tag_position, title, url_last_path, page_text
        if len(row) != 5:
            continue  # malformed row; skip instead of failing with IndexError

        h1_info = literal_eval(row[1])
        # literal_eval returns None for unparsable cells; anything that is not a
        # (text, ..., ...) tuple cannot be used either.
        if not isinstance(h1_info, tuple) or len(h1_info) < 3 or not isinstance(h1_info[0], str):
            continue

        row[2] = clean_text(row[2])
        row[3] = clean_text(row[3])
        row[4] = clean_text(row[4])

        # Positions are recomputed against the *cleaned* page text, because cleaning
        # removes characters and shifts every offset.
        h1_tag, positions = find_all_h1_positions(row[4], clean_text(h1_info[0]))
        row[1] = (h1_tag, positions)

        # this website haunts me in my data with G Plan Chloe
        # (substring test on the h1 text; testing against the tuple only matched an
        # h1 that was exactly 'G Plan Chloe')
        if 'G Plan Chloe' in h1_tag:
            continue

        # finally we remove anything that has a len < 2
        h1_words = h1_tag.split()
        if len(h1_words) < 2:
            continue

        # Drop h1 tags containing a generic, non-product word.
        if any(word.lower() in BAD_TEXT_PATTERNS_IN_TITLE for word in h1_words):
            continue

        distinct_websites.add(get_base_url(row[0]))
        data.append(row)

print(len(distinct_websites))

286


### This method will be used for labeling the data automatically

I have decided to not label the title and the url last path in this version since they are quite inconsistent with the label I am looking for inside the text (i.e. the product name which is most of the time the h1 tag inside the text). That is what I consider the most accurate representation of the "product name" in the text.

In [108]:
def fuzzy_match(text_tokens, h1_tokens, similarity_threshold=80):
    """Find the longest contiguous run of ``text_tokens`` that fuzzily matches the h1.

    Every contiguous sub-sequence of ``text_tokens`` is joined with spaces and
    compared (lower-cased) against the space-joined, lower-cased ``h1_tokens``
    using ``fuzz.partial_ratio``. Note this is quadratic in ``len(text_tokens)``.

    Args:
        text_tokens: Tokens of the text to search in.
        h1_tokens: Tokens of the reference string.
        similarity_threshold: Minimum score a sub-sequence needs to be considered.

    Returns:
        ``(sub_sequence_text, similarity)`` for the longest (by character count)
        sub-sequence scoring at least ``similarity_threshold``; ties on length are
        broken by the higher score. None when nothing reaches the threshold.
    """
    # Loop-invariant: build the reference string once.
    h1_text = ' '.join(h1_tokens).lower()
    best_match = None  # (sub-sequence text, similarity)

    for i in range(len(text_tokens)):
        for j in range(i + 1, len(text_tokens) + 1):
            candidate = ' '.join(text_tokens[i:j])
            similarity = fuzz.partial_ratio(candidate.lower(), h1_text)
            if similarity < similarity_threshold:
                continue

            if best_match is None:
                best_match = (candidate, similarity)
                continue

            best_len = len(best_match[0])
            # Compare against the score of the match currently held. A separately
            # tracked maximum goes stale whenever a longer match takes over, which
            # let a worse equal-length candidate replace a better one.
            if len(candidate) > best_len or (len(candidate) == best_len and similarity > best_match[1]):
                best_match = (candidate, similarity)

    return best_match


def label_text(text, to_label):
    """Tag every exact (case-insensitive) occurrence of ``to_label`` in ``text``.

    Both strings are split on whitespace. Matching is done token by token,
    left to right, and matched spans do not overlap.

    Args:
        text: The text to tokenize and label.
        to_label: The phrase to look for.

    Returns:
        ``(tokens, labels)``: the whitespace tokens of ``text`` and a parallel list
        where the first token of each occurrence is ``'B-PRODUCT'``, the remaining
        tokens of the occurrence are ``'I-PRODUCT'`` and everything else is ``'O'``.
        When ``to_label`` has no tokens, all labels are ``'O'``.
    """
    tokens = text.split()
    # Initialize the labels list with 'O' for 'Outside'
    labels = ['O'] * len(tokens)

    # Lower-case the phrase once instead of on every loop iteration.
    target = [token.lower() for token in to_label.split()]
    span = len(target)
    # An empty phrase would "match" everywhere without advancing the cursor
    # (infinite loop), so there is nothing to label.
    if span == 0:
        return tokens, labels

    i = 0
    # A window that runs past the end of the text can never match, so stop early.
    while i + span <= len(tokens):
        if [token.lower() for token in tokens[i:i + span]] == target:
            labels[i] = 'B-PRODUCT'  # Mark the beginning of the label
            for j in range(1, span):
                labels[i + j] = 'I-PRODUCT'  # Mark the rest of the label tokens
            i += span  # Skip the tokens that were labeled
        else:
            i += 1

    return tokens, labels

def label_url_or_title(text, to_label):
    """Label the part of ``text`` that fuzzily matches ``to_label``.

    ``text`` and ``to_label`` are split on whitespace and handed to ``fuzzy_match``.
    The returned sub-sequence is labelled (``'B-PRODUCT'`` followed by
    ``'I-PRODUCT'``) at its first occurrence in the tokens, but only if it is at
    least three tokens long.

    Returns:
        ``(tokens, labels)``; all labels are ``'O'`` when there is no usable match.
    """
    tokens = text.split()
    labels = ['O'] * len(tokens)

    match_result = fuzzy_match(tokens, to_label.split())
    if match_result is None:
        return tokens, labels

    matched_tokens = match_result[0].split()
    span = len(matched_tokens)
    # We don't want small matches since they are wrong most of the time
    if span < 3:
        return tokens, labels

    # First position where the matched tokens appear in the main text
    start_index = next(
        (k for k in range(len(tokens) - span + 1) if tokens[k:k + span] == matched_tokens),
        None,
    )
    if start_index is not None:
        labels[start_index:start_index + span] = ['B-PRODUCT'] + ['I-PRODUCT'] * (span - 1)

    return tokens, labels



In [110]:
def _label_field(value, to_label, placeholder):
    """Label a short field (URL slug or page title), or fall back to ``placeholder``.

    Exact matching via ``label_text`` is tried first; if it labels nothing,
    ``label_url_or_title`` (fuzzy matching) is used instead. Missing, non-string,
    empty and whitespace-only values all yield ``([placeholder], ['O'])`` so the
    field is never an empty token list.
    """
    if not isinstance(value, str) or not value.strip():
        return [placeholder], ['O']
    tokens, labels = label_text(value, to_label)
    if all(label == 'O' for label in labels):
        tokens, labels = label_url_or_title(value, to_label)
    return tokens, labels


def tokenize_and_label(text, h1_tag_position, title, url_last_path, tokens_left=15, tokens_right=25, max_text_tokens=100):
    """Build one token/label sequence from a page's URL slug, title and body text.

    Args:
        text: Page text; exact occurrences of the h1 text are labelled in it.
        h1_tag_position: Sequence whose first element is the h1 text to label.
        title: Page title, or None/empty when missing.
        url_last_path: Last path segment of the URL, or None/empty when missing.
        tokens_left: Context tokens kept before the first labelled token.
        tokens_right: Context tokens kept after the last labelled token.
        max_text_tokens: Soft cap on the text window; the cut is pushed right so
            that it never falls inside a labelled span.

    Returns:
        ``(tokens, labels)`` of equal length, laid out as
        ``[URL] ... [URL] [TITLE] ... [TITLE] [TEXT] ... [TEXT]`` with the marker
        tokens labelled ``'O'``.

    Note:
        With probability 1/100 the title is replaced by ``<NO_TITLE>``. This uses
        the global ``random`` state, so seed it for reproducible output.
    """
    to_label = h1_tag_position[0]

    tokens_url_last_path, labels_url_last_path = _label_field(url_last_path, to_label, '<NO_URL>')
    tokens_title, labels_title = _label_field(title, to_label, '<NO_TITLE>')

    tokens_text, labels_text = label_text(text, to_label)

    # Span covered by labelled tokens; with no label at all, the whole text is the span.
    if 'B-PRODUCT' in labels_text:
        first_label_index = labels_text.index('B-PRODUCT')
    else:
        first_label_index = 0
    labelled_positions = [idx for idx, label in enumerate(labels_text) if label in ('B-PRODUCT', 'I-PRODUCT')]
    last_label_index = labelled_positions[-1] if labelled_positions else len(tokens_text) - 1

    # Calculate the window to slice
    start_index = max(0, first_label_index - tokens_left)
    end_index = min(len(tokens_text), last_label_index + tokens_right + 1)
    tokens_text = tokens_text[start_index:end_index]
    labels_text = labels_text[start_index:end_index]

    # Cap the window length, moving the cut right until it lands on an 'O' token.
    end_of_window = max_text_tokens
    while end_of_window < len(labels_text) and labels_text[end_of_window] != 'O':
        end_of_window += 1
    if end_of_window < len(labels_text):
        tokens_text = tokens_text[:end_of_window]
        labels_text = labels_text[:end_of_window]

    # Occasionally hide the title so the output also contains title-less examples.
    if random.randint(1, 100) == 1:
        tokens_title = ['<NO_TITLE>']
        labels_title = ['O']

    tokens = ['[URL]'] + tokens_url_last_path + ['[URL]', '[TITLE]'] + tokens_title + ['[TITLE]', '[TEXT]'] + tokens_text + ['[TEXT]']
    labels = ['O'] + labels_url_last_path + ['O', 'O'] + labels_title + ['O', 'O'] + labels_text + ['O']

    return tokens, labels


In [111]:
# Export one CSV row per page: url, space-joined tokens, space-joined labels.
with open('../data/100000_data_ready_for_training.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    for url, h1_tag_positions, title, url_last_path, page_text in data:
        tokens, labels = tokenize_and_label(page_text, h1_tag_positions, title, url_last_path)
        # Tokens and labels are each stored as a single space-separated string.
        writer.writerow([url, ' '.join(tokens), ' '.join(labels)])




In [101]:
# Read the exported training file back as (url, tokens, labels) triples.
preprocessed_data = []

with open('../data/100000_data_ready_for_training.csv', 'r', encoding='utf-8', newline='') as file:
    for url, tokens_str, labels_str in csv.reader(file):
        # Split on a single space (not on arbitrary whitespace): this is the exact
        # separator the strings were joined with.
        tokens = tokens_str.split(' ')
        labels = labels_str.split(' ')
        preprocessed_data.append((url, tokens, labels))

https://www.myharmony.hk/products/inns-bed ['Inns', 'Bed'] ['B-PRODUCT', 'I-PRODUCT']
https://www.myharmony.hk/products/inns-bed ['Bed'] ['I-PRODUCT']
https://www.popandscott.com/products/p-s-bed-double ['60s', 'Bed'] ['B-PRODUCT', 'I-PRODUCT']
https://www.popandscott.com/products/p-s-bed-double ['Bed'] ['I-PRODUCT']
https://mulamu.com/products/iedden-container ['DDEN', 'CONTAINER'] ['B-PRODUCT', 'I-PRODUCT']
https://mulamu.com/products/iedden-container ['CONTAINER'] ['I-PRODUCT']
https://shopspencerfurnituresiouxfalls.com/products/smile-pillow ['Smile', 'pillow'] ['B-PRODUCT', 'I-PRODUCT']
https://shopspencerfurnituresiouxfalls.com/products/smile-pillow ['pillow'] ['I-PRODUCT']
https://www.wardrobe-bunk-bed-sofa.uk/products/avant-functional-modern-corner-sofa-bed-footstool-drawer-pull-out ['AVATAR', '-', 'Functional', 'and', 'modern', 'corner', 'sofa', 'bed', 'with', 'FOOTSTOOL,', 'Drawer', 'and', 'pull', 'out', 'bed'] ['B-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



https://atouchoffurniture.co.uk/products/lundy-pine-painted-single-mirror ['Lundy', 'Pine', 'Painted', 'Single', 'Mirror'] ['B-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT']
https://atouchoffurniture.co.uk/products/lundy-pine-painted-single-mirror ['Pine', 'Painted', 'Single', 'Mirror'] ['I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT']
https://atouchoffurniture.co.uk/products/lundy-pine-painted-single-mirror ['Painted', 'Single', 'Mirror'] ['I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT']
https://atouchoffurniture.co.uk/products/lundy-pine-painted-single-mirror ['Single', 'Mirror'] ['I-PRODUCT', 'I-PRODUCT']
https://atouchoffurniture.co.uk/products/lundy-pine-painted-single-mirror ['Mirror'] ['I-PRODUCT']
https://www.greensladesfurniture.co.nz/products/concrete-round-side-table-46cm ['Round', 'Concrete', 'Outdoor', 'Side', 'Table'] ['B-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT', 'I-PRODUCT']
https://www.greensladesfurniture.co.nz/products/concrete-round-side-table-46cm ['C

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [93]:
# Sanity check: two strings that only share the word "stool" must not be labelled.
tokens, labels = label_url_or_title(
    text=' stool 2 in natural',
    to_label=' stool with woven leather',
)
print(tokens, labels)

['stool', '2', 'in', 'natural'] ['O', 'O', 'O', 'O']
