## In this notebook, all of the data preprocessing will be done.

In [75]:
 
# importing libraries

import csv
csv.field_size_limit(5000000)
import ast
import operator
import re
import threading
import random


import sys

import requests
from tqdm import tqdm 
from bs4 import BeautifulSoup

from concurrent.futures import ThreadPoolExecutor
import concurrent

from urllib.parse import urlparse
from urllib.parse import urljoin

import spacy # we use this for word similarity

from collections import defaultdict
import time


### A little bit of data preprocessing (i.e. manually removing data that is likely wrong by removing entries with fewer than 2 words in the title, with strange symbols, etc.)
By looking at the data manually I noticed what the wrong titles contain. The dataset will still be a bit noisy but I consider we have enough data to work with.
At the bottom of the code cell you can also see the number of distinct websites in the dataset just so you can get an idea of the diversity of the dataset. When I wrote the code it was 288 / 705 (the initial dataset). This should be enough diversity for the model to learn from.


In [76]:
# Accumulators filled by the loading loop later in this cell:
# one entry per kept row, and the base URL of every kept row.
distinct_websites = set()
data = []

# An h1 is rejected when any of its whitespace-separated words, lower-cased,
# appears in this list. Because the comparison is word by word, the multi-word
# entries ('contact us', ...) can never match on their own.
BAD_TEXT_PATTERNS_IN_TITLE = [
    'releases',
    'products', 'product',
    'collections', 'collection',
    'item',
    'personalization', 'personalize', 'personalized',
    'customize', 'customized', 'customise', 'customised',
    'shop', 'store', 'stores',
    'home', 'page', 'pages',
    'about',
    'contact', 'contact us', 'contact me', 'contact info',
]

def clean_text(s):
    """Delete every character outside a small whitelist and return the result.

    Kept characters: ASCII letters and digits, whitespace, and the punctuation
    , . : ; ' " ! ? ( ) - & +
    Anything else is removed outright (it is not replaced by a space).
    """
    # The character class is negated, so it matches exactly what must go.
    disallowed = re.compile(r"[^a-zA-Z0-9\s,.:;\'\"!?()\-&+]")
    return disallowed.sub('', s)

def get_base_url(url):
    """Return the ``scheme://netloc`` prefix of ``url``.

    Any exception raised while parsing or formatting is swallowed and
    reported as ``None``. A URL without a scheme or host is not treated as an
    error: its missing parts are simply rendered as empty strings.
    """
    try:
        parts = urlparse(url)
        return "{}://{}".format(parts.scheme, parts.netloc)
    except Exception:
        return None

def literal_eval(item):
    """Safely parse a parenthesised Python literal stored as text.

    Args:
        item: Raw cell value. Only strings that start with ``(`` and end with
            ``)`` are handed to ``ast.literal_eval``.

    Returns:
        The evaluated literal, or ``None`` when ``item`` is not such a string
        or cannot be evaluated. The result is not guaranteed to be a tuple:
        for example ``"(1)"`` evaluates to the integer ``1``.
    """
    # Non-string cells have no startswith/endswith and cannot be parsed.
    if not isinstance(item, str):
        return None

    # Cheap shape check so that ordinary text is never sent to the parser.
    if not (item.startswith('(') and item.endswith(')')):
        return None

    try:
        return ast.literal_eval(item)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # ast.literal_eval documents all of these for malformed input
        # (e.g. TypeError for an unhashable dict key such as "({[]: 1})").
        return None
    
def find_all_h1_positions(text, h1_tag):
    """Locate every non-overlapping occurrence of ``h1_tag`` inside ``text``.

    Args:
        text: String to search in.
        h1_tag: Exact (case-sensitive) substring to look for.

    Returns:
        A tuple ``(h1_tag, positions)`` where ``positions`` is a list of
        ``(start, end)`` character offsets with ``end`` exclusive, in
        left-to-right order. ``positions`` is empty when ``h1_tag`` is empty
        or does not occur.
    """
    positions = []

    # An empty needle matches at every index; treat it as "nothing to find".
    if not h1_tag:
        return h1_tag, positions

    tag_len = len(h1_tag)
    start_idx = text.find(h1_tag)
    while start_idx != -1:
        end_idx = start_idx + tag_len
        positions.append((start_idx, end_idx))
        # Resume exactly at the end of this match, so that an occurrence
        # starting on the very next character is not skipped.
        start_idx = text.find(h1_tag, end_idx)

    return h1_tag, positions


# Load the scraped rows, clean their text fields and keep only the rows whose
# h1 tag looks like a real product name.
# Expected columns: url, h1_tag_position, title, url_last_path, page_text.
with open('../data/preprocessed_data_from_all_sitemaps_100000.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        # Skip rows with the wrong number of columns: they would otherwise
        # raise IndexError here or break the 5-way unpacking done downstream.
        if len(row) != 5:
            continue

        h1_info = literal_eval(row[1])
        # The h1 column must parse to a tuple of at least three items whose
        # first item is the h1 text; anything else cannot be processed.
        if not isinstance(h1_info, tuple) or len(h1_info) < 3 or not isinstance(h1_info[0], str):
            continue

        h1_text = clean_text(h1_info[0])
        row[2] = clean_text(row[2])
        row[3] = clean_text(row[3])
        row[4] = clean_text(row[4])

        # Replace the raw h1 column with (cleaned h1 text, [(start, end), ...]).
        h1_tag, positions = find_all_h1_positions(row[4], h1_text)
        row[1] = (h1_tag, positions)

        # Drop the recurring "G Plan Chloe" pages. The test is done on the h1
        # text itself: testing membership in the (h1_tag, positions) tuple
        # would only match an h1 that is exactly equal to that string.
        if 'G Plan Chloe' in h1_tag:
            continue

        h1_words = h1_tag.split()

        # A product name is expected to have at least two words.
        if len(h1_words) < 2:
            continue

        # Reject h1 tags containing a word typical of listing/navigation pages.
        if any(word.lower() in BAD_TEXT_PATTERNS_IN_TITLE for word in h1_words):
            continue

        distinct_websites.add(get_base_url(row[0]))
        data.append(row)

print(len(distinct_websites))

288


### This method will be used for labeling the data automatically

I have decided to not label the title and the url last path in this version since they are quite inconsistent with the label I am looking for inside the text (i.e. the product name which is most of the time the h1 tag inside the text). That is what I consider the most accurate representation of the "product name" in the text.

In [80]:
def label_text(text, to_label):
    """Tag every occurrence of ``to_label`` inside ``text`` with BIO labels.

    Both strings are split on whitespace and compared token by token,
    ignoring case. Matches are found left to right and do not overlap.

    Args:
        text: Text to tokenise and label.
        to_label: Phrase whose occurrences should be marked.

    Returns:
        A tuple ``(tokens, labels)`` of equal-length lists. ``labels[i]`` is
        ``'B-PRODUCT'`` for the first token of a match, ``'I-PRODUCT'`` for
        the remaining tokens of that match and ``'O'`` everywhere else.
        When ``to_label`` contains no tokens, every label is ``'O'``.
    """
    tokens = text.split()
    labels = ['O'] * len(tokens)

    # Lower-case the phrase once instead of on every loop iteration.
    target = [tok.lower() for tok in to_label.split()]
    span = len(target)

    # Without this guard an empty phrase matches the empty slice at every
    # position while the index advances by zero, i.e. the loop never ends.
    if span == 0:
        return tokens, labels

    i = 0
    # A window that runs past the end of the text can never match.
    while i + span <= len(tokens):
        if [tok.lower() for tok in tokens[i:i + span]] == target:
            labels[i] = 'B-PRODUCT'
            labels[i + 1:i + span] = ['I-PRODUCT'] * (span - 1)
            i += span  # continue after the labelled phrase
        else:
            i += 1

    return tokens, labels


In [81]:
def tokenize_and_label(text, h1_tag_position, title, url_last_path, tokens_left=10, tokens_right=25):
    """Build one BIO-labelled token sequence from a page's URL path, title and text.

    Args:
        text: Page text; labelled and then cropped to a window around the
            labelled tokens.
        h1_tag_position: Indexable whose first element is the phrase to label.
        title: Page title; ``None``, ``''`` or a non-string becomes the single
            token ``<NO_TITLE>``.
        url_last_path: Last URL path segment; ``None``, ``''`` or a non-string
            becomes the single token ``<NO_URL>``.
        tokens_left: Number of text tokens kept before the first ``B-PRODUCT``.
        tokens_right: Number of text tokens kept after the last product token.

    Returns:
        ``(tokens, labels)``: two lists of equal length laid out as
        ``[URL] ... [URL] [TITLE] ... [TITLE] [TEXT] ... [TEXT]``; the marker
        tokens are always labelled ``'O'``. When the text contains no product
        label, the whole text is kept.

    NOTE(review): the notebook prose says the title and URL path are not
    labelled in this version, yet both are passed through ``label_text``
    here - confirm which one is intended.
    """
    to_label = h1_tag_position[0]

    def _label_field(value, placeholder):
        # Only non-empty strings are tokenised; everything else collapses to
        # a single placeholder token labelled 'O'.
        if isinstance(value, str) and value != '':
            return label_text(value, to_label)
        return [placeholder], ['O']

    tokens_url_last_path, labels_url_last_path = _label_field(url_last_path, '<NO_URL>')
    tokens_title, labels_title = _label_field(title, '<NO_TITLE>')
    tokens_text, labels_text = label_text(text, to_label)

    # First 'B-PRODUCT' (or the start of the text when there is none).
    first_label_index = next(
        (idx for idx, tag in enumerate(labels_text) if tag == 'B-PRODUCT'),
        0,
    )
    # Last product token (or the last token of the text when there is none).
    last_label_index = max(
        (idx for idx, tag in enumerate(labels_text) if tag in ['B-PRODUCT', 'I-PRODUCT']),
        default=len(tokens_text) - 1,
    )

    # Crop the text to the context window, clamped to the text boundaries.
    window = slice(
        max(0, first_label_index - tokens_left),
        min(len(tokens_text), last_label_index + tokens_right + 1),
    )
    tokens_text = tokens_text[window]
    labels_text = labels_text[window]

    tokens = [
        '[URL]', *tokens_url_last_path, '[URL]',
        '[TITLE]', *tokens_title, '[TITLE]',
        '[TEXT]', *tokens_text, '[TEXT]',
    ]
    labels = [
        'O', *labels_url_last_path, 'O',
        'O', *labels_title, 'O',
        'O', *labels_text, 'O',
    ]

    return tokens, labels


In [83]:
# Write one CSV line per cleaned row: url, space-joined tokens, space-joined labels.
with open('../data/100000_data_ready_for_training.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    for row in data:
        url, h1_tag_positions, title, url_last_path, page_text = row

        tokens, labels = tokenize_and_label(
            text=page_text,
            h1_tag_position=h1_tag_positions,
            title=title,
            url_last_path=url_last_path,
        )
        # Tokens and labels are each flattened into one space-separated field.
        tokens_str, labels_str = ' '.join(tokens), ' '.join(labels)
        writer.writerow([url, tokens_str, labels_str])




In [46]:
# Read the training CSV back into a list of (url, tokens, labels) triples.
preprocessed_data = []

with open('../data/100000_data_ready_for_training.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        url, tokens_str, labels_str = row
        # split(' ') is the exact inverse of ' '.join(...): unlike a bare
        # split(), it does not merge consecutive separators.
        tokens, labels = (field.split(' ') for field in (tokens_str, labels_str))
        preprocessed_data.append((url, tokens, labels))

In [48]:
# Sanity check: print the cleaned row at index 9 next to the entry at index 9
# read back from the training CSV.
# NOTE(review): the two only describe the same page if the CSV on disk was
# written from the current `data` - confirm the cells were run in order.
print(data[9])
print(preprocessed_data[9])


['https://mulamu.com/products/iedden-container', ('DDEN CONTAINER', [(1193, 1207), (1208, 1222)]), 'DDEN CONTAINER - Mulamu Furnishings', 'iedden container', 'MEMBERS ENJOY A 10 DISCOUNT! Be A Member Now Menu Cart FURNITURE ARMCHAIRS BED FRAMES Bed side tables BENCH COFFEE TABLES CONSOLES DINING CHAIRS DINING TABLES OUTDOOR FURNITURE STOOLS & BAR STOOLS STORAGE & SHELF STUDY DESKS SOFAS OTHERS customization order IN-STOCK BED ACCS  MATTRESS BED ACCESSORIES DOUBLE ELEPHANT MATTRESS Sofzsleep mattress OTHER DECOS LIGHTS  FANS ALL LIGHTING ALL FANS PENDANT LIGHTS FLOOR LAMP TABLE LAMP Sale Clearance SALE DISPLAY ASIS Furniture Rental OTHERS My Account Continue Shopping P.S. Would you like to purchase our lifetime membership and instantly enjoy 10 off your current cart and all future purchases? Click here to find out more! Your Cart is Empty MEMBERS ENJOY A 10 DISCOUNT! Be A Member Now FURNITURE  ARMCHAIRS BED FRAMES Bed side tables BENCH COFFEE TABLES CONSOLES DINING CHAIRS DINING TABLES 