# Import

In [1]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import re
import logging
import os
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
import nltk

In [2]:
def clean_text(text):
    # Removing unwanted characters like ¶
    text = text.replace('¶', '')
    return text


def read_local_html(file_path):
    if not os.path.exists(file_path):
        logging.warning(f"File not found: {file_path}")
        return ''
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def fetch_html_from_url(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    response.raise_for_status()
    return response.text


def get_path(html_content, h='h1'):
    html_content.split(f'</{h}>', 0)
    soup = BeautifulSoup(html_content.split(f'</{h}>')[0], 'html.parser')
    h_text = soup.find(f'{h}').text.replace("¶", "").replace("Â", " ")
    return '/' + h_text


def scrape_content_from_page(html_content, source, file_path):
    h1_split = html_content.split('<h1')
    list_split = list()
    list_path = list()
    list_source = list()  # For storing the documentation source
    list_file_path = list()
    path = ''

    for i, h1 in enumerate(h1_split):
        if i > 0:
            h1 = '<h1' + h1
            result = path.split('/')[:1]
            path = '/'.join(result)
            path += get_path(h1, h='h1')

        h2_split = h1.split('<h2')
        for j, h2 in enumerate(h2_split):
            if j > 0:
                h2 = '<h2' + h2
                result = path.split('/')[:2]
                path = '/'.join(result)
                path += get_path(h2, h='h2')
            h3_split = h2.split('<h3')
            for k, h3 in enumerate(h3_split):
                if k > 0:
                    h3 = '<h3' + h3
                    result = path.split('/')[:3]
                    path = '/'.join(result)
                    path += get_path(h3, h='h3')

                h4_split = h3.split('<h4')
                for l, h4 in enumerate(h4_split):
                    if l > 0:
                        h4 = '<h4' + h4
                        result = path.split('/')[:4]
                        path = '/'.join(result)
                        path += get_path(h4, h='h4')

                    h5_split = h4.split('<h5')
                    # Look for href in the html content to add it to the link
                    regex_pattern = r'<h[1-6](.*?)<\/h[1-6]>'
                    header = re.search(regex_pattern, h5_split[0])
                    if header:
                        regex_pattern = r'href="#([^"]*)"'
                        match_href = re.search(regex_pattern, header.group(1))
                        regex_pattern = r'id="([^"]*)"'
                        match_id = re.search(regex_pattern, header.group(1))
                        if match_href:
                            href = '#' + match_href.group(1)
                        elif match_id:
                            href = '#' + match_id.group(1)
                        else:
                            href = ''
                    else:
                        href = ''
                    # get the text from html content
                    soup = BeautifulSoup(h5_split[0].split('\n', 1)[1], "lxml")
                    h5_split = soup.get_text()
                    list_split.append(h5_split)
                    list_path.append(path)
                    list_source.append(source)  # Add the documentation source for each content
                    list_file_path.append(f'{file_path}{href}')
    dataframe = pd.DataFrame()
    dataframe['documentation'] = list_source  # Add the documentation source column
    dataframe['path'] = list_path
    dataframe['text'] = list_split
    dataframe['file_path'] = list_file_path  # adding local path to the df for each row
    return dataframe


def process_links(site_config):
    index_content = fetch_html_from_url(site_config['index_path'])
    soup = BeautifulSoup(index_content, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.html')]

    logging.info(f"Found {len(links)} links to process for {site_config['documentation']}.")

    data_frames = []
    for link in links:
        full_url = site_config['base_url'] + link  # Construct the full URL
        try:
            page_content = fetch_html_from_url(full_url)
            df_temp = scrape_content_from_page(page_content, site_config['documentation'], full_url)

            # Check if df_temp is not empty or None
            if df_temp is not None and not df_temp.empty:
                data_frames.append(df_temp)
            else:
                logging.warning(f"Empty dataframe returned from {full_url}.")

        except Exception as e:
            logging.error(f"Error processing {full_url}: {e}")

    logging.info(f"Processed {len(data_frames)} dataframes for {site_config['documentation']}.")
    return data_frames


def process_links_from_list(links, documentation_label):
    logging.info(f"Found {len(links)} links to process for {documentation_label}.")

    data_frames = []
    for link in links:
        try:
            page_content = fetch_html_from_url(link)
            df_temp = scrape_content_from_page(page_content, documentation_label, link)

            # Check if df_temp is not empty or None
            if df_temp is not None and not df_temp.empty:
                data_frames.append(df_temp)
            else:
                logging.warning(f"Empty dataframe returned from {link}.")

        except Exception as e:
            logging.error(f"Error processing {link}: {e}")

    logging.info(f"Processed {len(data_frames)} dataframes for {documentation_label}.")
    return data_frames

def get_all_links(base_url):
    # Send a GET request to the base URL
    response = requests.get(base_url)
    response.raise_for_status()  # Raise an exception for HTTP errors

    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all anchor tags
    anchors = soup.find_all('a', href=True)

    # Convert relative URLs and fragments to absolute URLs
    absolute_urls = [urljoin(base_url, a['href']) for a in anchors]

    return absolute_urls

# Set up 

In [3]:
# Configuration
config = {
    'site1': {
        'index_path': "http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/administration-manual-html/administration-manual-html/index.html",
        'base_path': "http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/administration-manual-html/administration-manual-html/",
        'documentation': 'administration-manual'
    },
    'site2': {
        'index_path': "http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/user-manual-html/user-manual-html/index.html",
        'base_path': "http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/user-manual-html/user-manual-html/",
        'documentation': 'user-manual'
    }
}

# Get links

In [4]:
site_urls = [details['index_path'] for details in config.values()]

In [5]:
site_urls

['http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/administration-manual-html/administration-manual-html/index.html',
 'http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/user-manual-html/user-manual-html/index.html']

In [6]:
a_urls = get_all_links(site_urls[0])
u_urls = get_all_links(site_urls[1])

In [7]:
a_urls[1], u_urls[1]

('http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/administration-manual-html/administration-manual-html/index.html',
 'http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/user-manual-html/user-manual-html/index.html')

# Scrape the Content and Store in a DataFrame:

In [8]:
df1 = process_links_from_list(a_urls, "Administration Manual")

In [9]:
df2 = process_links_from_list(u_urls, "User Manual")

In [10]:
df1 = pd.concat(df1, ignore_index=True)
df2 = pd.concat(df2, ignore_index=True)

In [11]:
df = pd.concat([df1, df2], ignore_index=True)

In [12]:
df

Unnamed: 0,documentation,path,text,file_path
0,Administration Manual,,\n\nSpace Systems | Kratos\n\n\n\n\n\n\n\n\n\...,http://www.kratoscomms.com/
1,Administration Manual,/Leading Space Networks into the Future,\n,http://www.kratoscomms.com/
2,Administration Manual,/Revolutionary Satellite Ground Solutions,,http://www.kratoscomms.com/
3,Administration Manual,/Revolutionary Satellite Ground Solutions/Guid...,\n,http://www.kratoscomms.com/
4,Administration Manual,/Revolutionary Satellite Ground Solutions/Guid...,\nMainstreaming satellite operations to connec...,http://www.kratoscomms.com/
...,...,...,...,...
4284,User Manual,/FAQ/Is the data processed between the source ...,By default there is no process or filter appli...,http://192.168.48.22:8082/repository/skyminer-...
4285,User Manual,/FAQ/How much data can be displayed on Skyminer?,By default there are 3 safeguards for the size...,http://192.168.48.22:8082/repository/skyminer-...
4286,User Manual,/FAQ/How can I export data from Skyminer?,You can export the data to a JSON or CSV file ...,http://192.168.48.22:8082/repository/skyminer-...
4287,User Manual,/FAQ/How do I align time series to time bounda...,Using aggregators: this can be achieved by sel...,http://192.168.48.22:8082/repository/skyminer-...


# Clean data

In [13]:
# Apply this function to the entire 'content' column
df['text'] = df['text'].apply(clean_text)
df

Unnamed: 0,documentation,path,text,file_path
0,Administration Manual,,\n\nSpace Systems | Kratos\n\n\n\n\n\n\n\n\n\...,http://www.kratoscomms.com/
1,Administration Manual,/Leading Space Networks into the Future,\n,http://www.kratoscomms.com/
2,Administration Manual,/Revolutionary Satellite Ground Solutions,,http://www.kratoscomms.com/
3,Administration Manual,/Revolutionary Satellite Ground Solutions/Guid...,\n,http://www.kratoscomms.com/
4,Administration Manual,/Revolutionary Satellite Ground Solutions/Guid...,\nMainstreaming satellite operations to connec...,http://www.kratoscomms.com/
...,...,...,...,...
4284,User Manual,/FAQ/Is the data processed between the source ...,By default there is no process or filter appli...,http://192.168.48.22:8082/repository/skyminer-...
4285,User Manual,/FAQ/How much data can be displayed on Skyminer?,By default there are 3 safeguards for the size...,http://192.168.48.22:8082/repository/skyminer-...
4286,User Manual,/FAQ/How can I export data from Skyminer?,You can export the data to a JSON or CSV file ...,http://192.168.48.22:8082/repository/skyminer-...
4287,User Manual,/FAQ/How do I align time series to time bounda...,Using aggregators: this can be achieved by sel...,http://192.168.48.22:8082/repository/skyminer-...


## Remove the empty text

In [14]:
# Remove rows where the text is empty
df = df[df['text'].notnull() & (df['text'].str.len() > 0)]
# Remove rows where the text is empty or the path is an empty string
df = df[(df['path'].notnull()) & (df['path'] != "") & (df['text'].str.len() > 0)]
# Remove rows where path equals "/Table Of Contents"
df = df[df['path'] != '/Table Of Contents']
# Remove duplicates based on the 'text' column
df = df.drop_duplicates(subset='text', keep='first')
# Find rows with duplicate content
duplicates = df[df.duplicated(subset='text', keep=False)]
duplicates

Unnamed: 0,documentation,path,text,file_path


## Remove doc with not enough words 

In [15]:
# Add a new column for word counts
nltk.download('punkt')

def count_tokens(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Count the number of tokens
    num_tokens = len(tokens)
    return num_tokens

# Example usage:
df['token_count'] = df['text'].apply(count_tokens)
df


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bruno.Pinos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,documentation,path,text,file_path,token_count
1,Administration Manual,/Leading Space Networks into the Future,\n,http://www.kratoscomms.com/,0
4,Administration Manual,/Revolutionary Satellite Ground Solutions/Guid...,\nMainstreaming satellite operations to connec...,http://www.kratoscomms.com/,10
5,Administration Manual,/Revolutionary Satellite Ground Solutions/Guid...,"\nAdvancing secure, resilient communication cr...",http://www.kratoscomms.com/,9
6,Administration Manual,/Revolutionary Satellite Ground Solutions/Guid...,\nCreating infrastructure and systems to suppo...,http://www.kratoscomms.com/,10
7,Administration Manual,/Technology for the Satellite Ground Ecosystem,Kratos develops advanced satellite communicati...,http://www.kratoscomms.com/,16
...,...,...,...,...,...
3939,User Manual,/Skyminer Time Series Python Connector/Modules...,The Bin grouper groups data point values into ...,http://192.168.48.22:8082/repository/skyminer-...,108
3940,User Manual,/Skyminer Time Series Python Connector/Modules...,You can group results by specifying one or mor...,http://192.168.48.22:8082/repository/skyminer-...,100
3941,User Manual,/Skyminer Time Series Python Connector/Modules...,The time grouper groups results by time ranges...,http://192.168.48.22:8082/repository/skyminer-...,152
3942,User Manual,/Skyminer Time Series Python Connector/Modules...,The value grouper groups by data point values....,http://192.168.48.22:8082/repository/skyminer-...,76


In [16]:
df = df.sort_values(by='token_count', ascending=False)
lower_bound = 11  # or any other value based on your visualization or requirements
df = df[df['token_count'] > lower_bound]
df

Unnamed: 0,documentation,path,text,file_path,token_count
2045,Administration Manual,/Skyminer document Module/Skyminer document mo...,\n\nGET /doc\ndescribes the document backend i...,http://192.168.48.22:8082/repository/skyminer-...,3704
3463,User Manual,/Predictors/Advanced/DLM,All the implemented models are designed to ext...,http://192.168.48.22:8082/repository/skyminer-...,1537
1667,Administration Manual,/OpenAPI,\n\nPOST /skyminer/correlations/search\nSubmit...,http://192.168.48.22:8082/repository/skyminer-...,1460
1969,Administration Manual,/Skyminer Time Series Indexer Module/Skyminer ...,\n\nGET /skyminer/ts-index/enable\nEnable the ...,http://192.168.48.22:8082/repository/skyminer-...,1080
1759,Administration Manual,/Features/Response,Success\nReturns 200 when successful.\n{\n ...,http://192.168.48.22:8082/repository/skyminer-...,858
...,...,...,...,...,...
1688,Administration Manual,/Aggregators/Count,count\n\nCounts the number of data points.\nEx...,http://192.168.48.22:8082/repository/skyminer-...,12
1872,Administration Manual,/Skyminer Time Series Indexer Module API/Rebui...,Start a job to rebuild the index entirely for ...,http://192.168.48.22:8082/repository/skyminer-...,12
3308,User Manual,/Reporting/Overview/How to use skyminer BIRT R...,"To use Skyminer BIRT Reports, start the BIRT R...",http://192.168.48.22:8082/repository/skyminer-...,12
3888,User Manual,/Skyminer extensions/Skyminer WEBUI integratio...,The address of the server is saved in a variab...,http://192.168.48.22:8082/repository/skyminer-...,12


## Split big documents

In [17]:
upper_bound = 728

text_splitter = RecursiveCharacterTextSplitter(
    # Set a tiny chunk size, just to show.
    chunk_size=upper_bound,
    chunk_overlap=40,
    length_function=len,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    is_separator_regex=False
)

# Loop through the DataFrame and split text if necessary
final_df = df.copy()
new_rows = list()
for index, row in df.iterrows():
    if row['token_count'] > upper_bound:
        new_texts = text_splitter.create_documents([row['text']])
        for new_text in new_texts:
            new_text = new_text.page_content
            new_row = {'documentation': row['documentation'],
                       'path': row['path'],
                       'file_path': row['file_path'],
                       'text': new_text,
                       'token_count': len(new_text.split())}
            new_rows.append(new_row)
            
# Concatenate the new rows to the DataFrame
final_df = pd.concat([final_df, pd.DataFrame(new_rows)], ignore_index=True, sort=False)
final_df = final_df[final_df['token_count'] < upper_bound]
final_df

Unnamed: 0,documentation,path,text,file_path,token_count
6,Administration Manual,/Query Metrics/Metric Properties,\nnameThe name of the metric(s) to return data...,http://192.168.48.22:8082/repository/skyminer-...,726
7,Administration Manual,/Skyminer Administration Guide,This documentation contains the Skyminer admin...,http://192.168.48.22:8082/repository/skyminer-...,698
8,User Manual,/Metrics/Description,\n\n\n\nThe + button allows you to add a new e...,http://192.168.48.22:8082/repository/skyminer-...,657
9,Administration Manual,/Mapped directories (volumes)/List of mapped v...,\nSkyminer\n\n<installation_directory>/skymine...,http://192.168.48.22:8082/repository/skyminer-...,622
10,Administration Manual,/Correlations API/Correlations Search API/Result,The result is divided into two parts:\n\nquery...,http://192.168.48.22:8082/repository/skyminer-...,594
...,...,...,...,...,...
625,User Manual,/Skyminer Time Series Python Connector/Example...,4 1563814800000 346524.700000 [skyminer_cl...,http://192.168.48.22:8082/repository/skyminer-...,50
626,User Manual,/Skyminer Time Series Python Connector/Example...,9 1563832800000 321140.000000 [skyminer_cl...,http://192.168.48.22:8082/repository/skyminer-...,50
627,User Manual,/Skyminer Time Series Python Connector/Example...,14 1563850800000 305792.500000 [skyminer_cl...,http://192.168.48.22:8082/repository/skyminer-...,50
628,User Manual,/Skyminer Time Series Python Connector/Example...,19 1563868800000 373827.616667 [skyminer_cl...,http://192.168.48.22:8082/repository/skyminer-...,20


In [18]:
has_nan_or_none = df['path'].isnull().any()
print(f"Has NaN or None values in 'path': {has_nan_or_none}")
count_nan_or_none = df['path'].isnull().sum()
print(f"Number of NaN or None values in 'path': {count_nan_or_none}")


Has NaN or None values in 'path': False
Number of NaN or None values in 'path': 0


In [19]:
has_empty_string = (df['path'] == "").any()
print(f"Has empty strings in 'path': {has_empty_string}")
count_empty_string = (df['path'] == "").sum()
print(f"Number of empty strings in 'path': {count_empty_string}")

Has empty strings in 'path': False
Number of empty strings in 'path': 0
