# Import

In [59]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index import SimpleDirectoryReader
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import re
import logging
import os
import requests
import nltk
from llama_index import Document
import pickle

# Add a new column for word counts
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bruno.Pinos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [152]:
def count_tokens_html(h):
    soup = BeautifulSoup(h.split('\n', 1)[1], "lxml")
    h = soup.get_text()
    tokens = nltk.word_tokenize(h)
    num_tokens = len(tokens)
    return num_tokens

def count_tokens(text):
    tokens = nltk.word_tokenize(text)
    num_tokens = len(tokens)
    return num_tokens


def clean_text(text):
    # Removing unwanted characters like ¶
    text = text.replace('¶', '')
    return text


def read_local_html(file_path):
    if not os.path.exists(file_path):
        logging.warning(f"File not found: {file_path}")
        return ''
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def fetch_html_from_url(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    response.raise_for_status()
    return response.text


def get_path(html_content, h='h1'):
    soup = BeautifulSoup(html_content.split(f'</{h}>')[0], 'html.parser')
    h_text = soup.find(f'{h}').text.replace("¶", "").replace("Â", " ")
    return '/' + h_text
    
    
def scrape_content_from_page(html_content, source, file_path):
    h1_split = html_content.split('<h1')
    list_split = list()
    list_path = list()
    list_source = list()  # For storing the documentation source
    list_file_path = list()
    
    def get_html(h):
        # Look for href in the html content to add it to the link
        regex_pattern = r'<h[1-6](.*?)<\/h[1-6]>'
        header = re.search(regex_pattern, h)
        if header:
            regex_pattern = r'href="#([^"]*)"'
            match_href = re.search(regex_pattern, header.group(1))
            regex_pattern = r'id="([^"]*)"'
            match_id = re.search(regex_pattern, header.group(1))
            if match_href:
                href = '#' + match_href.group(1)
            elif match_id:
                href = '#' + match_id.group(1)
            else:
                href = ''
        else:
            href = ''
        # get the text from html content
        soup = BeautifulSoup(h.split('\n', 1)[1], "lxml")
        h = soup.get_text()
        list_split.append(h)
        list_path.append(path)
        list_source.append(source)  # Add the documentation source for each content
        list_file_path.append(f'{file_path}{href}')
        
    path = ''
    for i, h1 in enumerate(h1_split):
        if i > 0:
            h1 = '<h1' + h1
            result = path.split('/')[:1]
            path = '/'.join(result)
            path += get_path(h1, h='h1')
        if count_tokens_html(h1) <= 2048:
            get_html(h1)
        else:        
            h2_split = h1.split('<h2')
            for j, h2 in enumerate(h2_split):
                if j > 0:
                    h2 = '<h2' + h2
                    result = path.split('/')[:2]
                    path = '/'.join(result)
                    path += get_path(h2, h='h2')
                if count_tokens_html(h2) <= 2048:
                    get_html(h2)
                else:
                    h3_split = h2.split('<h3')
                    for k, h3 in enumerate(h3_split):
                        if k > 0:
                            h3 = '<h3' + h3
                            result = path.split('/')[:3]
                            path = '/'.join(result)
                            path += get_path(h3, h='h3')
                        if count_tokens_html(h3) <= 2048:
                            get_html(h3)
                        else:
                            h4_split = h3.split('<h4')
                            for l, h4 in enumerate(h4_split):
                                if l > 0:
                                    h4 = '<h4' + h4
                                    result = path.split('/')[:4]
                                    path = '/'.join(result)
                                    path += get_path(h4, h='h4')
                                get_html(h4)
    dataframe = pd.DataFrame()
    dataframe['documentation'] = list_source  # Add the documentation source column
    dataframe['path'] = list_path
    dataframe['text'] = list_split
    dataframe['file_path'] = list_file_path  # adding local path to the df for each row
    return dataframe


def process_links(site_config):
    index_content = fetch_html_from_url(site_config['index_path'])
    soup = BeautifulSoup(index_content, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.html')]

    logging.info(f"Found {len(links)} links to process for {site_config['documentation']}.")

    data_frames = []
    for link in links:
        full_url = site_config['base_url'] + link  # Construct the full URL
        try:
            page_content = fetch_html_from_url(full_url)
            df_temp = scrape_content_from_page(page_content, site_config['documentation'], full_url)

            # Check if df_temp is not empty or None
            if df_temp is not None and not df_temp.empty:
                data_frames.append(df_temp)
            else:
                logging.warning(f"Empty dataframe returned from {full_url}.")

        except Exception as e:
            logging.error(f"Error processing {full_url}: {e}")

    logging.info(f"Processed {len(data_frames)} dataframes for {site_config['documentation']}.")
    return data_frames


def process_links_from_list(links, documentation_label):
    logging.info(f"Found {len(links)} links to process for {documentation_label}.")

    data_frames = []
    for link in links:
        try:
            page_content = fetch_html_from_url(link)
            df_temp = scrape_content_from_page(page_content, documentation_label, link)

            # Check if df_temp is not empty or None
            if df_temp is not None and not df_temp.empty:
                data_frames.append(df_temp)
            else:
                logging.warning(f"Empty dataframe returned from {link}.")

        except Exception as e:
            logging.error(f"Error processing {link}: {e}")

    logging.info(f"Processed {len(data_frames)} dataframes for {documentation_label}.")
    return data_frames

def get_all_links(base_url):
    # Send a GET request to the base URL
    response = requests.get(base_url)
    response.raise_for_status()  # Raise an exception for HTTP errors

    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all anchor tags
    anchors = soup.find_all('a', href=True)

    # Convert relative URLs and fragments to absolute URLs
    absolute_urls = [urljoin(base_url, a['href']) for a in anchors]

    return absolute_urls

# Set up 

In [61]:
# Configuration
config = {
    'site1': {
        'index_path': "http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/administration-manual-html/administration-manual-html/index.html",
        'base_path': "http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/administration-manual-html/administration-manual-html/",
        'documentation': 'administration-manual'
    },
    'site2': {
        'index_path': "http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/user-manual-html/user-manual-html/index.html",
        'base_path': "http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/user-manual-html/user-manual-html/",
        'documentation': 'user-manual'
    }
}

# Get links

In [62]:
site_urls = [details['index_path'] for details in config.values()]

In [63]:
site_urls

['http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/administration-manual-html/administration-manual-html/index.html',
 'http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/user-manual-html/user-manual-html/index.html']

In [113]:
a_urls = get_all_links(site_urls[0])
a_urls = [url for url in a_urls if url.startswith('http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/')]
u_urls = get_all_links(site_urls[1])
u_urls = [url for url in u_urls if url.startswith('http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/')]

In [114]:
a_urls[1], u_urls[1]

('http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/administration-manual-html/administration-manual-html/index.html',
 'http://192.168.48.22:8082/repository/skyminer-dev/skyminer-documentation/latest/user-manual-html/user-manual-html/index.html')

# Scrape the Content and Store in a DataFrame:

In [153]:
df1 = process_links_from_list(a_urls, "Administration Manual")

In [154]:
df2 = process_links_from_list(u_urls, "User Manual")

In [155]:
df1 = pd.concat(df1, ignore_index=True)
df2 = pd.concat(df2, ignore_index=True)

In [156]:
df = pd.concat([df1, df2], ignore_index=True)

In [157]:
df

Unnamed: 0,documentation,path,text,file_path
0,Administration Manual,,\n\n\n\nSkyminer Administration Guide — Skymin...,http://192.168.48.22:8082/repository/skyminer-...
1,Administration Manual,/Skyminer Administration Guide,This documentation contains the Skyminer admin...,http://192.168.48.22:8082/repository/skyminer-...
2,Administration Manual,,\n\n\n\nSkyminer Administration Guide — Skymin...,http://192.168.48.22:8082/repository/skyminer-...
3,Administration Manual,/Skyminer Administration Guide,This documentation contains the Skyminer admin...,http://192.168.48.22:8082/repository/skyminer-...
4,Administration Manual,,\n\n\n\nNotices regarding this document — Skym...,http://192.168.48.22:8082/repository/skyminer-...
...,...,...,...,...
952,User Manual,/FAQ,\nWhat is Skyminer?¶\nSkyminer system is a Big...,http://192.168.48.22:8082/repository/skyminer-...
953,User Manual,,\n\n\n\nFAQ — Skyminer User Manual - KC-153-UM...,http://192.168.48.22:8082/repository/skyminer-...
954,User Manual,/FAQ,\nWhat is Skyminer?¶\nSkyminer system is a Big...,http://192.168.48.22:8082/repository/skyminer-...
955,User Manual,,\n\n\n\nFAQ — Skyminer User Manual - KC-153-UM...,http://192.168.48.22:8082/repository/skyminer-...


# Clean data

In [158]:
# Apply this function to the entire 'content' column
df['text'] = df['text'].apply(clean_text)
df

Unnamed: 0,documentation,path,text,file_path
0,Administration Manual,,\n\n\n\nSkyminer Administration Guide — Skymin...,http://192.168.48.22:8082/repository/skyminer-...
1,Administration Manual,/Skyminer Administration Guide,This documentation contains the Skyminer admin...,http://192.168.48.22:8082/repository/skyminer-...
2,Administration Manual,,\n\n\n\nSkyminer Administration Guide — Skymin...,http://192.168.48.22:8082/repository/skyminer-...
3,Administration Manual,/Skyminer Administration Guide,This documentation contains the Skyminer admin...,http://192.168.48.22:8082/repository/skyminer-...
4,Administration Manual,,\n\n\n\nNotices regarding this document — Skym...,http://192.168.48.22:8082/repository/skyminer-...
...,...,...,...,...
952,User Manual,/FAQ,\nWhat is Skyminer?\nSkyminer system is a Big ...,http://192.168.48.22:8082/repository/skyminer-...
953,User Manual,,\n\n\n\nFAQ — Skyminer User Manual - KC-153-UM...,http://192.168.48.22:8082/repository/skyminer-...
954,User Manual,/FAQ,\nWhat is Skyminer?\nSkyminer system is a Big ...,http://192.168.48.22:8082/repository/skyminer-...
955,User Manual,,\n\n\n\nFAQ — Skyminer User Manual - KC-153-UM...,http://192.168.48.22:8082/repository/skyminer-...


## Remove the empty text

In [159]:
# Remove rows where the text is empty
df = df[df['text'].notnull() & (df['text'].str.len() > 0)]
# Remove rows where the text is empty or the path is an empty string
df = df[(df['path'].notnull()) & (df['path'] != "") & (df['text'].str.len() > 0)]
# Remove rows where path equals "/Table Of Contents"
df = df[df['path'] != '/Table Of Contents']
# Remove duplicates based on the 'text' column
df = df.drop_duplicates(subset='text', keep='first')

## Remove doc with not enough words 

In [160]:
# Example usage:
df['token_count'] = df['text'].apply(count_tokens)
df


Unnamed: 0,documentation,path,text,file_path,token_count
1,Administration Manual,/Skyminer Administration Guide,This documentation contains the Skyminer admin...,http://192.168.48.22:8082/repository/skyminer-...,698
5,Administration Manual,/Notices regarding this document,The information contained in this document is ...,http://192.168.48.22:8082/repository/skyminer-...,211
7,Administration Manual,/Skyminer Introduction,Skyminer system is a Big Data storage and anal...,http://192.168.48.22:8082/repository/skyminer-...,429
9,Administration Manual,/Installation,Contents:\n\n\nSkyminer Architecture\nIntroduc...,http://192.168.48.22:8082/repository/skyminer-...,122
11,Administration Manual,/Configuration,Contents:\n\n\nSharing Resources between diffe...,http://192.168.48.22:8082/repository/skyminer-...,115
...,...,...,...,...,...
888,User Manual,/Skyminer Time Series Python Connector,\nSkyminer Time Series Python Connector is a P...,http://192.168.48.22:8082/repository/skyminer-...,36
889,User Manual,/Skyminer Time Series Python Connector/Install,\npip install httplib2 pandas matplotlib\nCopy...,http://192.168.48.22:8082/repository/skyminer-...,69
890,User Manual,/Skyminer Time Series Python Connector/Quickstart,\nGet the datapoints of the last hour\nfrom Sk...,http://192.168.48.22:8082/repository/skyminer-...,186
891,User Manual,/Skyminer Time Series Python Connector/Modules,\nSTSAPI\nSTSAPI is the interface with the Sky...,http://192.168.48.22:8082/repository/skyminer-...,1463


In [161]:
df = df.sort_values(by='token_count', ascending=False)
lower_bound = 19  # or any other value based on your visualization or requirements
df = df[df['token_count'] > lower_bound]
df

Unnamed: 0,documentation,path,text,file_path,token_count
547,Administration Manual,/Skyminer document Module/Skyminer document mo...,\n\nGET /doc\ndescribes the document backend i...,http://192.168.48.22:8082/repository/skyminer-...,3704
528,Administration Manual,/Skyminer Time Series Indexer Module,The time series indexer module uses Opensearch...,http://192.168.48.22:8082/repository/skyminer-...,1812
750,User Manual,/User Interface,\nOverview\nThe correlation UI is a web client...,http://192.168.48.22:8082/repository/skyminer-...,1772
758,User Manual,/Correlation API,\nCorrelation Search API\nThe web client allow...,http://192.168.48.22:8082/repository/skyminer-...,1707
730,User Manual,/Predictors/Advanced,\nDLM\nAll the implemented models are designed...,http://192.168.48.22:8082/repository/skyminer-...,1705
...,...,...,...,...,...
693,User Manual,/Web interface/Feature search,The feature search allows you to:\n\nVisualize...,http://192.168.48.22:8082/repository/skyminer-...,29
27,Administration Manual,/Dependencies and licenses,\nNote\nThis list is only available in the HTM...,http://192.168.48.22:8082/repository/skyminer-...,26
91,Administration Manual,/Installing on a 2 nodes cluster,A 2-nodes cluster is an atypical setup.\nPleas...,http://192.168.48.22:8082/repository/skyminer-...,20
90,Administration Manual,/Installing on a standalone server,For a standalone server you will select the fo...,http://192.168.48.22:8082/repository/skyminer-...,20


## Split big documents

In [90]:
# upper_bound = 728
# 
# text_splitter = RecursiveCharacterTextSplitter(
#     # Set a tiny chunk size, just to show.
#     chunk_size=upper_bound,
#     chunk_overlap=40,
#     length_function=len,
#     separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
#     is_separator_regex=False
# )
# 
# # Loop through the DataFrame and split text if necessary
# final_df = df.copy()
# new_rows = list()
# for index, row in df.iterrows():
#     if row['token_count'] > upper_bound:
#         new_texts = text_splitter.create_documents([row['text']])
#         for new_text in new_texts:
#             new_text = new_text.page_content
#             new_row = {'documentation': row['documentation'],
#                        'path': row['path'],
#                        'file_path': row['file_path'],
#                        'text': new_text,
#                        'token_count': len(new_text.split())}
#             new_rows.append(new_row)
#             
# # Concatenate the new rows to the DataFrame
# final_df = pd.concat([final_df, pd.DataFrame(new_rows)], ignore_index=True, sort=False)
# final_df = final_df[final_df['token_count'] < upper_bound]

In [162]:
# Remove duplicated
duplicates = df[df.duplicated(subset='text', keep=False)]
df = df.drop(duplicates.index)
df

Unnamed: 0,documentation,path,text,file_path,token_count
547,Administration Manual,/Skyminer document Module/Skyminer document mo...,\n\nGET /doc\ndescribes the document backend i...,http://192.168.48.22:8082/repository/skyminer-...,3704
528,Administration Manual,/Skyminer Time Series Indexer Module,The time series indexer module uses Opensearch...,http://192.168.48.22:8082/repository/skyminer-...,1812
750,User Manual,/User Interface,\nOverview\nThe correlation UI is a web client...,http://192.168.48.22:8082/repository/skyminer-...,1772
758,User Manual,/Correlation API,\nCorrelation Search API\nThe web client allow...,http://192.168.48.22:8082/repository/skyminer-...,1707
730,User Manual,/Predictors/Advanced,\nDLM\nAll the implemented models are designed...,http://192.168.48.22:8082/repository/skyminer-...,1705
...,...,...,...,...,...
693,User Manual,/Web interface/Feature search,The feature search allows you to:\n\nVisualize...,http://192.168.48.22:8082/repository/skyminer-...,29
27,Administration Manual,/Dependencies and licenses,\nNote\nThis list is only available in the HTM...,http://192.168.48.22:8082/repository/skyminer-...,26
91,Administration Manual,/Installing on a 2 nodes cluster,A 2-nodes cluster is an atypical setup.\nPleas...,http://192.168.48.22:8082/repository/skyminer-...,20
90,Administration Manual,/Installing on a standalone server,For a standalone server you will select the fo...,http://192.168.48.22:8082/repository/skyminer-...,20


In [163]:
has_nan_or_none = df['path'].isnull().any()
print(f"Has NaN or None values in 'path': {has_nan_or_none}")
count_nan_or_none = df['path'].isnull().sum()
print(f"Number of NaN or None values in 'path': {count_nan_or_none}")
has_empty_string = (df['path'] == "").any()
print(f"Has empty strings in 'path': {has_empty_string}")
count_empty_string = (df['path'] == "").sum()
print(f"Number of empty strings in 'path': {count_empty_string}")

Has NaN or None values in 'path': False
Number of NaN or None values in 'path': 0
Has empty strings in 'path': False
Number of empty strings in 'path': 0


## Dataframe to list of document

In [164]:
def create_metadata_dict(row):
    metadata = dict()
    if len(row['path'].split('/')) > 1:
        metadata["title"] = row['path'].split('/')[1]
    if len(row['path'].split('/')) > 2:
        metadata["sub_title"] = row['path'].split('/')[2]
    if len(row['path'].split('/')) > 3:
        metadata["part"] = row['path'].split('/')[3]
    if len(row['path'].split('/')) > 4:
        metadata["chapter"] = row['path'].split('/')[4]
    if len(row['path'].split('/')) > 5:
        metadata["section"] = row['path'].split('/')[5]
    if len(row['path'].split('/')) > 6:
        metadata["paragraph"] = row['path'].split('/')[6]
    metadata["url"] = row['file_path']
    metadata["manual"] = row['documentation']
    return metadata

docs = list()
for _, row in df.iterrows():
    metadata = create_metadata_dict(row)
    new_doc = Document(
        text=row['text'],
        metadata=metadata,
        excluded_llm_metadata_keys=["url"],
        excluded_embed_metadata_keys = ["url"],
        metadata_seperator="::",
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )
    docs.append(new_doc)

In [165]:
with open('skyminer_docs.pkl', 'wb') as f:
    pickle.dump(docs, f)

## PDF

In [167]:
reader = SimpleDirectoryReader(input_dir="./data/skyminer data", recursive=True)
docs = reader.load_data()
for doc in docs:
    if doc.metadata['file_path'] == 'data\\skyminer data\\skyminer-development.pdf':
        page = doc.metadata['page_label']
        if page.isnumeric():
            page = str(int(page) + 8)
        elif page == 'i':
            page = '3'
        elif page == 'ii':
            page = '4'
        elif page == 'iii':
            page = '5'
        elif page == 'iv':
            page = '6'
        elif page == 'v':
            page = '7'
        elif page == 'vi':
            page = '8'
        doc.metadata['url'] = ("http://192.168.48.22:8082/repository/skyminer-dev/dev-env/skyminer"
                               "-development.pdf#page=") + page
    else:
        docs = list()
with open('skyminer_docs.pkl', 'rb') as f:
    new_docs = pickle.load(f)
docs += new_docs

In [168]:
with open('skyminer_docs.pkl', 'wb') as f:
    pickle.dump(docs, f)