In [None]:
from llama_index import ServiceContext
from langchain.llms import OpenAI
from langchain.embeddings import SentenceTransformerEmbeddings
from llama_index import SimpleDirectoryReader
from urllib.parse import urlparse, unquote
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import re
import logging
import os
import requests
import nltk
from llama_index import Document
import pickle

In [None]:
# Silence only urllib3's InsecureRequestWarning (the warning shown for HTTPS requests made
# without certificate verification); every other warning category is left untouched.
# NOTE(review): the crawler below calls requests.get with verify=True, so this suppression
# only matters if verification is switched off - confirm it is still wanted.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

def get_all_links_recursive(url, depth=3, verify=True, timeout=30, _visited=None):
    """Crawl ``url`` and the pages it links to, returning every link found.

    Args:
        url: Address of the page to fetch.
        depth: How many levels of pages to fetch, counting ``url`` itself.
            Zero or a negative value fetches nothing.
        verify: Passed to ``requests.get``; keep ``True`` to validate TLS
            certificates, set ``False`` only for hosts with broken certificates.
        timeout: Seconds to wait for the server before giving up on a page.
        _visited: Internal bookkeeping shared across recursive calls
            (maps a URL to the largest remaining depth it was crawled with).
            Callers should leave it unset.

    Returns:
        A list with the absolute links of ``url`` followed by the links found
        on the crawled child pages. The list can contain repeats; a page that
        was already crawled with at least the same remaining depth is not
        fetched again, so the *set* of links is the same as an exhaustive
        crawl while the number of requests stays bounded.

    Pages that fail to download are reported on stdout and contribute no links.
    """
    if depth <= 0:
        return []
    if _visited is None:
        _visited = {}
    # A page already crawled with >= this remaining depth cannot yield anything
    # new; skipping it also breaks link cycles (A -> B -> A ...).
    if _visited.get(url, 0) >= depth:
        return []
    _visited[url] = depth

    try:
        response = requests.get(url, verify=verify, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving content from {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    # Resolve relative hrefs against the page they were found on
    absolute_links = [urljoin(url, anchor['href']) for anchor in soup.find_all('a', href=True)]

    # Print links for the current page
    print(f"Links from {url}:")
    for link in absolute_links:
        print(link)

    # Recursively get links from child pages (errors are handled inside the call)
    child_links = []
    for link in absolute_links:
        child_links.extend(
            get_all_links_recursive(
                link, depth=depth - 1, verify=verify, timeout=timeout, _visited=_visited
            )
        )

    return absolute_links + child_links


def remove_pdf_duplicates(file_list):
    """Drop every ``.pdf`` entry that also exists as an ``.html`` entry.

    Two entries are considered the same document when they are identical up
    to the final extension (``report.pdf`` / ``report.html``).

    Args:
        file_list: Iterable of file names or URLs. It is not modified.

    Returns:
        A ``(kept, removed)`` tuple of new lists, both in input order:
        ``kept`` holds everything except the duplicated PDFs and ``removed``
        holds the PDFs that were dropped.
    """
    pdf_suffix = '.pdf'
    # Set lookup avoids rescanning the list for every entry
    known_names = set(file_list)
    kept_list = []
    removed_list = []
    # Build new lists instead of removing from the list being iterated, which
    # silently skipped entries and could leave duplicated PDFs behind.
    for file_name in file_list:
        stem = file_name[:-len(pdf_suffix)]
        if file_name.endswith(pdf_suffix) and f"{stem}.html" in known_names:
            removed_list.append(file_name)
        else:
            kept_list.append(file_name)
    return kept_list, removed_list


def download_document(url, destination, timeout=60):
    """Download ``url`` and store the response body at ``destination``.

    Missing parent directories of ``destination`` are created first. The
    function is best-effort: it never raises, it reports success, a non-200
    status code, or any error on stdout.

    Args:
        url: Address of the document to download.
        destination: File path the content is written to (binary mode).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.
    """
    try:
        # Extract the directory path from the destination
        directory = os.path.dirname(destination)

        # dirname is '' for a bare file name and os.makedirs('') raises, so only
        # create a directory when there is one; exist_ok avoids a check-then-create race.
        if directory:
            os.makedirs(directory, exist_ok=True)
        response = requests.get(url, timeout=timeout)
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            with open(destination, 'wb') as file:
                file.write(response.content)
            print(f"Document downloaded successfully to {destination}")
        else:
            print(f"Failed to download document. Status code: {response.status_code}")
    except Exception as e:
        # Deliberately broad: one failed download must not abort a batch run
        print(f"An error occurred: {e}")

def create_directory_path(url, base_path):
    """Map the path component of ``url`` onto a local path below ``base_path``.

    The URL path is percent-decoded and split on ``/``; every segment becomes
    one path component under ``base_path``. Empty, ``.`` and ``..`` segments
    are dropped so that a crafted or sloppy URL cannot point outside
    ``base_path``.

    Args:
        url: Full URL or bare URL path (scheme and host, if any, are ignored).
        base_path: Local directory the result is rooted in.

    Returns:
        The joined local path as a string.
    """
    # Parse the URL
    parsed_url = urlparse(url)
    # Decode first so that encoded dot segments (%2e%2e) are filtered as well
    path_components = [
        component
        for component in unquote(parsed_url.path).split('/')
        if component not in ('', '.', '..')
    ]
    # Combine the base path with the path components
    return os.path.join(base_path, *path_components)


def filter_urls(urls, list_ext, base_url='http://qms-toulouse.kratos.us/', excluded_substring='gitlab'):
    """Keep the URLs that belong to ``base_url`` and mention a wanted extension.

    A URL is kept when all of the following hold:
      * it does not contain ``excluded_substring`` (skipped when that is empty/None),
      * it starts with ``base_url``,
      * at least one entry of ``list_ext`` occurs anywhere in it (substring
        match, so ``page.html#section`` matches ``.html``).

    Args:
        urls: Iterable of URL strings.
        list_ext: Extensions (or any substrings) to look for; an empty list
            keeps nothing.
        base_url: Required URL prefix.
        excluded_substring: URLs containing this text are dropped.

    Returns:
        A new list with the matching URLs in input order.
    """
    return [
        url
        for url in urls
        if (not excluded_substring or excluded_substring not in url)
        and url.startswith(base_url)
        and any(ext in url for ext in list_ext)
    ]


def clean_text(text):
    """Return ``text`` with every pilcrow sign stripped out."""
    # Splitting on the marker and re-joining drops all of its occurrences
    fragments = text.split('¶')
    return ''.join(fragments)


def read_local_html(file_path):
    """Read a UTF-8 text file and return its content.

    When ``file_path`` does not exist a warning is logged and an empty string
    is returned instead.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            content = handle.read()
        return content

    logging.warning(f"File not found: {file_path}")
    return ''

def fetch_html_from_url(url, timeout=30):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server; without a timeout a stalled
            server would block the whole run indefinitely.

    Returns:
        The page content as text.

    Raises:
        requests.exceptions.RequestException: on connection problems, timeouts
            or an HTTP error status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Decode as UTF-8 regardless of what the response headers announce
    response.encoding = 'utf-8'
    return response.text


def get_path(html_content, h='h1'):
    """Return the text of the first ``<h>`` heading in ``html_content`` as ``'/<text>'``.

    Only the markup up to the first closing ``</h>`` tag is parsed. Pilcrow
    signs are removed from the heading text and the mis-decoded character
    ``Â`` is replaced by a space.

    Args:
        html_content: HTML fragment expected to contain the heading.
        h: Tag name of the heading to look for (``'h1'`` ... ``'h6'``).

    Returns:
        The heading text prefixed with ``'/'``.

    Raises:
        ValueError: if the fragment contains no ``<h>`` element.
    """
    # Everything after the first closing tag is irrelevant for the heading text
    header_markup = html_content.split(f'</{h}>', 1)[0]
    soup = BeautifulSoup(header_markup, 'html.parser')
    header = soup.find(f'{h}')
    if header is None:
        # Fail with a message that names the problem instead of an
        # AttributeError on None
        raise ValueError(f"No <{h}> element found in the given HTML fragment")
    h_text = header.text.replace("¶", "").replace("Â", " ")
    return '/' + h_text


def scrape_content_from_page(html_content, source, file_path):
    """Split an HTML page into heading-delimited chunks and tabulate them.

    The page is cut at every ``<h1``, ``<h2``, ``<h3`` and ``<h4`` occurrence.
    For each chunk one row is produced holding the heading trail leading to it
    (``/h1 text/h2 text/...``), the chunk's plain text without its first line,
    and ``file_path`` extended by the heading's anchor when one is found.

    Args:
        html_content: Raw HTML of the page.
        source: Label stored in the ``documentation`` column of every row.
        file_path: Location of the page (URL or path); the anchor is appended to it.

    Returns:
        pandas.DataFrame with the columns ``documentation``, ``path``,
        ``text`` and ``file_path``, one row per chunk in document order.
        The markup before the first heading yields a row with an empty path.

    Raises:
        Whatever ``get_path`` raises when a heading cannot be parsed.
    """
    list_split = []
    list_path = []
    list_source = []  # For storing the documentation source
    list_file_path = []
    path = ''

    for i, h1 in enumerate(html_content.split('<h1')):
        if i > 0:
            # str.split removed the marker; restore it so the chunk is valid markup
            h1 = '<h1' + h1
            # Keep only the root of the trail, then append the new h1 title
            path = '/'.join(path.split('/')[:1]) + get_path(h1, h='h1')

        for j, h2 in enumerate(h1.split('<h2')):
            if j > 0:
                h2 = '<h2' + h2
                path = '/'.join(path.split('/')[:2]) + get_path(h2, h='h2')

            for k, h3 in enumerate(h2.split('<h3')):
                if k > 0:
                    h3 = '<h3' + h3
                    path = '/'.join(path.split('/')[:3]) + get_path(h3, h='h3')

                for m, h4 in enumerate(h3.split('<h4')):
                    if m > 0:
                        h4 = '<h4' + h4
                        path = '/'.join(path.split('/')[:4]) + get_path(h4, h='h4')

                    # NOTE(review): only the part before the first '<h5' is kept, so
                    # content below h5 headings is not exported - confirm this is intended.
                    section_html = h4.split('<h5')[0]

                    # Look for href in the html content to add it to the link
                    href = ''
                    header = re.search(r'<h[1-6](.*?)<\/h[1-6]>', section_html)
                    if header:
                        match_href = re.search(r'href="#([^"]*)"', header.group(1))
                        match_id = re.search(r'id="([^"]*)"', header.group(1))
                        # An explicit in-page link wins over the element id
                        if match_href:
                            href = '#' + match_href.group(1)
                        elif match_id:
                            href = '#' + match_id.group(1)

                    # Get the text from the html content, skipping the first line.
                    # partition yields '' when the chunk has no newline, where
                    # split('\n', 1)[1] raised IndexError and lost the whole page.
                    soup = BeautifulSoup(section_html.partition('\n')[2], "lxml")
                    list_split.append(soup.get_text())
                    list_path.append(path)
                    list_source.append(source)  # Add the documentation source for each content
                    list_file_path.append(f'{file_path}{href}')

    return pd.DataFrame({
        'documentation': list_source,  # documentation source column
        'path': list_path,
        'text': list_split,
        'file_path': list_file_path,  # page location plus anchor for each row
    })


def process_links_from_list(links, documentation_label):
    """Fetch and scrape every page in ``links``, collecting the resulting frames.

    Each link is downloaded with ``fetch_html_from_url`` and converted with
    ``scrape_content_from_page``. A page that raises is logged as an error and
    skipped; a page that yields ``None`` or an empty frame is logged as a
    warning and skipped.

    Returns:
        list of the non-empty frames, in the order of ``links``.
    """
    logging.info(f"Found {len(links)} links to process for {documentation_label}.")

    collected = []
    for url in links:
        try:
            html = fetch_html_from_url(url)
            frame = scrape_content_from_page(html, documentation_label, url)

            # Nothing usable came back for this page
            if frame is None or frame.empty:
                logging.warning(f"Empty dataframe returned from {url}.")
            else:
                collected.append(frame)
        except Exception as e:
            logging.error(f"Error processing {url}: {e}")

    logging.info(f"Processed {len(collected)} dataframes for {documentation_label}.")
    return collected

# All links

In [None]:
# Root URL the crawl starts from.
qms_path = "http://qms-toulouse.kratos.us/"
# Collect the links of the root page and of the pages reachable within three levels.
qms_link = get_all_links_recursive(qms_path, depth=3)
# Drop repeated links; going through a set does not keep the crawl order.
qms_link = list(set(qms_link))
# Show how many distinct links were found.
len(qms_link)

In [None]:
# Drop the .pdf links that also exist as .html; removed_list holds the dropped PDFs.
filtered_files, removed_list = remove_pdf_duplicates(qms_link)
# List what was dropped, then the number of links that remain.
for file in removed_list:
    print(file)
print(len(filtered_files))

In [None]:
# Distinct links that survived the PDF/HTML de-duplication
all_links = list(set(filtered_files))
print(len(all_links))

# One bucket per recognised file extension
html_links, pdf_links = [], []
xlsx_links, xls_links, xlsm_links = [], [], []
docx_links, doc_links = [], []
pptx_links, zip_links = [], []

# Suffix -> bucket, in the order the bucket sizes are reported below.
# No suffix is the tail of another one, so every link matches at most one entry.
links_by_suffix = {
    '.html': html_links,
    '.pdf': pdf_links,
    '.xlsx': xlsx_links,
    '.xlsm': xlsm_links,
    '.xls': xls_links,
    '.docx': docx_links,
    '.doc': doc_links,
    '.pptx': pptx_links,
    '.zip': zip_links,
}

# Sort every link into its bucket; links with any other ending are ignored
for link in all_links:
    for suffix, bucket in links_by_suffix.items():
        if link.endswith(suffix):
            bucket.append(link)
            break

# Report the size of each bucket
list_links = list(links_by_suffix.values())
for extension in list_links:
    print(len(extension))

## PDF and DOCX

In [None]:
# Extensions to keep for this section
list_extension = ['.pdf']

# Keep only the links on the QMS host that mention one of the extensions above
pdf_doc_links = filter_urls(all_links, list_extension)
print(len(pdf_doc_links))
# Display the selected links
pdf_doc_links

In [None]:
# Mirror every selected document below <cwd>/data/qms data, reproducing the URL path as
# sub-directories. The base directory is built with os.path.join so the location is the
# same on every platform (a hard-coded "\\" separator only works on Windows) and matches
# the "./data/qms data" folder that is read back afterwards.
current_path = os.getcwd()
base_path = os.path.join(current_path, "data", "qms data")
for url in pdf_doc_links:
    # Strip the host prefix so only the document's path is mapped to folders
    destination_path = create_directory_path(url.split('http://qms-toulouse.kratos.us/')[-1], base_path=base_path)
    download_document(url, destination_path)

In [None]:
# Load every file found under ./data/qms data (sub-directories included) as documents.
reader = SimpleDirectoryReader(input_dir="./data/qms data", recursive=True)
docs = reader.load_data()

In [None]:
# Documents with fewer characters than this are treated as empty and left out
min_text_length = 100

remove_list = []
for doc in docs:
    if len(doc.text) < min_text_length:
        remove_list.append(doc)
    # Rebuild the source URL from the local path.
    # NOTE(review): [5:] assumes file_path starts with a 5-character prefix such as
    # 'data\' - confirm against the file_path values the reader actually produces.
    doc.metadata['url'] = 'http://qms-toulouse.kratos.us/' + doc.metadata['file_path'][5:].replace('\\', '/')
    # Stop excluding 'file_name' from the metadata. The membership test keeps the cell
    # re-runnable: an unconditional list.remove raises ValueError on the second run.
    for excluded_keys in (doc.excluded_llm_metadata_keys, doc.excluded_embed_metadata_keys):
        if 'file_name' in excluded_keys:
            excluded_keys.remove('file_name')
    doc.metadata['manual'] = "QMS Documents"
# Same selection as "not in remove_list", without comparing every pair of documents
docs_pdf = [doc for doc in docs if len(doc.text) >= min_text_length]
docs_pdf

In [None]:
# Show the text of every document that was left out for being too short,
# followed by the number of dropped and of kept documents.
for file in remove_list:
    print(file.text)
print(len(remove_list))
print(len(docs_pdf))

## HTML

In [None]:
# Extensions to keep for this section
list_extension = ['.html']

# Keep only the links on the QMS host that mention one of the extensions above
html_links = filter_urls(all_links, list_extension)
# Drop links that carry a '#': only whole pages are wanted, not in-page anchors
html_links =  [link for link in html_links if '#' not in link]
print(len(html_links))
# Display the selected links
html_links

In [None]:
# Scrape every HTML page; the result is one DataFrame per page that could be processed.
# It is kept under its own name so that `df` only ever refers to a DataFrame.
html_frames = process_links_from_list(html_links, "QMS Documents")
if not html_frames:
    # pd.concat would fail here with the less helpful "No objects to concatenate"
    raise ValueError("No HTML page could be scraped - check html_links and the logged errors.")
# Stack the per-page frames into a single table with a fresh 0..n-1 index
df = pd.concat(html_frames, ignore_index=True)
df

In [None]:
# Strip the pilcrow markers from every entry of the 'text' column
df['text'] = df['text'].apply(clean_text)

# Rows worth keeping: non-empty text, a non-empty path, and not the table of contents
has_text = df['text'].notnull() & (df['text'].str.len() > 0)
has_path = df['path'].notnull() & (df['path'] != "")
not_table_of_contents = df['path'] != '/Table Of Contents'

# Apply all filters at once, then keep the first occurrence of each distinct text
df = df[has_text & has_path & not_table_of_contents].drop_duplicates(subset='text', keep='first')
df


In [None]:
# Fetch the 'punkt' resource from NLTK before tokenizing
nltk.download('punkt')

def count_tokens(text):
    """Return the number of tokens ``nltk.word_tokenize`` produces for ``text``."""
    return len(nltk.word_tokenize(text))

# Add a new column holding the token count of each text
df['token_count'] = df['text'].apply(count_tokens)

lower_bound = 11  # or any other value based on your visualization or requirements
# Longest texts first, keeping only those with more than lower_bound tokens
df = (
    df.sort_values(by='token_count', ascending=False)
      .loc[lambda frame: frame['token_count'] > lower_bound]
)
df

In [None]:
# Sanity check of the 'path' column: compute each mask once and report on it
path_is_missing = df['path'].isnull()
path_is_blank = df['path'] == ""

has_nan_or_none = path_is_missing.any()
print(f"Has NaN or None values in 'path': {has_nan_or_none}")
count_nan_or_none = path_is_missing.sum()
print(f"Number of NaN or None values in 'path': {count_nan_or_none}")

has_empty_string = path_is_blank.any()
print(f"Has empty strings in 'path': {has_empty_string}")
count_empty_string = path_is_blank.sum()
print(f"Number of empty strings in 'path': {count_empty_string}")

In [None]:
def create_metadata_dict(row):
    """Build the metadata dictionary for one scraped row.

    ``row['path']`` is a ``/``-separated heading trail. Its segments after the
    first ``/`` are stored, in order, under ``title``, ``sub_title``, ``part``,
    ``chapter``, ``section`` and ``paragraph``; levels the trail does not reach
    are omitted and segments beyond the sixth are ignored. ``url`` and
    ``manual`` are copied from ``row['file_path']`` and ``row['documentation']``.
    """
    level_names = ("title", "sub_title", "part", "chapter", "section", "paragraph")
    # Element 0 is whatever precedes the first '/', which is not a heading level
    segments = row['path'].split('/')[1:]

    # zip stops at the shorter sequence, so missing levels are simply left out
    metadata = dict(zip(level_names, segments))
    metadata["url"] = row['file_path']
    metadata["manual"] = row['documentation']
    return metadata

# Turn every remaining row of the scraped table into a Document.
docs_html = list()
for _, row in df.iterrows():
    # Heading levels, URL and manual name derived from the row
    metadata = create_metadata_dict(row)
    new_doc = Document(
        text=row['text'],
        metadata=metadata,
        # 'url' is listed as excluded for both the LLM and the embedding metadata
        excluded_llm_metadata_keys=["url"],
        excluded_embed_metadata_keys = ["url"],
        # NOTE(review): 'metadata_seperator' (sic) is presumably the spelling the installed
        # llama_index Document expects - confirm before "correcting" it.
        metadata_seperator="::",
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )
    docs_html.append(new_doc)
# Display how many documents were built
len(docs_html)

In [ ]:
# Persist the PDF-derived documents to docs_pdf.pkl in the working directory
# (an existing file is overwritten).
with open('docs_pdf.pkl', 'wb') as f:
    pickle.dump(docs_pdf, f)

In [None]:
# Combine the PDF-derived and the HTML-derived documents into one list.
# NOTE(review): this rebinds `docs`, which earlier held the raw reader output; re-running
# the PDF metadata cell after this one would iterate over the merged list instead.
docs = docs_pdf + docs_html
len(docs)

In [None]:
# Persist the combined document list to docs.pkl in the working directory
# (an existing file is overwritten).
with open('docs.pkl', 'wb') as f:
    pickle.dump(docs, f)

In [None]:
# Persist the PDF-derived documents to docs_pdf.pkl (overwriting any existing file).
# NOTE(review): an earlier cell already writes the same object to the same file; one of
# the two cells looks redundant.
with open('docs_pdf.pkl', 'wb') as f:
    pickle.dump(docs_pdf, f)