# Import

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import html
import re
import logging
import os

# plot
import matplotlib.pyplot as plt
import seaborn as sns

# Chroma
import chromadb 
from chromadb.utils import embedding_functions
from langchain.vectorstores import Chroma

# Sentence Transformers
from sentence_transformers import SentenceTransformer

In [None]:
# Initialize logging at INFO level so the logging.info() progress messages
# emitted by process_links are not filtered out.
logging.basicConfig(level=logging.INFO)

# Set up 

In [None]:
# Configuration: one entry per documentation set to scrape.
#   index_path    - index page; its <a href="...html"> links are the pages to scrape
#   base_path     - prefix joined with each link to build the page location
#   documentation - label stored with every scraped row
# NOTE(review): these values are http:// URLs, but read_local_html() checks
# them with os.path.exists() and reads them with open(); confirm they resolve
# as local files in the environment where this notebook runs.
config = {
    'site1': {
        'index_path': "http://192.168.48.101/jupyter/view/VA_project/extracted_content_A_manual/administration-manual-html/index.html",
        'base_path': "http://192.168.48.101/jupyter/tree/VA_project/extracted_content_A_manual/administration-manual-html",
        'documentation': 'administration-manual'
    },
    'site2': {
        'index_path': "http://192.168.48.101/jupyter/view/VA_project/extracted_content_U_manual/user-manual-html/index.html",
        'base_path': "http://192.168.48.101/jupyter/tree/VA_project/extracted_content_U_manual/user-manual-html",
        'documentation': 'user-manual'
    }
}

In [None]:
# Function to read the local HTML file
def read_local_html(file_path):
    """Return the UTF-8 decoded contents of the file at *file_path*.

    Args:
        file_path: Location of the HTML file on the local file system.

    Returns:
        The file contents as a string, or an empty string (after logging a
        warning) when nothing exists at *file_path*.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
        return contents

    logging.warning(f"File not found: {file_path}")
    return ''

In [None]:
def get_path(html_content, h='h1'):
    """Return the text of the first ``<h>`` heading as a path segment.

    Only the part of *html_content* before the first closing ``</h>`` tag is
    parsed, so the rest of the fragment never reaches the HTML parser.

    Args:
        html_content: HTML fragment expected to contain an ``<h>`` heading.
        h: Heading tag name to look for (``'h1'``, ``'h2'``, ...).

    Returns:
        ``'/'`` followed by the heading text with every pilcrow sign removed.

    Raises:
        AttributeError: If the parsed fragment contains no ``<h>`` element
            (``find`` returns ``None`` in that case).
    """
    # Split once only: everything after the first closing tag is irrelevant.
    heading_markup = html_content.split(f'</{h}>', 1)[0]
    soup = BeautifulSoup(heading_markup, 'html.parser')
    h_text = soup.find(h).text.replace("¶", "")
    return '/' + h_text

In [None]:
# Heading tags that each contribute one segment to a section path, outermost first.
_HEADING_TAGS = ('h1', 'h2', 'h3', 'h4')


def _section_text(fragment):
    """Return the plain text of one section fragment, minus its first line."""
    # NOTE(review): everything from the first '<h5' onwards is discarded, which
    # is the long-standing behaviour of this scraper -- confirm that content
    # under h5 headings is really meant to be left out.
    before_h5 = fragment.split('<h5', 1)[0]
    # Drop the first line (for heading fragments it starts with the heading
    # tag). partition() yields '' when the fragment has no newline, whereas
    # indexing the result of split() would raise IndexError.
    body = before_h5.partition('\n')[2]
    return BeautifulSoup(body, "lxml").get_text().strip('\n')


def _collect_sections(fragment, depth, segments, rows):
    """Split *fragment* on the heading tag for *depth* and recurse; append (path, text) leaves to *rows*."""
    if depth == len(_HEADING_TAGS):
        rows.append((''.join(segments), _section_text(fragment)))
        return

    tag = _HEADING_TAGS[depth]
    opener = '<' + tag
    for index, piece in enumerate(fragment.split(opener)):
        if index > 0:
            # split() removed the opening tag; restore it so get_path can parse the heading.
            piece = opener + piece
            segments[depth] = get_path(piece, h=tag)
            # A new heading at this level invalidates every deeper segment.
            segments[depth + 1:] = [''] * (len(segments) - depth - 1)
        _collect_sections(piece, depth + 1, segments, rows)


def scrape_content_from_page(html_content, source, file_path):
    """Split one HTML page into sections keyed by their h1-h4 heading path.

    The page is cut at every ``<h1``, then every ``<h2``, ``<h3`` and ``<h4``.
    Each resulting fragment becomes one row whose ``path`` is the chain of
    enclosing heading texts (for example ``/Chapter/Section``) and whose
    ``text`` is the fragment's plain text without its first line.

    Path segments are tracked per heading level instead of being recovered by
    splitting the path on ``'/'``, so a heading that itself contains ``'/'``
    or a skipped heading level no longer corrupts the paths that follow.

    Args:
        html_content: Raw HTML of the page.
        source: Documentation label stored in the ``documentation`` column.
        file_path: Location of the page, stored in the ``file_path`` column.

    Returns:
        A DataFrame with the columns ``documentation``, ``path``, ``text`` and
        ``file_path`` (in that order). It is empty when *html_content* is
        empty or contains only whitespace.

    Raises:
        AttributeError: Propagated from get_path when a heading opener has no
            parsable heading element.
    """
    rows = []
    if html_content and html_content.strip():
        _collect_sections(html_content, 0, [''] * len(_HEADING_TAGS), rows)

    df = pd.DataFrame({
        'documentation': [source] * len(rows),  # documentation source for each content
        'path': [path for path, _ in rows],
        'text': [text for _, text in rows],
    })
    df['file_path'] = file_path  # same page location on every row
    return df

In [None]:
def process_links(site_config):
    """Scrape every ``.html`` page linked from a site's index page.

    Args:
        site_config: Mapping with the keys ``'index_path'``, ``'base_path'``
            and ``'documentation'``.

    Returns:
        A list with one non-empty DataFrame per successfully scraped page.
        Pages that yield an empty frame are logged as warnings; pages that
        raise are logged as errors. Both kinds are skipped.
    """
    index_markup = read_local_html(site_config['index_path'])
    index_soup = BeautifulSoup(index_markup, 'html.parser')

    # Keep only anchors whose target is an .html page.
    links = []
    for anchor in index_soup.find_all('a', href=True):
        if anchor['href'].endswith('.html'):
            links.append(anchor['href'])

    logging.info(f"Found {len(links)} links to process for {site_config['documentation']}.")

    data_frames = []
    for link in links:
        full_path = os.path.join(site_config['base_path'], link)
        try:
            df_temp = scrape_content_from_page(
                read_local_html(full_path),
                site_config['documentation'],
                full_path,
            )
            if df_temp is None or df_temp.empty:
                logging.warning(f"Empty dataframe returned from {full_path}.")
                continue
            data_frames.append(df_temp)
        except Exception as e:
            # Best effort: one broken page must not stop the remaining pages.
            logging.error(f"Error processing {full_path}: {e}")

    logging.info(f"Processed {len(data_frames)} dataframes for {site_config['documentation']}.")
    return data_frames


# Scrape the Content and Store in a DataFrame:

In [None]:
# Show the site configuration that the scraping loops iterate over.
print(config)

In [None]:
# Debug run: process each configured site and print what process_links returns.
# The accumulator is created here so this cell does not raise NameError when
# it is executed on a fresh kernel.
all_data_frames = []
for key, site_config in config.items():
    result = process_links(site_config)
    print(f"For {key}, process_links returns:", result)
    all_data_frames.extend(result)

In [None]:
# Collect the per-page DataFrames of every configured site into one list.
all_data_frames = []
for key, site_config in config.items():
    all_data_frames.extend(process_links(site_config))

# Stack all pages into a single frame with a fresh 0..n-1 index.
# pd.concat raises ValueError when all_data_frames is empty.
df = pd.concat(all_data_frames, ignore_index=True)

# Turn to DF

In [None]:
# Display the combined frame of scraped sections.
df

In [None]:
# Raw text (column position 2) of the row at position 0.
df.iloc[0,2]

In [None]:
# Raw text (column position 2) of the row at position 89.
df.iloc[89,2]

In [None]:
# Raw text (column position 2) of the row at position 336.
df.iloc[336,2]

# Clean data

In [None]:
# LaTeX inline math of the form \( ... \); the capturing group makes
# re.split() return the math spans alongside the surrounding text.
_MATH_EXPR_RE = re.compile(r'(\\\(.*?\\\))')
# Punctuation directly after / directly before a word character.
_PUNCT_AFTER_WORD_RE = re.compile(r'(?<=[\w])([.,;:!?\(\)\[\]])')
_PUNCT_BEFORE_WORD_RE = re.compile(r'([.,;:!?\(\)\[\]])(?=[\w])')


def _space_punctuation(fragment):
    """Insert a space between punctuation and an adjacent word character."""
    fragment = _PUNCT_AFTER_WORD_RE.sub(r' \1', fragment)
    return _PUNCT_BEFORE_WORD_RE.sub(r'\1 ', fragment)


def clean_text(text):
    """Normalise scraped text before it is embedded.

    Steps, in order:
      1. Collapse every run of whitespace, newlines included, to one space.
         No ``NEWLINE`` marker is emitted: the step that would have inserted
         one always ran after this normalisation and could never match.
      2. Remove the pilcrow sign left behind by heading permalinks.
      3. Put a space between punctuation and adjacent word characters,
         leaving LaTeX inline math ``\\( ... \\)`` untouched.

    Math is protected by splitting the text around the math spans and spacing
    only the text in between. The earlier approach substituted placeholders
    that the punctuation step itself rewrote, so they were never restored.

    Args:
        text: Raw section text.

    Returns:
        The cleaned text.
    """
    # Whitespace normalization
    text = ' '.join(text.split())

    # Removing unwanted characters like ¶
    text = text.replace('¶', '')

    # re.split with one capturing group alternates plain text (even indexes)
    # with math expressions (odd indexes); only the plain text is re-spaced.
    pieces = _MATH_EXPR_RE.split(text)
    for index in range(0, len(pieces), 2):
        pieces[index] = _space_punctuation(pieces[index])

    return ''.join(pieces)

In [None]:
# Apply clean_text to every entry of the 'text' column
df['text'] = df['text'].apply(clean_text)

In [None]:
# Cleaned text of the row at position 0.
df['text'].iloc[0]

In [None]:
# Cleaned text of the row at position 89.
df['text'].iloc[89]

In [None]:
# Cleaned text of the row at position 336.
df['text'].iloc[336]

In [None]:
# Cleaned text of the row at position 400.
df['text'].iloc[400]

# Format size for embeddings

In [None]:
# Number of whitespace-separated tokens in each cleaned text.
df['word_count'] = df['text'].apply(lambda x: len(x.split()))

In [None]:
# Full row at position 336, now including its word_count.
df.iloc[336]

In [None]:
# Set Seaborn style
sns.set_style('whitegrid')

# Histogram (50 bins) of word counts with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
sns.histplot(df['word_count'], bins=50, kde=True, color='skyblue')
plt.title('Distribution of Word Counts in Content')
plt.xlabel('Word Count')
plt.ylabel('Number of Entries')
plt.show()

In [None]:
# Same histogram as above, drawn with a logarithmic x axis. The axis scale is
# switched after the histogram has been drawn.
plt.figure(figsize=(10, 6))
sns.histplot(df['word_count'], bins=50, kde=True, color='skyblue')
plt.xscale('log')
plt.title('Distribution of Word Counts in Content (Logarithmic Scale)')
plt.xlabel('Word Count (Log Scale)')
plt.ylabel('Number of Entries')
plt.show()

## Remove the empty text

In [None]:
# Remove rows where the text is missing or is an empty string
df = df[df['text'].notnull() & (df['text'].str.len() > 0)]

In [None]:
# Remove rows where the path is missing or empty, or where the text is an
# empty string
df = df[(df['path'].notnull()) & (df['path'] != "") & (df['text'].str.len() > 0)]


In [None]:
# Remove rows where path equals "/Table Of Contents" (exact match, so only
# rows whose whole path is that string are removed)
df = df[df['path'] != '/Table Of Contents']

In [None]:
# Remove duplicates based on the 'text' column, keeping the first occurrence
df = df.drop_duplicates(subset='text', keep='first')

In [None]:
# Find rows with duplicate content. keep=False marks every member of a
# duplicate group, so this frame is empty once duplicates have been dropped.
duplicates = df[df.duplicated(subset='text', keep=False)]
duplicates

In [None]:
# Work on an independent copy: df is the result of the row filters above, and
# assigning a column to such a filtered frame can trigger pandas'
# SettingWithCopyWarning.
df = df.copy()

# Add a new column for word counts (str() guards against non-string entries)
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))

# Plot a histogram of word counts
plt.hist(df['word_count'], bins=50, edgecolor='k')
plt.title('Distribution of Word Counts')
plt.xlabel('Word Count')
plt.ylabel('Number of Documents')
plt.show()


In [None]:
# Sections ordered from the longest to the shortest text.
sorted_df = df.sort_values(by='word_count', ascending=False)


# clean

In [None]:
# Split the sections at a minimum word count: `smol` holds the sections that
# are too short, `final_df` the ones that are kept.
threshold = 20  # or any other value based on your visualization or requirements
smol = df[df['word_count'] < threshold]
# '>=' (not '>') so that sections with exactly `threshold` words are kept
# instead of being silently dropped from both subsets.
final_df = df[df['word_count'] >= threshold]


In [None]:
# Display the sections that fell below the word-count threshold.
smol

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus that stopwords.words() reads from.
nltk.download('stopwords')


In [None]:
# English stop words as a set, for fast membership tests in stopword_percentage.
stop_words = set(stopwords.words('english'))

def stopword_percentage(text):
    """Return the share of stop words in *text* as a percentage (0-100).

    Words are the whitespace-separated tokens of *text*; each is lower-cased
    before it is looked up in the module-level ``stop_words`` set.

    Args:
        text: Text to analyse.

    Returns:
        0 when *text* is falsy, is not a string, or contains no tokens;
        otherwise ``stop word count / token count * 100``.
    """
    if not (text and isinstance(text, str)):
        return 0

    tokens = text.split()
    if len(tokens) == 0:
        return 0

    hits = [token for token in tokens if token.lower() in stop_words]
    return (len(hits) / len(tokens)) * 100


In [None]:
# Copy first so the new column is added to an independent frame rather than
# to a filtered selection of df.
smol = smol.copy()  # Making a copy of your DataFrame for this operation
# Stop-word percentage of every short section.
smol['score'] = smol['text'].apply(stopword_percentage)


In [None]:
# Order the short sections from the highest to the lowest stop-word percentage.
smol = smol.sort_values(by='score', ascending=False)
smol

In [None]:
# Scatter plot of the short sections: word count against stop-word percentage.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=smol, x='word_count', y='score', alpha=0.6, edgecolor=None)

plt.title('Number of Words vs. Stopword Percentage')
plt.xlabel('Number of Words')
plt.ylabel('Stopword Percentage (%)')

plt.show()


# Final preview

In [None]:
# Display the sections that will be embedded and stored.
final_df

In [None]:
# Sanity check: report whether, and how many, 'path' values in df are NaN/None.
# Note that the check runs on df, not on final_df.
has_nan_or_none = df['path'].isnull().any()
print(f"Has NaN or None values in 'path': {has_nan_or_none}")
count_nan_or_none = df['path'].isnull().sum()
print(f"Number of NaN or None values in 'path': {count_nan_or_none}")


In [None]:
# Sanity check: report whether, and how many, 'path' values in df are empty
# strings. Note that the check runs on df, not on final_df.
has_empty_string = (df['path'] == "").any()
print(f"Has empty strings in 'path': {has_empty_string}")
count_empty_string = (df['path'] == "").sum()
print(f"Number of empty strings in 'path': {count_empty_string}")


# Store in Chroma

In [None]:
# Open the on-disk Chroma database at the given path. Both `chroma_client`
# and `client` are bound to the same PersistentClient instance.
# NOTE(review): the path is a machine-specific absolute path; adjust it when
# running this notebook on another machine.
chroma_client = client = chromadb.PersistentClient(path='C:/Users/Nathan/Kratos_data-Science/Chroma/v4')

In [None]:
# Load the sentence-embedding model on the GPU; pass device='cpu' instead on a
# machine without CUDA. (The closing parenthesis used to sit inside a trailing
# comment, which made this statement a syntax error.)
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

In [None]:
# Fetch the "Skyminer" collection, creating it if it does not exist yet.
# `vectorstore` is the collection object returned by the chromadb client.
vectorstore = chroma_client.get_or_create_collection(name="Skyminer")

In [None]:
# Build the documents, metadata, ids and embeddings for every row of final_df
# and add them to the Chroma collection.

# The cleaned 'text' column is what gets stored and embedded as the document.
documents_list = final_df['text'].tolist()

# One metadata record per document, in the same order as documents_list.
metadatas_list = [
    {
        "source": f"{row['path']}",
        "documentation": row['documentation'],
        "file_path": row['file_path'],
        # Plain int, so the value does not depend on a pandas/NumPy scalar type.
        "word_count": int(row['word_count']),
    }
    for _, row in final_df.iterrows()
]

# Generating IDs for the documents: "v1", "v2", ...
# NOTE(review): the ids depend only on row order, so re-running this cell
# against the same persistent collection reuses them -- confirm how the
# installed Chroma version treats ids that already exist.
ids_list = ["v" + str(i + 1) for i in range(len(documents_list))]

if documents_list:
    # Encode all texts in one call so the model can batch them, instead of
    # calling encode() once per row.
    embeddings_list = model.encode(documents_list).tolist()

    # Add the embedded documents to the collection in Chroma
    vectorstore.add(
        documents=documents_list,
        embeddings=embeddings_list,
        metadatas=metadatas_list,
        ids=ids_list
    )
else:
    # Nothing to store: skip add() rather than passing it empty lists.
    embeddings_list = []
    logging.warning("final_df is empty; nothing was added to the Chroma collection.")


In [None]:
# Display the collection object.
vectorstore

In [None]:
# Report how many entries the collection currently holds.
print("There are", vectorstore.count(), "in the collection")