# Temp notes and ideas

* could use sentence-transformers to get sentence embeddings and then use clustering to find the most similar sentences
* txtai is a python library that can be used to find similar sentences
* LlamaIndex
* langchain
* embedchain 
* haystack with pre trained nlp model, can use as pipeline, in memory db or elasticsearch or faiss
* will need to use a transformer model to get embeddings, and index them in a db
* can use a transformer model to get embeddings and then use faiss to find similar embeddings
* can use a transformer model to get embeddings and then use clustering to find similar embeddings

Pipeline ideas:
* Hash of html sites to see if changes have been made, then repull html and update embedding for that site

# Semantic Search Pipeline

Ryan Hull, Albert Oh, Tim Hillman, Adam Lowder

This notebook demonstrates how to build a semantic search pipeline using the ___ library. The pipeline consists of the following steps:




In [4]:
# Imports
import re
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
from urllib.parse import urlparse
import os
import sys
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from IPython.display import clear_output, display

In [8]:
FILENAME = './data/endpoints-v4.csv'
DOMAIN = 'https://charlotte.edu'
PREPROCESSED_FILENAME = './data/preprocessed-endpoints.csv'

# Read the data from the CSV file
import csv
csv.field_size_limit(2**31 - 1)

131072

## Web crawl/scrape

In [3]:
EXCLUDE = [
    'news-articles',
    'news-events',
    'news-media',
    'linkedin',
    'facebook',
    'twitter',
    'instagram',
    'youtube',
    'flickr',
    'pinterest',
    '.com',
    '.org',
    '.net',
    '.gov',
    '.pdf',
    '.doc',
    'php',
    'mailto:',
    '@',
    'tel:',
    'javascript:',
    'tel:',
    'sms:',
    'mailto:',
    'angular',
    'react',
    '.js',
    'event',
    'corporate',
    '#',
    'image',
    'gallery',
    'taskstream-student-handbook',
]

current_display = display('Starting...', display_id=True)
progress_display = display('Starting...', display_id=True)
success_count = 0
failure_count = 0

def print_status(url, status):
    global success_count
    global failure_count
    global current_display
    global progress_display
    
    if status == 'Success':
        success_count += 1
    else:
        failure_count += 1
    # Print the most recent URL and status, and the total counts    
    current_display.update(f'Most recent URL: {url} \nStatus: {status}')
    progress_display.update(f'Successes: {success_count}, Failures: {failure_count}')
    

def remove_url_prefix(url):
    url = url.replace('http://', '').replace('https://', '').replace('www.', '')
    return url.lower()

def is_valid_url(url):
    if any(ex in url for ex in EXCLUDE) or len(url) < 8 or len(url) > 100:
        return False
    try:
        split_url = re.split('https?://', url)
        return 'mailto:' not in url and '@' not in url and 'charlotte.edu' in split_url[0] or len(split_url) > 1 and 'charlotte.edu' in split_url[1]
    except Exception as e:
        print(f'Exception: {e}')
        return False
    
def write_to_csv(valid_endpoints, failed_endpoints):
    # Export the endpoints to a CSV file
    try:
        with open(FILENAME, 'w', newline='', encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(['URL', 'Status', 'Text'])  # Write the column labels
            for endpoint in valid_endpoints:  # Write the valid endpoints
                writer.writerow(endpoint)
            for endpoint in failed_endpoints:  # Write the failed endpoints
                writer.writerow(endpoint)
            file.flush()
            os.fsync(file.fileno())
    except Exception as e:
        print(f'Exception during file write: {e}')

def fetch_url(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return ('Success', response.content)
        else:
            return ('Failed', None)
    except (requests.exceptions.RequestException, requests.exceptions.Timeout, ValueError):
        print(f'An error occurred while fetching {url}')
        return ('Failed', None)

def crawl_domain(domain):
    visited = set()
    to_visit = [domain.rstrip('/')]
    valid_endpoints = []
    failed_endpoints = []

    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = {executor.submit(fetch_url, url): url for url in to_visit if is_valid_url(url) and url not in visited}
        visited.update(url for url in to_visit if is_valid_url(url))

        while futures:
            done, _ = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)

            for future in done:
                url = futures.pop(future)

                try:
                    data = future.result()
                except Exception as e:
                    print(f'Exception in crawl loop {url}: {e}')
                    continue
                
                status, content = data
                
                try:
                    if status == 'Success':
                        soup = BeautifulSoup(content, 'html.parser')

                        # Remove JavaScript and CSS blocks, and common sections like headers, footers, etc.
                        for script in soup(["script", "style"]):  # Remove JavaScript and CSS blocks
                            script.decompose()
                        # Remove header, footer, nav, and aside
                        for tag in soup(["header", "footer", "nav", "aside"]):
                            tag.decompose() 
                        # Remove divs with class "sidebar" or "ad"
                        for div in soup.find_all("div", class_=["sidebar", "ad"]):
                            div.decompose()
                            
                        text = soup.get_text()  # Extract text from the HTML content
                        text = ' '.join(text.split())
                        valid_endpoints.append([url, status, str(text)])  # Save the text along with the URL and status
                        print_status(url, status)
                        links = soup.find_all('a')
                        for link in links:
                            href = link.get('href')
                            if href is not None:
                                full_url = urljoin(domain, href).rstrip('/')
                                clean_url = remove_url_prefix(full_url)
                                slash_count = urlparse(clean_url).path.count('/')
                                if is_valid_url(clean_url) and slash_count <= 2 and clean_url not in visited:
                                    futures[executor.submit(fetch_url, full_url)] = full_url
                                    visited.add(clean_url)
                    else:
                        failed_endpoints.append([url, status, ''])
                        visited.add(url)
                except Exception as e:
                    print_status(url, 'Failed')
                    visited.add(url)
                    continue
        write_to_csv(valid_endpoints, failed_endpoints)
# Clear the csv file
open(FILENAME, 'w').close()

# Crawl the domain
crawl_domain(DOMAIN)

'Most recent URL: https://sites.charlotte.edu/harwood/?sfid=9267&sf_paged=24 \nStatus: Success'

'Successes: 4785, Failures: 0'

An error occurred while fetching http://www.opticscenter.charlotte.edu
An error occurred while fetching https://ideas.charlotte.edu
An error occurred while fetching https://pages.charlotte.edu/employeeownership
An error occurred while fetching https://ofe.charlotte.edu
An error occurred while fetching https://pages.charlotte.edu/connections
An error occurred while fetching https://mrc.charlotte.edu
An error occurred while fetching https://inside-chess.charlotte.edu
An error occurred while fetching https://offcampushousing.charlotte.edu
An error occurred while fetching https://pages.charlotte.edu/heather-smith
An error occurred while fetching https://pages.charlotte.edu/pinku-mukherjee
An error occurred while fetching https://pages.charlotte.edu/juan-meneses
An error occurred while fetching https://registrar.charlotte.edu/webform/student-consent-access-education-records
An error occurred while fetching https://webforms.charlotte.edu/emergencyunccedu/families-nineralerts-update-form
An e



An error occurred while fetching https://pages.charlotte.edu/mc-eppes
An error occurred while fetching https://pages.charlotte.edu/david-vinson
An error occurred while fetching https://pages.charlotte.edu/youngseob-eum
An error occurred while fetching https://hrltma.charlotte.edu:81
An error occurred while fetching https://pages.charlotte.edu/aarif
An error occurred while fetching https://pages.charlotte.edu/mcarney4
An error occurred while fetching https://pages.charlotte.edu/scacace
An error occurred while fetching https://pages.charlotte.edu/lgunn
An error occurred while fetching https://pages.charlotte.edu/jean-claude-thill
An error occurred while fetching https://pages.charlotte.edu/adahl3
An error occurred while fetching https://confucius.charlotte.edu
An error occurred while fetching https://pages.charlotte.edu/jaclyn-piatak/research
An error occurred while fetching https://pages.charlotte.edu/julia-marie-robinson


In [7]:
# PreProcess the data
from collections import Counter
import re

with open(FILENAME, 'r', newline='', encoding="utf-8") as file:
    reader = csv.reader(file)
    next(reader)  # Skip the header row
    data = list(reader)

# Split text into chunks
chunks = [re.split('\.|-', row[2]) for row in data]

# Flatten the list of chunks and count each chunk
counter = Counter(chunk for sublist in chunks for chunk in sublist)

# Define a threshold for what you consider "too frequent"
threshold = 15

# Preprocess the data
preprocessed_data = []
for row in data:
    url = row[0]
    status = row[1]
    text = row[2]
    
    if status == 'Failed' or len(text) < 100:
        continue
    
    # Remove chunks that appear too frequently
    text = '. '.join(chunk for chunk in re.split('\.|-', text) if counter[chunk] <= threshold)
    
    preprocessed_data.append([url, status, text])
    
# Write the preprocessed data to a new CSV file
with open(PREPROCESSED_FILENAME, 'w', newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(['URL', 'Status', 'Text'])  # Write the column labels
    for row in preprocessed_data:
        writer.writerow(row)
    file.flush()
    os.fsync(file.fileno())


In [2]:
%pip install txtai

Collecting txtai
  Using cached txtai-6.3.0-py3-none-any.whl.metadata (24 kB)
Collecting faiss-cpu>=1.7.1.post2 (from txtai)
  Using cached faiss_cpu-1.7.4-cp311-cp311-win_amd64.whl.metadata (1.4 kB)
Using cached txtai-6.3.0-py3-none-any.whl (205 kB)
Using cached faiss_cpu-1.7.4-cp311-cp311-win_amd64.whl (10.8 MB)
Installing collected packages: faiss-cpu, txtai
Successfully installed faiss-cpu-1.7.4 txtai-6.3.0
Note: you may need to restart the kernel to use updated packages.


## Txtai example

In [35]:
## Txtai
import txtai

# Import the preprocessed data
with open(PREPROCESSED_FILENAME, 'r', newline='', encoding="utf-8") as file:
    reader = csv.reader(file)
    next(reader)  # Skip the header row
    data = list(reader)
    
# Create embeddings for each web page
embeddings = txtai.Embeddings(content=True)
embeddings.index([(row[0], row[2]) for row in data])

In [53]:
results = embeddings.search("computer science masters program requirements", 2)
print("computer science masters program requirements")
for result in results:
    print(result['id'])

results = embeddings.search("football team", 2)
print("football team")
for result in results:
    print(result['id'])
    
embeddings.explain("computer science masters program requirements", limit=1)

computer science masters program requirements
https://cci.charlotte.edu/academics/bachelor-degrees
https://ece.charlotte.edu/graduate-program/masters-program
football team
https://crowdfund.charlotte.edu/project/1695
https://crowdfund.charlotte.edu/project/38491


[{'id': 'https://cci.charlotte.edu/academics/bachelor-degrees',
  'text': 'Undergraduate – College of Computing and Informatics Skip to Main Content Undergraduate The College of Computing and Informatics offers a B.  and a B.  in Computer Science as well as an undergraduate certificate in Game Design.  In addition, the College offers an Honors program and an Early Entry program.  The B.  in Computer Science offers seven different concentrations.  The degree has 25 hours of core Computer Science courses, a series of courses to choose from as major electives, and 9. 15 semester hours of related work in a discipline outside computer science, possibly forming a minor in that discipline area.  The B.  in Computer Science offers four (4) concentrations and it is less structured than the B.  providing the student more flexibility in how to combine the B.  with other academic programs of study.  The B.  includes a core of 5 courses that are generally prerequisites for most upper level courses 