In [None]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from urllib.parse import urlparse

In [None]:
def extract_content(url):
    """Download ``url`` and return its visible text as one cleaned-up string.

    Boilerplate containers (head, header, footer, nav, ...) are removed from
    the parsed page before the text is extracted. Whitespace is then
    collapsed, words glued together in camel case ("FooBar") are split at the
    lower-to-upper boundary, and single-character tokens are dropped.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The cleaned text, or ``None`` when the request or the HTML parsing
        failed (the error is printed in that case).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # The timeout keeps one unresponsive host from blocking a worker forever.
        with requests.get(url, headers=headers, timeout=30) as response:
            response.raise_for_status()
            html = response.content
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while processing URL {url}: {e}")
        return None

    try:
        soup = BeautifulSoup(html, 'html.parser')
        # One tag name at a time: once a container has been removed, the tags
        # nested inside it are gone from the tree and are not visited again.
        # ('foot' and 'navigation' used to be fused into the single, never
        # matching name 'foot,navigation' by a misplaced quote.)
        for tag in ['head', 'header', 'footer', 'foot', 'navigation', 'nav', 'dropdown']:
            for elem in soup.find_all(tag):
                elem.decompose()
        # Extract text from the remaining elements
        text_content = ' '.join(soup.stripped_strings)
        # Collapse runs of whitespace and line breaks
        text_content = re.sub(r"\s+", " ", text_content).strip()
        # Split words that were glued together ("FooBar" -> "Foo Bar"). Only a
        # lower->upper boundary is split, so acronyms such as "NC" stay whole
        # instead of being broken into single letters and filtered out below.
        text_content = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text_content)
        # Drop single-character tokens (e.g. stray middle initials)
        filtered_words = [word for word in text_content.split() if len(word) > 1]
        return ' '.join(filtered_words)
    except Exception as e:
        print(f"An error occurred while parsing HTML for URL {url}: {e}")
        return None

def process_links(links, max_threads=10, extractor=None):
    """Extract the content of every link concurrently.

    The previous version returned from inside the completion loop, so only the
    first link to finish was ever reported. All results are now collected.

    Args:
        links: Iterable of URLs to process.
        max_threads: Maximum number of worker threads.
        extractor: Callable applied to each link; defaults to
            ``extract_content``. Mainly useful for offline testing.

    Returns:
        A dict mapping each link (in input order) to its extracted content.
        A link whose extraction raised is mapped to ``None`` after the error
        has been printed.
    """
    if extractor is None:
        extractor = extract_content

    links = list(links)
    if not links:
        return {}

    results = {}
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures_to_link = {executor.submit(extractor, link): link for link in links}
        for future in as_completed(futures_to_link):
            link = futures_to_link[future]
            try:
                results[link] = future.result()
            except Exception as e:
                print(f"An error occurred while processing URL {link}: {e}")
                results[link] = None

    # as_completed yields in completion order; report in the caller's order.
    return {link: results.get(link) for link in links}

# Example run: fetch and clean the text of a single school homepage.
# The call is the last expression of the cell, so its return value is shown
# as the cell output.
links = ['https://www.charlottelabschool.org/']  # replace with the URLs to process
process_links(links)


In [None]:
import scrapy
from scrapy.linkextractors import LinkExtractor

class CharterSpider(scrapy.Spider):
    """Collect same-domain links from a charter school website.

    Starting from ``start_urls``, every extracted link whose URL does not
    contain one of ``keywords`` is emitted as a ``{"url": ...}`` item and
    scheduled for crawling, until ``link_limit`` links have been collected.
    """

    name = "charter_spider"

    start_urls = [
        "https://www.charlottesecondary.org/",  # Replace with the actual charter school website URL
    ]

    custom_settings = {
        'DEPTH_LIMIT': 1,  # Limit the depth of the scraping to only one level
    }

    link_counter = 0  # Number of links collected so far
    link_limit = 200  # Stop collecting once this many links were emitted
    keywords = ['calendar','events','schedule']  # URLs containing any of these are skipped
    # Extract root domain to restrict link extraction
    parsed_uri = urlparse(start_urls[0])
    root_domain = parsed_uri.netloc

    def parse(self, response):
        """Yield one item and one follow-up request per accepted link.

        Args:
            response: The downloaded page to extract links from.

        Yields:
            ``{"url": <link url>}`` items and ``scrapy.Request`` objects whose
            callback is this same method.
        """
        link_extractor = LinkExtractor(allow_domains=self.root_domain)

        for link in link_extractor.extract_links(response):
            url = link.url
            # Skip calendar-like pages (see ``keywords``)
            if any(keyword in url.lower() for keyword in self.keywords):
                continue

            if self.link_counter >= self.link_limit:
                break  # Link limit reached

            # Count each accepted link exactly once. It used to be counted
            # twice, which halved the effective limit.
            self.link_counter += 1

            yield {
                "url": url
            }
            # scrapy.Request takes no ``follow`` argument (passing it raised
            # TypeError); handing the response back to ``parse`` is what
            # makes the crawl continue.
            yield scrapy.Request(url, callback=self.parse)
            print(url)

In [None]:
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess({
    # Scrapy reads the user agent from the USER_AGENT setting; a 'User-Agent'
    # key is not a setting name, so the value was previously never applied.
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'LOG_LEVEL': 'INFO',
    'DEPTH_LIMIT': 1
})
# Start the spider
process.crawl(CharterSpider)
process.start() # the script will block here until the crawling is finished


In [None]:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

class CollectingPipeline:
    """Item pipeline that keeps every scraped item in an in-memory list."""

    def open_spider(self, spider):
        """Begin the crawl with an empty ``data`` list."""
        self.data = list()

    def process_item(self, item, spider):
        """Remember ``item`` and pass it on unchanged."""
        self.data += [item]
        return item

    def close_spider(self, spider):
        """Print everything that was collected during the crawl."""
        # Do something with the collected items here
        print(self.data)

# Settings for the CrawlerRunner-based crawl started below.
settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'LOG_LEVEL': 'INFO',
    # NOTE(review): CharterSpider.custom_settings also sets DEPTH_LIMIT (to 1);
    # confirm which value is actually in effect for this run.
    'DEPTH_LIMIT': 3,
    # Route scraped items through CollectingPipeline, which is defined in this
    # notebook (hence the '__main__.' module path).
    'ITEM_PIPELINES': {'__main__.CollectingPipeline': 1},  # 1 is the pipeline's order value
}

configure_logging()
runner = CrawlerRunner(settings)
d = runner.crawl(CharterSpider)
# Stop the reactor once the crawl has finished, whether it succeeded or
# failed, so that reactor.run() below returns.
d.addBoth(lambda _: reactor.stop())
# NOTE(review): an earlier cell already calls process.start(); presumably a
# Twisted reactor cannot be started a second time in the same kernel --
# confirm before running both cells in one session.
reactor.run()

In [None]:
# Stop the crawls started through `process`, the CrawlerProcess created in an earlier cell.
# NOTE(review): process.start() is commented there as blocking until crawling
# has finished -- confirm this call is still needed.
process.stop()

In [None]:
# Pipeline plan:
# 1. Read merge_charter back in with pandas to get the school names.
# 2. Look up the Google search result link for each school name.
# 3. Use each school's homepage as the starting point for the web crawler.
# 4. Explode the web crawler results.
# 5. Scrape information from each school website.
# 6. Create embeddings for each school website using OpenAI.
# 7. Look up the school website embeddings with a FAISS index search.


In [1]:
import pandas as pd 
# Load the merged charter/district table; later cells add the
# 'school_homepage' and 'root_domain' columns to this frame.
input_data=pd.read_csv("processed_data/merge_charter_district.csv")

In [None]:
input_data.head(2)

In [None]:
## Process the home page URL

In [None]:
import requests
from bs4 import BeautifulSoup

## Search Google for each school's website (the function below returns the first search-result URL)

def get_google_search_school_website(school_name: str):
    """Return the first Google search-result URL for a charter school.

    The query is ``"<school_name> charter school north carolina"``.

    Args:
        school_name: Name of the school to search for.

    Returns:
        A list containing the first result URL, or an empty list when the
        results page holds no ``/url?q=`` link.
    """
    # Local import so this cell does not depend on a name that the notebook's
    # import cells do not provide.
    from urllib.parse import unquote

    # Passing the query through ``params`` URL-encodes it, so names containing
    # spaces, '&' or '#' no longer corrupt the query string. The timeout
    # keeps a stalled connection from hanging the whole ``apply``.
    response = requests.get(
        "https://www.google.com/search",
        params={"q": f"{school_name} charter school north carolina"},
        timeout=30,
    )
    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href yield None, which used to raise
        # AttributeError on ``.startswith``.
        if href and href.startswith('/url?q='):
            result_url = href.split('/url?q=')[1].split('&')[0]
            # The target is carried as a query-string value; decode its
            # percent-escapes to recover the actual URL.
            return [unquote(result_url)]

    return []

In [None]:
# Runs one Google search request per row; each cell of 'school_homepage'
# becomes the list returned by get_google_search_school_website.
input_data['school_homepage']=input_data['schoolname'].apply(get_google_search_school_website)

In [None]:
input_data['school_homepage']

In [None]:
# Unwrap the one-element result lists into plain values. Guarded so that
# (a) a school with no search hit becomes None instead of raising IndexError,
# and (b) re-running this cell leaves already-unwrapped strings untouched
# rather than truncating them to their first character.
input_data['school_homepage'] = input_data['school_homepage'].apply(
    lambda x: (x[0] if x else None) if isinstance(x, list) else x
)

In [None]:
input_data['school_homepage'].is_unique

In [None]:
# Writes to the same CSV path that input_data was loaded from, so the source
# file is replaced by the version that includes 'school_homepage'.
input_data.to_csv("processed_data/merge_charter_district.csv",index=False)

In [None]:
list(input_data['school_homepage'].head())

In [None]:
input_data['school_homepage']

In [3]:
# Load the crawler output; the frame displayed in the next cell has a 'url' column.
web_crawler=pd.read_csv("crawler/crawler_result.csv")

In [4]:
web_crawler

Unnamed: 0,url
0,https://www.communitydva.org/apps/pages/index....
1,https://www.communitydva.org/apps/video/watch....
2,https://www.durhamcharter.org/privacy-policy/
3,https://www.joycharter.org/virtual-tour
4,https://www.joycharter.org/troubleshooting-videos
...,...
4891,https://www.wilsonpreparatoryacademy.org/apps/...
4892,https://www.wilsonpreparatoryacademy.org/apps/...
4893,https://www.wilsonpreparatoryacademy.org/apps/...
4894,https://www.wilsonpreparatoryacademy.org/apps/...


In [5]:
from urllib.parse import urlparse

In [16]:
# Raw network location of each crawled URL (any 'www.' prefix is kept).
# This column is reassigned further down using normalize().
web_crawler['root_domain']=web_crawler['url'].apply(lambda url: urlparse(url).netloc)

In [15]:
input_data.school_homepage.apply(lambda url: urlparse(url).netloc)

0                        www.myncca.com
1                          ncva.k12.com
2                    www.joycharter.org
3                 www.durhamcharter.org
4                  www.communitydva.org
                     ...               
201    www.tworiverscommunityschool.net
202              www.dillardacademy.org
203                           wpanc.net
204         www.salliebhowardschool.com
205    www.wilsonpreparatoryacademy.org
Name: school_homepage, Length: 206, dtype: object

In [17]:
def normalize(url):
    """Return the lower-cased host of ``url`` without a leading ``www.``.

    The result is used as a join key between tables, so it has to be
    insensitive to letter case (host names are case-insensitive) and must not
    fail on rows whose URL is missing.

    Args:
        url: URL string. Missing values (``None``, ``NaN``) are accepted.

    Returns:
        The normalised host as a string; ``""`` when ``url`` is not a string
        or has no network location.
    """
    # Missing homepages show up as None/NaN rather than strings.
    if not isinstance(url, str):
        return ""
    domain = urlparse(url.strip()).netloc.lower()
    if domain.startswith("www."):
        domain = domain[4:]
    return domain

In [19]:
# Derive the same 'root_domain' key on both frames with normalize(), so that
# homepage URLs and crawled URLs can be joined on it.
input_data['root_domain']=input_data['school_homepage'].apply(normalize)
web_crawler['root_domain']=web_crawler['url'].apply(normalize)

In [21]:
# Join school rows to crawled URLs on 'root_domain'. No `how` is given, so
# pd.merge's default join type applies -- presumably an inner join, which would
# drop rows without a match on the other side; confirm that is intended.
merged_data=pd.merge(input_data, web_crawler, on='root_domain')

In [22]:
merged_data.columns

Index(['countydescription', 'political_afil', 'extract_name_clean_list',
       'schoolname', 'principal/directoremail', 'website',
       'board_of_directors_link', 'link_domain_match', 'director_first_last',
       'cleaner_names', 'zipcode', 'political_affilation', 'postal_code',
       'latitude', 'longitude', 'count_dem', 'count_rep', 'count_una',
       'district_count_dem', 'district_count_rep', 'district_count_una',
       'school_homepage', 'root_domain', 'url'],
      dtype='object')

In [31]:
# Persist only the school name and crawled URL columns, without the index.
merged_data[['schoolname','url']].to_csv('processed_data/charter_links.csv',index=False)
