# Charter School NLP Fun Stuff

## Import relevant packages

In [None]:
#basic processing
import pandas as pd
import numpy as np 
#networking and scraping
import requests
from bs4 import BeautifulSoup
#text processing
import spacy
# import spacy_transformers #not really usable because of problems with cuda
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
# bring out the nuclear weapons for this job
import os
import openai
from dotenv import load_dotenv
# Load the .env file BEFORE reading the key from the environment; reading it first
# meant a key that is only defined in .env was never picked up.
load_dotenv()
openai.api_key = os.getenv("openai")
# nltk.download('stopwords')
# nltk.download("punkt")
#nltk.data.path.append("C://Users//chris//AppData//Roaming//nltk_data")


In [None]:
# Load the data
# Read the active charter school report into a DataFrame
input_data = pd.read_csv('data/active_charter_schools_report.csv')
# Tidy the headers: drop hyphens and spaces, then lower-case what is left
new_columns = input_data.columns.str.replace("-", "")
new_columns = new_columns.str.replace(" ", "").str.lower()
input_data.columns = new_columns

In [None]:
input_data.columns

In [None]:
input_data.head()

In [None]:
# Whatever follows the last '@' of the contact e-mail is used as the school's web domain
input_data['website'] = (
    input_data['principal/directoremail']
    .str.split('@')
    .str[-1]
)
# columns worth looking at
charter_school_column = [
    'countydescription',
    'schoolname',
    'principal/directoremail',
    'website',
]

In [None]:
input_data[charter_school_column].head(30)

## Search Google for each school's board of directors page and get the link ready for parsing

In [None]:
import requests
from bs4 import BeautifulSoup

def get_google_search_results_boards(school_name: str):
    """Return the first Google result URL for a school's board-of-directors page.

    Runs a Google web search for ``"<school_name> meet the board of directors"``
    and scans the anchors of the returned HTML for Google redirect links of the
    form ``/url?q=<target>&...``.

    Args:
        school_name: Name of the school to search for.

    Returns:
        A list holding the first target URL found, or an empty list when the
        page contains no redirect link.
    """
    # Let requests build the query string so that names containing '&', '#',
    # '+' or other reserved characters are encoded instead of corrupting the URL.
    response = requests.get(
        "https://www.google.com/search",
        params={"q": f"{school_name} meet the board of directors"},
        timeout=30,  # do not hang the whole DataFrame.apply on one stalled request
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href yield None; skip them rather than crash on .startswith
        if href and href.startswith('/url?q='):
            # Keep only the target, dropping Google's trailing '&...' parameters
            return [href.split('/url?q=')[1].split('&')[0]]

    return []


In [None]:
input_data['board_of_directors_link']=input_data['schoolname'].apply(get_google_search_results_boards)

In [None]:
#input_data.to_csv('checkdata.csv',index=False)
#checkpoint read input data
input_data = pd.read_csv('checkdata.csv')

In [None]:
# loop through each URL and scrape the text content
def _normalise_page_text(raw_text: str) -> str:
    """Flatten raw page text into single-space-separated words longer than one character."""
    # Periods become spaces; every run of whitespace (newlines, tabs, ...) collapses to one space
    text = re.sub(r"\s+", " ", raw_text.replace('.', ' ')).strip()
    # Insert a space before every uppercase letter except at the very start,
    # which separates words that were glued together (e.g. "JohnSmith")
    text = re.sub(r'(?<!^)(?=[A-Z])', ' ', text)
    # Drop single-character tokens (middle initials and similar fragments)
    return ' '.join(word for word in text.split() if len(word) > 1)


def website_text_content(url: str):
    """Download a web page and return its visible text as one cleaned-up string.

    ``<header>`` and ``<footer>`` elements are removed before the text is
    extracted; the text is then normalised by ``_normalise_page_text``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The cleaned page text. An empty string is returned when ``url`` is
        blank or not a string (e.g. NaN for schools without a search hit), so
        such rows no longer abort a ``DataFrame.apply``.
    """
    if not isinstance(url, str) or not url.strip():
        return ''

    # Send a browser-like User-Agent with the request
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # remove the header and footer from the HTML content
    for tag in soup(['header', 'footer']):
        tag.decompose()

    return _normalise_page_text(soup.get_text())


In [None]:
def clean_string(s: str):
    """Strip digits, punctuation and non-printable characters from a string.

    Word characters (letters and underscore) and spaces are kept. Characters
    for which ``str.isprintable()`` is false, such as tabs and newlines, are
    removed.
    """
    # One pass removes both digit runs and anything that is neither a word character nor whitespace
    without_noise = re.sub(r'\d+|[^\w\s]', '', s)
    # Keep only printable characters
    return ''.join(ch for ch in without_noise if ch.isprintable())

In [None]:
# The search cell writes its result to 'board_of_directors_link', while this cell reads
# 'board_of_directors' (presumably the column name stored in checkdata.csv - TODO confirm).
# Prefer the original name when it exists and fall back to the other instead of raising KeyError.
link_column = 'board_of_directors' if 'board_of_directors' in input_data.columns else 'board_of_directors_link'
# Strip the surrounding list brackets and quotes from the stored value
input_data['Hyperlink'] = input_data[link_column].str.strip("[]").str.strip("''")

In [None]:
input_data['director_text_content']=input_data['Hyperlink'].apply(website_text_content)

In [None]:
#input_data.to_csv('check_data_point2.csv',escapechar='\\', index=False)
#test_input = pd.read_csv('check_data_point2.csv')

## Refine function to find multiple people - you should really try to experiment with different models because it's really annoying to find people.

In [None]:
def find_names_sm(text_content: str):
    """Return the distinct PERSON entities spaCy's ``en_core_web_sm`` finds in a text.

    Args:
        text_content: Text to scan for people's names.

    Returns:
        The unique entity texts labelled ``PERSON``, in order of first
        appearance. Blank or non-string input (e.g. NaN read back from a CSV)
        yields an empty list.
    """
    # Nothing to tag: skip the model entirely
    if not isinstance(text_content, str) or not text_content.strip():
        return []
    # Load the model once and keep it on the function object, so it is not
    # reloaded for every row when this is used with DataFrame.apply.
    nlp = getattr(find_names_sm, "_nlp", None)
    if nlp is None:
        # you can change the model to bigger ones like "en_core_web_trf" - problematic with cuda and spacy_transformers
        nlp = find_names_sm._nlp = spacy.load("en_core_web_sm")
    # process the text with the NER model
    doc = nlp(text_content)
    # dict.fromkeys drops duplicates while keeping first-seen order
    # (the order of list(set(...)) can differ from run to run)
    return list(dict.fromkeys(ent.text for ent in doc.ents if ent.label_ == "PERSON"))

In [None]:
def find_names_md(text_content: str):
    """Return the distinct PERSON entities spaCy's ``en_core_web_md`` finds in a text.

    Args:
        text_content: Text to scan for people's names.

    Returns:
        The unique entity texts labelled ``PERSON``, in order of first
        appearance. Blank or non-string input (e.g. NaN read back from a CSV)
        yields an empty list.
    """
    # Nothing to tag: skip the model entirely
    if not isinstance(text_content, str) or not text_content.strip():
        return []
    # Load the model once and keep it on the function object, so it is not
    # reloaded for every row when this is used with DataFrame.apply.
    nlp = getattr(find_names_md, "_nlp", None)
    if nlp is None:
        # you can change the model to bigger ones like "en_core_web_trf" - problematic with cuda and spacy_transformers
        nlp = find_names_md._nlp = spacy.load("en_core_web_md")
    # process the text with the NER model
    doc = nlp(text_content)
    # dict.fromkeys drops duplicates while keeping first-seen order
    # (the order of list(set(...)) can differ from run to run)
    return list(dict.fromkeys(ent.text for ent in doc.ents if ent.label_ == "PERSON"))

In [None]:
def find_names_lg(text_content: str):
    """Return the distinct PERSON entities spaCy's ``en_core_web_lg`` finds in a text.

    Args:
        text_content: Text to scan for people's names.

    Returns:
        The unique entity texts labelled ``PERSON``, in order of first
        appearance. Blank or non-string input (e.g. NaN read back from a CSV)
        yields an empty list.
    """
    # Nothing to tag: skip the model entirely
    if not isinstance(text_content, str) or not text_content.strip():
        return []
    # Load the model once and keep it on the function object, so it is not
    # reloaded for every row when this is used with DataFrame.apply.
    nlp = getattr(find_names_lg, "_nlp", None)
    if nlp is None:
        # you can change the model to bigger ones like "en_core_web_trf" - problematic with cuda and spacy_transformers
        nlp = find_names_lg._nlp = spacy.load("en_core_web_lg")
    # process the text with the NER model
    doc = nlp(text_content)
    # dict.fromkeys drops duplicates while keeping first-seen order
    # (the order of list(set(...)) can differ from run to run)
    return list(dict.fromkeys(ent.text for ent in doc.ents if ent.label_ == "PERSON"))

In [None]:
input_data['director_names_sm']=input_data['director_text_content'].apply(find_names_sm)
input_data['director_names_md']=input_data['director_text_content'].apply(find_names_md)
input_data['director_names_lg']=input_data['director_text_content'].apply(find_names_lg)

In [None]:
#input_data.to_csv('check_data_point3.csv',escapechar='\\', index=False)

## Nuclear option goooodbye monies

$10 of my money is gone

In [None]:
def find_names_openai(text_content:str):
    """Ask the OpenAI completion endpoint for the board members named in a text.

    The text is sent as context together with a fixed question asking for the
    members' first and last names as a Python list (or ``NA`` when none are
    available).

    Returns:
        The raw text of the first completion choice.
    """
    # NOTE(review): this relies on the legacy `openai.Completion` interface and the
    # "text-davinci-003" model - confirm both are still available in the installed SDK.
    prompt = f"context: {text_content} \n\n question: Who are the board members with first and last names?, give it to me as a python list If none are available give me NA"
    request_settings = {
        "model": "text-davinci-003",
        "prompt": prompt,
        "temperature": 0.86,
        "max_tokens": 256,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    completion = openai.Completion.create(**request_settings)
    first_choice = completion.choices[0]
    return first_choice.text

# Define function to chunk a long text into smaller pieces

def chunk_text(text, chunk_size=3000):
    """Split a string into consecutive pieces of at most ``chunk_size`` characters.

    Chunks are cut purely by character count, so a word may be split across
    two neighbouring chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each piece; must be positive.

    Returns:
        The list of pieces in order; an empty list for an empty string.

    Raises:
        ValueError: If ``chunk_size`` is not positive. The hand-written loop
            this replaces never terminated in that case.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    # Slicing clamps at the end of the string, so the last piece is simply shorter
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

def find_names_in_chunks(text_chunks:list):
    """Run ``find_names_openai`` on every chunk and collect the answers in order.

    Returns:
        A list with one raw answer per chunk; empty when ``text_chunks`` is empty.
    """
    return [find_names_openai(piece) for piece in text_chunks]

In [None]:
input_data['clean_director_text_content']= input_data['director_text_content'].apply(clean_string)
input_data['text_chunks'] = input_data['clean_director_text_content'].apply(chunk_text)
#input_data['director_names_openai']=input_data['text_chunks'].apply(find_names_in_chunks) # that was 10$ 

In [None]:
#input_data.to_csv('check_data_point4.csv',escapechar='\\', index=False)
input_data = pd.read_csv('check_data_point4.csv')

In [None]:
## checkpoint marker
# charter_school_column=['countydescription','schoolname','principal/directoremail','website','director_names_openai']
# input_data[charter_school_column].to_csv('director_name.csv',index=False)

In [None]:
def openai_cleaner(input_string:str):
    """Reduce a raw OpenAI answer to plain text containing (mostly) just names.

    Removes list and quote punctuation, parentheses, newlines (real ones and
    the escaped ``\\\\n`` form), the ``Answer:`` prefix, the standalone ``NA``
    placeholder and a fixed set of role words.

    Args:
        input_string: Raw answer text.

    Returns:
        The cleaned text with surrounding whitespace stripped. Non-string
        input (e.g. NaN read back from a CSV) yields an empty string.
    """
    if not isinstance(input_string, str):
        return ''

    # Literal fragments to drop, in order. Removing '\n' here is why no later
    # newline-based regex is needed: nothing containing a newline can remain.
    for fragment in ("'", '[', ']', '"', '(', ')', '\n', '\\\\n', 'Answer:'):
        input_string = input_string.replace(fragment, '')

    # Remove the "NA" placeholder only as a whole word, so that names which
    # merely contain those letters (e.g. "DIANA") are left intact.
    input_string = re.sub(r'\bNA\b', '', input_string)

    words_to_remove = ['Director', 'Assistant', 'Parent Seat','Parent', 'Seat', 'Community', 'Board', 'Treasurer','Alumni','-','Business Analyst']
    for word in words_to_remove:
        input_string = input_string.replace(word, "")

    # Strip last, so whitespace left behind by the removals above is trimmed too
    return input_string.strip()

# def second_cleaner(input_string:list):
#     words_to_remove = ['Director', 'Assistant', 'Parent Seat','Parent', 'Seat', 'Community', 'Board', 'Treasurer','Alumni','-']
#     for word in words_to_remove:
#         input_string = input_string.str.replace(word, '', regex=False)
#     return input_string

# def remove_words(words_list: list):
#     remove_list = ['Director', 'Assistant', 'Parent Seat','Parent', 'Seat', 'Community', 'Board', 'Treasurer','Alumni','-']
#     return [word for word in words_list if word not in remove_list]

In [None]:
input_data['director_names_openai_clean']=input_data['director_names_openai'].apply(openai_cleaner)
input_data['cleaner_names'] = input_data['director_names_openai_clean'].apply(find_names_lg)

In [None]:
input_data['cleaner_names']

In [None]:
input_data['cleaner_names'][10]

In [None]:
# len(input_data[input_data['cleaner_names'] == '[]' ])
def count_empty_lists(lst):
    """Return True when ``lst`` has no elements, False otherwise.

    Despite its name this does not count anything itself; summing the boolean
    results over a column gives the number of empty lists.
    """
    is_empty = len(lst) == 0
    return is_empty
# apply the function to each element of the column and count the number of empty lists
num_empty_lists = input_data['cleaner_names'].apply(count_empty_lists).sum()
print(num_empty_lists) 

In [None]:
input_data.columns

### 74 schools that don't have a list of board of directors

In [None]:
input_data[['schoolname','officialschoolname','zipcode5','Hyperlink','cleaner_names']].to_csv('names_cleaning.csv',index=False)

## The BERT model has a problem with Black names :(

In [None]:
# Experiment: extract person names from a sample sentence with the
# "dslim/bert-base-NER" token-classification model.
# Note that this cell rebinds the notebook-level names `tokenizer`, `model` and `nlp`.
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Sarah Crofer and I live in Berlin"

ner_results = nlp(example)
names = []
this_name = []
all_names_list_tmp = []

# Group the token-level tags into names: 'B-PER' opens a new name,
# 'I-PER' continues the current one; every other tag is ignored.
for ner_dict in ner_results:
    if ner_dict['entity'] == 'B-PER':
        # A new person starts: store the previous one first, if there is one
        if this_name:
            all_names_list_tmp.append([this_name])
        this_name = [ner_dict['word']]
    elif ner_dict['entity'] == 'I-PER':
        this_name.append(ner_dict['word'])

# Store the last collected name. Skipped when nothing was collected, which
# previously produced a bogus empty entry ([['']] in the final list).
if this_name:
    all_names_list_tmp.append([this_name])

print(all_names_list_tmp)

final_name_list = []
for name_list in all_names_list_tmp:
    # Join the word pieces: ' ##' marks a continuation piece and is glued to the
    # preceding token; ' .' is re-attached as '.'
    full_name = ' '.join(name_list[0]).replace(' ##', '').replace(' .', '.')
    final_name_list.append([full_name])

print(final_name_list)

# for result in ner_results:
#     if result['entity'] == 'B-PER' or result['entity'] == 'I-PER':
#         names.append(result['word'])

#print(names)

## Coding archive, we probably could use this to make it better next time

In [None]:
# Archived experiment: runs a text through 'bert-large-cased' and, separately,
# extracts PERSON entities from the same text with spaCy's "en_core_web_md".
# This cell rebinds the notebook-level names `tokenizer`, `model` and `nlp`.
from transformers import BertTokenizer, BertModel
import torch
import spacy

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
model = BertModel.from_pretrained('bert-large-cased')

# Load the spaCy model for named entity recognition
nlp = spacy.load("en_core_web_md")

# Define the text to analyze
# NOTE(review): `test_content` is not defined anywhere in the visible notebook -
# presumably a scratch variable from an earlier session; define it before running this cell.
text = test_content

# Tokenize the text and convert to PyTorch tensors
tokens = tokenizer.tokenize(text)
# Add the special start/end markers by hand
tokens = ['[CLS]'] + tokens + ['[SEP]']
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# Wrap in a list so the tensor has a leading dimension of size 1
token_tensor = torch.tensor([token_ids])

# Run the text through the BERT model
# NOTE(review): `last_hidden_states` is not used by anything below in this cell.
with torch.no_grad():
    last_hidden_states = model(token_tensor)[0]

# Extract the named entities using spaCy
named_entities = []
doc = nlp(text)
for ent in doc.ents:
    # Keep only entities that spaCy labels as people
    if ent.label_ == "PERSON":
        named_entities.append(ent.text)

print(named_entities)

In [None]:
# Archived experiment: split spaCy PERSON entities into first and last names.
import spacy

nlp = spacy.load("en_core_web_sm")
# NOTE(review): `text_content` has no top-level definition in the visible notebook
# (it only appears as a function parameter/local) - define it before running this cell.
doc = nlp(text_content)

# Initialize empty lists to store first and last names
first_names = []
last_names = []

# Iterate through the entities in the document and check if they are a person entity
for ent in doc.ents:
    if ent.label_ == "PERSON":
        # Split the entity text into tokens
        tokens = ent.text.split()
        
        # If there are two or more tokens, assume the first is the first name and the last is the last name
        # (entities with a single token are skipped; middle tokens are discarded)
        if len(tokens) >= 2:
            first_names.append(tokens[0])
            last_names.append(tokens[-1])
            
# Print the extracted first and last names
print("First names:", first_names)
print("Last names:", last_names)

In [None]:
#holy shit finding peoples names are difficult

## Refine people's names and separate by first and last name - find likely voter registration affiliation.

## Voter Lookup 

In [None]:
state_voter=pd.read_csv('ncvoter_Statewide.txt',sep="\t", header=0,encoding='ISO-8859-1')

In [None]:
state_voter.columns

In [None]:
state_voter[['voter_reg_num','first_name','last_name','birth_year','zip_code','race_code','ethnic_code','party_cd','gender_code']]

In [None]:
state_voter.shape

In [None]:
state_voter[(state_voter['first_name']=='CARY') & (state_voter['last_name']=='CAIN')].to_csv('voter_example.csv',index = False)