# Charter School NLP Fun Stuff

## Import relevant packages

In [None]:
#basic processing
import pandas as pd
import numpy as np 
#networking and scraping
import requests
from bs4 import BeautifulSoup
#text processing
import spacy
# import spacy_transformers #not really usable because of problems with cuda
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
# bring out the nuclear weapons for this job
import os
import openai
#openai.api_key = os.getenv("openai")
# nltk.download('stopwords')
# nltk.download("puntk")
#nltk.data.path.append("C://Users//chris//AppData//Roaming//nltk_data")
from dotenv import load_dotenv
load_dotenv()
import Levenshtein
import ast


In [None]:
# Read the active charter school report from disk.
input_data = pd.read_csv('data/active_charter_schools_report.csv')
# Normalize the headers: strip hyphens, then spaces, then lowercase everything.
new_columns = (
    input_data.columns
    .str.replace("-", "")
    .str.replace(" ", "")
    .str.lower()
)
input_data.columns = new_columns

In [None]:
# Check the header names produced by the clean-up above.
input_data.columns

In [None]:
# Preview the first rows of the loaded report.
input_data.head()

In [None]:
# The part of the contact email after the last "@" stands in for the school's web domain.
input_data['website'] = (
    input_data['principal/directoremail']
    .str.split('@')
    .str[-1]
)
# Columns worth looking at when eyeballing the data.
charter_school_column = [
    'countydescription',
    'schoolname',
    'principal/directoremail',
    'website',
]

In [None]:
# Eyeball the derived website next to the email it came from.
input_data[charter_school_column].head(15)

## Search Google for each school's board of directors page and get ready to parse the results

In [None]:
import requests
from bs4 import BeautifulSoup

## search for the board of directors page from site

def get_google_search_results_boards(school_name: str):
    """Return the first Google result URL for a school's board-of-directors page.

    Builds a Google search for "<school_name> meet the board of directors",
    downloads the results page and scans its anchors for redirect links of the
    form ``/url?q=<target>&...``.

    Args:
        school_name: Name of the school to search for.

    Returns:
        A list holding the first matching result URL, or an empty list when the
        page contains no such link.
    """
    # Imported here so the function does not depend on another cell for this name.
    from urllib.parse import quote_plus

    # Encode the query so characters such as "&" or "#" in a school name
    # cannot cut the search string short.
    query = quote_plus(f"{school_name} meet the board of directors")
    query_url = f"https://www.google.com/search?q={query}"
    response = requests.get(query_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    result_urls = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href yield None; skip them instead of crashing.
        if href is None or not href.startswith('/url?q='):
            continue
        result_urls.append(href.split('/url?q=')[1].split('&')[0])
        # Only the first hit is wanted.
        break

    return result_urls


In [None]:
# One search request per school name; every cell ends up holding a list with at most one URL.
input_data['board_of_directors_link']=input_data['schoolname'].apply(get_google_search_results_boards)

In [None]:
# Checkpoint: save the columns needed downstream so the search step does not have to be repeated.
feature_columns=['countydescription','schoolname','principal/directoremail','zipcode5','boardchairfirstname','boardchairlastname','website','board_of_directors_link']
input_data[feature_columns].to_csv('data/checkdata.csv',index=False)
#checkpoint read input data
#input_data = pd.read_csv('data/checkdata.csv')

## Getting Google featured snippets is problematic

In [None]:
# import requests
# from bs4 import BeautifulSoup

# def get_google_feature_snippet(url):
#     # Send a GET request to the URL and retrieve the HTML content
#     response = requests.get(url)
#     html = response.text
    
#     # Parse the HTML content using BeautifulSoup
#     soup = BeautifulSoup(html, 'html.parser')
    
#     # Find the Google feature snippet element, if one exists
#     feature_snippet = soup.find('div', class_='kp-header')
    
#     # If a feature snippet was found, extract its text content and return it
#     if feature_snippet:
#         return feature_snippet.text.strip()
#     else:
#         return None

## Add a function that determines whether a link belongs to the school's original domain

In [None]:
from urllib.parse import urlparse

def compare_domains(url1, url2):
    """Return True when both URLs point at the same host.

    Hosts are compared case-insensitively and a leading "www." is ignored.

    Args:
        url1: First URL, including its scheme.
        url2: Second URL, including its scheme.

    Returns:
        True if the network locations match after normalization, else False.
    """
    def _normalized_host(url):
        host = urlparse(url).netloc.lower()
        # Strip "www." only as a prefix; removing it anywhere would turn
        # "awww.site.org" into "asite.org" and produce false matches.
        return host[4:] if host.startswith("www.") else host

    return _normalized_host(url1) == _normalized_host(url2)

In [None]:
# A school whose search returned no link has nothing to compare, so it counts
# as a mismatch instead of raising IndexError on the empty list.
input_data['link_domain_match'] = input_data.apply(
    lambda x: bool(x['board_of_directors_link'])
    and compare_domains(f"https://{x['website']}", x['board_of_directors_link'][0]),
    axis=1,
)

In [None]:
# Spot-check a few random rows of the domain comparison.
input_data[['link_domain_match','board_of_directors_link','website']].sample(4)

In [None]:
# How many found links sit on the school's own domain versus elsewhere.
input_data['link_domain_match'].value_counts()

## Add scraper functions that turn a URL into text - several variants are kept here to test which scraper works best

In [None]:
# loop through each URL and scrape the text content
def website_text_content(url: str):
    """Fetch a page and return its text as a single cleaned-up line.

    <header> and <footer> elements are removed, line breaks, tabs and periods
    become spaces, a space is inserted before every capital letter other than
    one at the very start of the text, and one-character tokens (for example
    middle initials) are discarded.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    page = requests.get(url, headers=browser_headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    # drop the page chrome so only body text is left
    for chrome in soup(['header', 'footer']):
        chrome.decompose()
    # turn line breaks, tabs and periods into plain spaces
    raw_text = soup.get_text()
    for separator in ('\n', '\r', '\t', '.'):
        raw_text = raw_text.replace(separator, ' ')
    # collapse runs of whitespace
    collapsed = re.sub(r"\s+", " ", raw_text).strip()
    # put a space in front of capital letters to split words that were glued together
    spaced = re.sub(r'(?<!^)(?=[A-Z])', ' ', collapsed)
    # keep only tokens longer than one character
    return ' '.join(token for token in spaced.split() if len(token) > 1)
    # print the text content


In [None]:
def scrape_middle_content(url:str):
    """Return the element in the middle of a page's main content.

    The main container is the first of <main>, <article> or
    <div role="main"> that exists. Within it, paragraphs, headings, lists,
    list items and blockquotes are collected and the one at the middle index
    is returned.

    Returns:
        The middle element, or None when the request fails, no main container
        is found, or the container holds none of those elements.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    try:
        page = requests.get(url, headers=request_headers)
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching the URL: {e}")
        return None

    soup = BeautifulSoup(page.content, "html.parser")

    # Try the candidate containers in order of preference.
    main_container = None
    for finder_args in (("main",), ("article",), ("div", {"role": "main"})):
        main_container = soup.find(*finder_args)
        if main_container:
            break
    if main_container is None:
        print("Could not find the main content container.")
        return None

    candidates = main_container.find_all(
        ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'blockquote']
    )
    if not candidates:
        print("Could not find any content elements.")
        return None

    return candidates[len(candidates) // 2]

In [None]:
def extract_main_content(url, timeout=30):
    """Download a page and return its text with header, footer and nav removed.

    The text is whitespace-normalized, a space is inserted before every
    capital letter (except one at the very start) to split words that were
    glued together, and one-character tokens such as middle initials are
    dropped. A side effect of those two steps is that all-caps acronyms are
    split into single letters and then discarded.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a single
            unresponsive site cannot stall a whole column of requests.

    Returns:
        The cleaned text, or None when the request fails (connection errors,
        timeouts, malformed URLs and HTTP error statuses).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching the URL: {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Remove page chrome so only body text remains.
    for elem in soup.find_all(['header', 'footer', 'nav']):
        elem.decompose()

    # Join the remaining text fragments and collapse whitespace.
    text_content = ' '.join(soup.stripped_strings)
    text_content = re.sub(r"\s+", " ", text_content).strip()
    # Insert a space before capital letters to separate run-together words.
    text_content = re.sub(r'(?<!^)(?=[A-Z])', ' ', text_content)
    # Drop one-character tokens (middle initials and similar).
    return ' '.join(word for word in text_content.split() if len(word) > 1)

In [None]:
def clean_string(s: str):
    """Strip digits, non-word symbols and non-printable characters from ``s``."""
    without_digits = re.sub(r'\d+', '', s)
    # anything that is neither a word character nor whitespace goes
    without_symbols = re.sub(r'[^\w\s]', '', without_digits)
    # non-printable characters (newlines and tabs included) go last
    return ''.join(ch for ch in without_symbols if ch.isprintable())

import string

def remove_punctuation_and_single_chars(text):
    """Delete ASCII punctuation from ``text`` and drop one-character words."""
    # translation table that maps every punctuation character to "delete"
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))
    stripped = text.translate(punctuation_table)
    return ' '.join(token for token in stripped.split() if len(token) > 1)

In [None]:
# Turn each one-element list into a bare URL string: stringify it, then peel
# off the surrounding brackets and quotes.
input_data['board_of_directors_link'] = (
    input_data['board_of_directors_link']
    .astype(str)
    .str.strip("[]")
    .str.strip("''")
)

In [None]:
# Fetch and clean every board page; rows whose request fails get None.
input_data['director_text_content']=input_data['board_of_directors_link'].apply(extract_main_content)

In [None]:
#input_data.to_csv('check_data_point2.csv',escapechar='\\', index=False)
# Checkpoint: replace the in-memory frame with the saved scrape results instead of scraping again.
input_data = pd.read_csv('check_data_point2.csv')

## Refine the function to find multiple people - it is worth experimenting with different models, because finding people's names is really annoying.

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

def extract_names_bert(text):
    """Extract (first, last) name pairs from ``text`` with a BERT NER model.

    Tokens labelled B-PER start a new name and tokens labelled I-PER extend
    the current one; any other label closes it. Names with fewer than two
    whitespace-separated parts are discarded, and for longer names only the
    first and last parts are kept.

    The tokenizer and model are loaded on the first call and kept on the
    function object, so applying this function across a column does not
    reload them for every row. The tokenizer is called with
    ``truncation=True``.

    Args:
        text: Text to scan for person names.

    Returns:
        A sorted list of unique ``(first, last)`` tuples.
    """
    model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    loaded = getattr(extract_names_bert, "_loaded", None)
    if loaded is None:
        loaded = extract_names_bert._loaded = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForTokenClassification.from_pretrained(model_name),
        )
    tokenizer, model = loaded

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: no need to track gradients.
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=-1)
    tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

    names = []
    current_name_tokens = []
    for token, label_id in zip(tokens, predictions[0]):
        label = model.config.id2label[label_id.item()]
        if label == "I-PER":
            current_name_tokens.append(token)
            continue
        # Any other label closes the name in progress ...
        if current_name_tokens:
            names.append(tokenizer.convert_tokens_to_string(current_name_tokens))
        # ... and B-PER immediately opens the next one.
        current_name_tokens = [token] if label == "B-PER" else []

    if current_name_tokens:
        names.append(tokenizer.convert_tokens_to_string(current_name_tokens))

    # Keep first and last name only; sorting makes the output order repeatable.
    split_names = (name.split() for name in names)
    return sorted({(parts[0], parts[-1]) for parts in split_names if len(parts) >= 2})

## spaCy models are useless for NER here; stick to pretrained BERT models

In [None]:
def find_names_sm(text_content: str):
    """Return the distinct PERSON entities found by spaCy's small English model.

    The model is loaded on the first call and kept on the function object, so
    applying this function across a whole column does not reload it per row.

    Args:
        text_content: Text to scan.

    Returns:
        A list of unique entity strings labelled PERSON, in no particular order.
    """
    nlp = getattr(find_names_sm, "_nlp", None)
    if nlp is None:
        # Bigger models such as "en_core_web_trf" are possible, but were
        # problematic with cuda and spacy_transformers.
        nlp = find_names_sm._nlp = spacy.load("en_core_web_sm")
    doc = nlp(text_content)
    return list({ent.text for ent in doc.ents if ent.label_ == "PERSON"})

In [None]:
def find_names_md(text_content: str):
    """Return the distinct PERSON entities found by spaCy's medium English model.

    The model is loaded on the first call and kept on the function object, so
    applying this function across a whole column does not reload it per row.

    Args:
        text_content: Text to scan.

    Returns:
        A list of unique entity strings labelled PERSON, in no particular order.
    """
    nlp = getattr(find_names_md, "_nlp", None)
    if nlp is None:
        # Bigger models such as "en_core_web_trf" are possible, but were
        # problematic with cuda and spacy_transformers.
        nlp = find_names_md._nlp = spacy.load("en_core_web_md")
    doc = nlp(text_content)
    return list({ent.text for ent in doc.ents if ent.label_ == "PERSON"})

In [None]:
def find_names_lg(text_content: str):
    """Return the distinct PERSON entities found by spaCy's large English model.

    The model is loaded on the first call and kept on the function object, so
    applying this function across a whole column does not reload it per row.

    Args:
        text_content: Text to scan.

    Returns:
        A list of unique entity strings labelled PERSON, in no particular order.
    """
    nlp = getattr(find_names_lg, "_nlp", None)
    if nlp is None:
        # Bigger models such as "en_core_web_trf" are possible, but were
        # problematic with cuda and spacy_transformers.
        nlp = find_names_lg._nlp = spacy.load("en_core_web_lg")
    doc = nlp(text_content)
    return list({ent.text for ent in doc.ents if ent.label_ == "PERSON"})

In [None]:
#input_data['director_text_content'][40]
# Spot-check the scraped text of one row.
(input_data['director_text_content'][90])

In [None]:
# Cast to str first so rows without scraped text still reach the model as text rather than a float NaN.
input_data['director_text_content'] = input_data['director_text_content'].astype(str)
input_data['director_first_last']=input_data['director_text_content'].apply(extract_names_bert) # 10 minutes to complete

In [None]:
## practically useless
# Run all three spaCy model sizes over the same text so their output can be compared side by side.
input_data['director_names_sm']=input_data['director_text_content'].apply(find_names_sm)
input_data['director_names_md']=input_data['director_text_content'].apply(find_names_md)
input_data['director_names_lg']=input_data['director_text_content'].apply(find_names_lg)

In [None]:
# Check which columns exist after the name-extraction steps.
input_data.columns

In [None]:
# Redefine the column selection for the next checkpoint (overwrites the earlier list of the same name).
charter_school_column=['lea','zipcode5','countydescription','schoolname','principal/directoremail','website','board_of_directors_link','link_domain_match','director_first_last']

In [None]:
# Checkpoint: save the BERT name results.
input_data[charter_school_column].to_csv('check_data_point3.csv',escapechar='\\', index=False)

## Nuclear option goooodbye monies

$10 of my money is gone

In [None]:
def find_names_openai(text_content:str):
  """Ask the OpenAI completion endpoint which board members the text names.

  Returns the raw text of the first completion choice; it is not parsed here.
  """
  prompt = f"context: {text_content} \n\n question: Who are the board members with first and last names?, give it to me as a python list If none are available give me an empty list."
  request_settings = dict(
    model="text-davinci-003",
    prompt=prompt,
    temperature=0.86,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
  )
  completion = openai.Completion.create(**request_settings)
  first_choice = completion.choices[0]
  return first_choice.text

# Define function to chunk a long text into smaller pieces

def chunk_text(text, chunk_size=5000):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each piece; must be positive.

    Returns:
        A list of substrings in original order. Every piece except possibly
        the last has exactly ``chunk_size`` characters. An empty ``text``
        gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the previous loop never
            terminated in that case).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

def find_names_in_chunks(text_chunks:list):
  """Run find_names_openai on every chunk and return the raw answers in order."""
  return [find_names_openai(piece) for piece in text_chunks]

In [None]:
# Strip digits and symbols, then cut each page into pieces of the default chunk size for the completion calls.
input_data['clean_director_text_content']= input_data['director_text_content'].apply(clean_string)
input_data['text_chunks'] = input_data['clean_director_text_content'].apply(chunk_text)
#input_data['director_names_openai']=input_data['text_chunks'].apply(find_names_in_chunks) # that was 10$ 

In [None]:
#input_data.to_csv('check_data_point4.csv',escapechar='\\', index=False)
# Checkpoint: reload the saved results instead of paying for the completion calls again.
# NOTE(review): presumably this file carries the director_names_openai column used below — confirm.
input_data = pd.read_csv('check_data_point4.csv')

In [None]:
## checkpoint marker
# charter_school_column=['countydescription','schoolname','principal/directoremail','website','director_names_openai']
# input_data[charter_school_column].to_csv('director_name.csv',index=False)

In [None]:
def openai_cleaner(input_string:str):
    """Reduce a raw completion answer to plain text containing only names.

    Removes list and quote punctuation, parentheses, the placeholder "NA",
    the "Answer:" prefix, escaped newline markers and a fixed set of role
    words, and turns real line breaks into single spaces.

    Args:
        input_string: Raw answer text (or a stringified list of answers).

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    # Punctuation left over from the stringified Python list.
    for symbol in ("'", "[", "]", '"'):
        input_string = input_string.replace(symbol, "")
    # Drop the placeholder "NA" only as a whole word, so names such as "DANA" survive.
    input_string = re.sub(r"\bNA\b", "", input_string)
    # Newline markers that reached the text as literal backslash sequences.
    input_string = input_string.replace("\\\\n\\\\n", "")
    input_string = input_string.replace("(", "").replace(")", "")
    input_string = input_string.replace("\\\\n", "")
    input_string = input_string.replace("Answer:", "")
    # Real line breaks separate entries: collapse each run into one space so
    # names on neighbouring lines do not fuse together.
    input_string = re.sub(r"(\n\s*)+", " ", input_string)
    input_string = input_string.strip()

    # Role words and separators that are not part of anyone's name.
    words_to_remove = ['Director', 'Assistant', 'Parent Seat','Parent', 'Seat', 'Community', 'Board', 'Treasurer','Alumni','-','Business Analyst']
    for word in words_to_remove:
        input_string = input_string.replace(word, "")

    # Removing trailing role words can leave whitespace behind.
    return input_string.strip()

# def second_cleaner(input_string:list):
#     words_to_remove = ['Director', 'Assistant', 'Parent Seat','Parent', 'Seat', 'Community', 'Board', 'Treasurer','Alumni','-']
#     for word in words_to_remove:
#         input_string = input_string.str.replace(word, '', regex=False)
#     return input_string

# def remove_words(words_list: list):
#     remove_list = ['Director', 'Assistant', 'Parent Seat','Parent', 'Seat', 'Community', 'Board', 'Treasurer','Alumni','-']
#     return [word for word in words_list if word not in remove_list]

In [None]:
# Clean the raw completion answers, then let the large spaCy model pick the person names out of the cleaned text.
input_data['director_names_openai_clean']=input_data['director_names_openai'].apply(openai_cleaner)
input_data['cleaner_names'] = input_data['director_names_openai_clean'].apply(find_names_lg)

In [None]:
# Look over the extracted name lists.
input_data['cleaner_names']

In [None]:
# Spot-check the names found for a single row.
input_data['cleaner_names'][10]

In [None]:
# len(input_data[input_data['cleaner_names'] == '[]' ])
def count_empty_lists(lst):
    """Return True when ``lst`` has no elements, otherwise False."""
    return not len(lst)
# apply the function to each element of the column and count the number of empty lists
# (summing the booleans counts the rows for which no name was found)
num_empty_lists = input_data['cleaner_names'].apply(count_empty_lists).sum()
print(num_empty_lists) 

In [None]:
# Check the available columns before choosing which ones to export.
input_data.columns

### 74 schools that don't have a list of board of directors

In [None]:
# Export the names for manual cleaning.
# NOTE(review): 'officialschoolname' and 'Hyperlink' are not created anywhere in this notebook; presumably they come from the reloaded checkpoint file — confirm.
input_data[['schoolname','officialschoolname','zipcode5','Hyperlink','cleaner_names']].to_csv('names_cleaning.csv',index=False)

## Bert Model has a problem with black names :(

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the dslim/bert-base-NER checkpoint and wrap it in a token-classification pipeline.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Sarah Crofer and I live in Berlin"

ner_results = nlp(example)
names = []
this_name = []
all_names_list_tmp = []

# Group word pieces into names: B-PER opens a new name, I-PER extends the current one.
for ner_dict in ner_results:
    if ner_dict['entity'] == 'B-PER':
        if this_name:
            all_names_list_tmp.append([this_name])
        this_name = [ner_dict['word']]
    elif ner_dict['entity'] == 'I-PER':
        this_name.append(ner_dict['word'])

# Flush the last name only if one was started; otherwise text without any
# person would produce a spurious empty entry.
if this_name:
    all_names_list_tmp.append([this_name])

print(all_names_list_tmp)

# Join the pieces of each name; "##" marks a word piece that continues the previous token.
final_name_list = []
for name_list in all_names_list_tmp:
    full_name = ' '.join(name_list[0]).replace(' ##', '').replace(' .', '.')
    final_name_list.append([full_name])

print(final_name_list)

# for result in ner_results:
#     if result['entity'] == 'B-PER' or result['entity'] == 'I-PER':
#         names.append(result['word'])

#print(names)

## Coding archive - we could probably use this to make it better next time

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import spacy

# Archived experiment: push text through a plain BERT encoder and, separately,
# extract PERSON entities with spaCy.
# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
model = BertModel.from_pretrained('bert-large-cased')

# Load the spaCy model for named entity recognition
nlp = spacy.load("en_core_web_md")

# Define the text to analyze
# NOTE(review): test_content is not defined anywhere in the visible notebook; it must exist before this cell runs — confirm where it comes from.
text = test_content

# Tokenize the text and convert to PyTorch tensors
tokens = tokenizer.tokenize(text)
# Add the sequence start/end markers by hand
tokens = ['[CLS]'] + tokens + ['[SEP]']
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_tensor = torch.tensor([token_ids])

# Run the text through the BERT model
# NOTE(review): last_hidden_states is not used by the rest of this cell; the names below come from spaCy alone.
with torch.no_grad():
    last_hidden_states = model(token_tensor)[0]

# Extract the named entities using spaCy
named_entities = []
doc = nlp(text)
for ent in doc.ents:
    if ent.label_ == "PERSON":
        named_entities.append(ent.text)

print(named_entities)

In [None]:
#holy shit finding peoples names are difficult
## refine people's names and separate by first and last name - find likely voter registration affiliation.

## Voter Lookup 

In [None]:
# Load the statewide voter file: tab-separated, first row is the header, decoded as ISO-8859-1.
state_voter=pd.read_csv('data/ncvoter_Statewide.txt',sep="\t", header=0,encoding='ISO-8859-1')
#input_data_copy = pd.read_excel('data/check_data_point3.xlsx')

In [None]:
# List the voter file's columns.
state_voter.columns

In [None]:
# Preview the columns relevant to name matching and affiliation.
state_voter[['voter_reg_num','voter_status_desc','first_name','last_name','birth_year','zip_code','race_code','ethnic_code','party_cd','gender_code']]

In [None]:
# Keep only ACTIVE registrations that have a zip code; the zip is needed later for the distance tie-break.
state_voter=state_voter.loc[(state_voter['voter_status_desc'] == 'ACTIVE') & (state_voter['zip_code'].notna())]

In [None]:
# Sanity check: look up one specific voter by exact first and last name.
state_voter[(state_voter['first_name']=='CARY') & (state_voter['last_name']=='CAIN')]

In [None]:
# how='all' drops a row only when BOTH first and last name are missing; rows missing just one of them are kept.
# NOTE(review): if rows with a single missing name should go too, this would need how='any' — confirm intent.
state_voter=state_voter.dropna(subset=['first_name','last_name'], how='all')

In [None]:
# Make both name columns plain strings so they can be joined into a full name.
state_voter[['first_name','last_name']]=state_voter[['first_name','last_name']].astype(str)

In [None]:
# Build "FIRST LAST" with one vectorized concatenation; a row-by-row apply
# gives the same strings but is needlessly slow on a statewide voter file.
state_voter['full_name'] = state_voter['first_name'].str.cat(state_voter['last_name'], sep=' ')

## zipcode calculator

## Levenshtein distance lexical name similarity

In [None]:
def lexical_similarity(name, name_column):
    """Score every entry of ``name_column`` against ``name`` with ``Levenshtein.ratio``.

    Returns the result of ``name_column.apply``: one ratio per entry.
    """
    def _ratio_to_name(candidate):
        return Levenshtein.ratio(name, candidate)

    return name_column.apply(_ratio_to_name)

In [None]:
# NOTE(review): in this file input_data_copy is first assigned further down (the read of data/final_checkpoint.csv), so this cell only works if that cell was run earlier — confirm execution order.
input_data_copy.columns

In [None]:
# Spot-check one school: its name, zip code and extracted board names.
index_number = 3
print(input_data['schoolname'][index_number])
print(input_data['zipcode5'][index_number])
# Use index_number here as well, so all three lookups always describe the same row.
input_data_copy['cleaner_names'][index_number]

In [None]:
# Load the postal-code table (no header row) and give its columns names.
file_path = "data/US.txt"
us_zip = pd.read_table(file_path, header=None)
us_zip.columns = [
    "country_code", "postal_code", "place_name",
    "admin_name1", "admin_code1",
    "admin_name2", "admin_code2",
    "admin_name3", "admin_code3",
    "latitude", "longitude", "accuracy",
]
# Postal codes become strings left-padded with zeros to five characters.
us_zip['postal_code'] = us_zip['postal_code'].apply(lambda code: str(code).zfill(5))
# Voter zip codes go through int and then str so they can be matched against postal_code.
state_voter['zip_code'] = state_voter['zip_code'].astype(int).astype(str)
# Left join: every voter row is kept, with coordinates attached where the zip matches.
state_voter = pd.merge(
    state_voter,
    us_zip[['postal_code', 'latitude', 'longitude']],
    left_on='zip_code',
    right_on='postal_code',
    how='left',
)

In [None]:
# name_to_compare = 'BRANDON RUSSELL'
# similarity_scores = lexical_similarity(name_to_compare, state_voter['full_name'])
# state_voter['Similarity'] = similarity_scores
# parse_zip=state_voter[['full_name','party_cd','Similarity','zip_code','latitude','longitude']].sort_values(by='Similarity', ascending=False).head(50)
# parse_zip.reset_index(inplace=True)

In [None]:
from geopy.distance import geodesic

def calculate_distance(lat, lon, lat_column, lon_column):
    """Return the geodesic distance in miles from one point to each listed point.

    Args:
        lat: Latitude of the reference point.
        lon: Longitude of the reference point.
        lat_column: Iterable of latitudes to measure to.
        lon_column: Iterable of longitudes, paired with ``lat_column`` by position.

    Returns:
        A float Series with a fresh 0..n-1 index holding one distance per
        pair. The distance is NaN whenever the reference point or the pair has
        a missing coordinate.
    """
    reference_missing = pd.isna(lat) or pd.isna(lon)
    distance_column = []
    for lat2, lon2 in zip(lat_column, lon_column):
        if reference_missing or pd.isna(lat2) or pd.isna(lon2):
            # Zip codes without a match in the postal table leave NaN
            # coordinates; record the distance as unknown rather than passing
            # a non-finite point to geodesic.
            distance_column.append(float("nan"))
        else:
            distance_column.append(geodesic((lat, lon), (lat2, lon2)).miles)
    return pd.Series(distance_column, dtype="float64")

# # Reference coordinates (latitude, longitude)
# zipcode = '27217'
# ref_lat = us_zip['latitude'].loc[us_zip['postal_code'] == zipcode].iloc[0]
# ref_lon = us_zip['longitude'].loc[us_zip['postal_code'] == zipcode].iloc[0]

# # Calculate distances and add as a new column in the DataFrame
# parse_zip['Distance']=calculate_distance(ref_lat, ref_lon, parse_zip['latitude'], parse_zip['longitude'])

#parse_zip

In [None]:
# board_party_affilation=parse_zip.sort_values(by = ['Similarity', 'Distance'], ascending = [False,True]).head(1)['party_cd'].values.tolist()
# party_list =[]
# party_list.append(board_party_affilation[0])
# party_list

In [None]:
# take note of this issue
# Zip codes of voters that received no coordinates from the postal-code merge.
state_voter[state_voter["latitude"].isnull() | state_voter["longitude"].isnull()]['zip_code']

In [None]:
# Check whether one particular zip code exists in the postal-code table.
us_zip[us_zip.postal_code == '27838']

In [None]:
def find_political_affilation(list_names:list,lat,lon):
    """Guess a party code for each name from the closest-matching registered voter.

    For every name, the 50 voters whose ``full_name`` is most similar
    (Levenshtein ratio against the upper-cased name) are shortlisted. Within
    that shortlist the best match is the one with the highest similarity,
    ties broken by the smallest distance to the reference point.

    The similarity scores stay local to the call; the shared ``state_voter``
    frame is not modified.

    Args:
        list_names: Names to look up.
        lat: Latitude of the reference point (the school).
        lon: Longitude of the reference point (the school).

    Returns:
        A list with one ``party_cd`` value per input name, in input order.
    """
    candidate_columns = ['full_name', 'party_cd', 'zip_code', 'latitude', 'longitude']
    party_list = []
    for names in list_names:
        name_to_compare = names.upper()
        similarity_scores = lexical_similarity(name_to_compare, state_voter['full_name'])
        # Pick the top 50 directly instead of sorting the whole voter table.
        top_index = similarity_scores.nlargest(50).index
        parse_zip = state_voter.loc[top_index, candidate_columns].copy()
        parse_zip['Similarity'] = similarity_scores.loc[top_index]
        # Fresh 0..n-1 index so the distance Series lines up row by row.
        parse_zip.reset_index(inplace=True)
        parse_zip['Distance'] = calculate_distance(lat, lon, parse_zip['latitude'], parse_zip['longitude'])
        best_match = parse_zip.sort_values(by=['Similarity', 'Distance'], ascending=[False, True]).head(1)
        board_party_affilation = best_match['party_cd'].values.tolist()
        party_list.append(board_party_affilation[0])
    return party_list

In [None]:
import heapq
from typing import List, Tuple
#gpt4 political affiliation help on how to do this
def optimize_find_political_affiliation(list_names: List[str], lat: float, lon: float) -> List[str]:
    """Guess a party code for each name, keeping only the 50 best candidates in memory.

    Same selection rule as the pandas-based lookup: among the 50 voters with
    the highest Levenshtein ratio to the upper-cased name, take the one with
    the highest ratio, ties broken by the smallest distance to the reference
    point.

    Args:
        list_names: Names to look up.
        lat: Latitude of the reference point.
        lon: Longitude of the reference point.

    Returns:
        One ``party_cd`` value per input name, in input order.
    """
    full_names = state_voter['full_name']
    party_list = []
    for name in list_names:
        name = name.upper()
        # Score single strings with Levenshtein.ratio directly;
        # lexical_similarity expects a whole column, not one name.
        scored = (
            (Levenshtein.ratio(name, voter_name), position)
            for position, voter_name in enumerate(full_names)
        )
        scores = heapq.nlargest(50, scored)
        top_positions = [position for _, position in scores]
        # enumerate yields row positions, so look rows up with iloc rather
        # than by index label.
        distances = calculate_distance(
            lat,
            lon,
            state_voter['latitude'].iloc[top_positions],
            state_voter['longitude'].iloc[top_positions],
        )
        # Highest similarity first, nearest voter first among ties.
        sorted_records = sorted(zip(scores, distances), key=lambda x: (-x[0][0], x[1]))
        top_record_position = sorted_records[0][0][1]
        party_list.append(state_voter['party_cd'].iloc[top_record_position])
    return party_list

In [None]:
# Load the cleaned names checkpoint (latin-1 encoded).
input_data_copy=pd.read_csv('data/final_checkpoint.csv',encoding='latin-1')
## data preprocessing after reading
input_data_copy['zipcode']=input_data_copy['zipcode'].astype(str)
# The name lists were stored as text in the CSV; parse them back into Python lists.
input_data_copy['cleaner_names']=input_data_copy['cleaner_names'].apply(ast.literal_eval)
#input_data_copy=pd.merge(input_data_copy,us_zip[['postal_code', 'latitude', 'longitude']],left_on='zipcode',right_on='postal_code',how='left') 
# NOTE(review): with the merge above commented out, the latitude/longitude columns used in the next step must already be in the CSV — confirm.


In [None]:
# For every school, look up a party code for each board member name, using the school's coordinates for the distance tie-break.
input_data_copy['political_affilation']=input_data_copy.apply(lambda x: find_political_affilation(x['cleaner_names'],x['latitude'],x['longitude']),axis=1)

In [None]:
# Checkpoint: save the affiliation results.
input_data_copy.to_csv('data/final_checkpoint_clean.csv',index=False)

In [None]:
# Single-row check of the affiliation lookup, using the same three arguments
# as the row-wise apply but for row 0 only. cleaner_names is already parsed
# into lists after loading, so it is passed as is.
find_political_affilation(
    input_data_copy['cleaner_names'][0],
    input_data_copy['latitude'][0],
    input_data_copy['longitude'][0],
)

In [None]:
def count_dem(lst):
    """Return how many times 'DEM' occurs in ``lst``."""
    party_code = 'DEM'
    return lst.count(party_code)


def count_rep(lst):
    """Return how many times 'REP' occurs in ``lst``."""
    party_code = 'REP'
    return lst.count(party_code)


def count_una(lst):
    """Return how many times 'UNA' occurs in ``lst``."""
    party_code = 'UNA'
    return lst.count(party_code)

In [None]:
# Tally each school's board members by party code.
input_data_copy['count_dem']=input_data_copy['political_affilation'].apply(count_dem)
input_data_copy['count_rep']=input_data_copy['political_affilation'].apply(count_rep)
input_data_copy['count_una']=input_data_copy['political_affilation'].apply(count_una)

In [None]:
# Preview the rows with the new count columns.
input_data_copy.head(5)

In [None]:
# Schools ranked by number of board members matched to 'DEM', highest first.
input_data_copy.sort_values(by = ['count_dem'], ascending = [False]).head(30)

In [None]:
# Overwrite the checkpoint so it also includes the party count columns.
input_data_copy.to_csv('data/final_checkpoint_clean.csv',index=False)