In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random

# School Directory search-results URL on cde.ca.gov with the query fixed to
# "Los Angeles". fetch_page_data appends "&page=<n>" to pick a results page.
BASE_URL = "https://www.cde.ca.gov/SchoolDirectory/districtschool?allSearch=Los+Angeles&simpleSearch=Y"

def _parse_school_row(cells):
    """Turn the ``<td>`` cells of one table row into a school record.

    Returns ``None`` when the row has fewer than the five cells read below,
    so the caller can skip it instead of raising ``IndexError``.
    """
    if len(cells) < 5:
        return None

    # Only the first five cells are used; any extra cells are ignored.
    school_name, street_address, street_zip, latitude, longitude = (
        cell.text.strip() for cell in cells[:5]
    )
    return {
        "School": school_name,
        "Street Address": street_address,
        "Street Zip": street_zip,
        "Latitude": latitude,
        "Longitude": longitude
    }


def fetch_page_data(page_number, timeout=10):
    """Download one results page and extract its school rows.

    Args:
        page_number: Value appended to ``BASE_URL`` as the ``page`` query
            parameter.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook forever.

    Returns:
        A list of dicts with the keys ``"School"``, ``"Street Address"``,
        ``"Street Zip"``, ``"Latitude"`` and ``"Longitude"`` (all stripped
        strings), one per ``<tr>`` that has at least five ``<td>`` cells.
        An empty list is returned when the request fails, times out, or the
        response status is not 200; a message is printed in those cases.
    """
    url = f"{BASE_URL}&page={page_number}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport errors like a bad status code: report and move on.
        print(f"Failed to retrieve page {page_number}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve page {page_number}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    schools = []

    for row in soup.find_all('tr'):
        record = _parse_school_row(row.find_all('td'))
        if record is not None:
            schools.append(record)

    return schools

def scrape_with_pagination(total_pages, delay_range=(1, 3)):
    """Scrape pages 1..total_pages and return all school records in order.

    Args:
        total_pages: Number of result pages to request, starting at page 1.
        delay_range: ``(min_seconds, max_seconds)`` bounds for the random
            pause between consecutive requests.

    Returns:
        A single list holding the records from every page that yielded data.
        Pages for which ``fetch_page_data`` returns nothing are reported and
        skipped.
    """
    all_schools = []
    min_delay, max_delay = delay_range

    for page_number in range(1, total_pages + 1):
        if page_number > 1:
            # Pause before every request after the first, so an empty or
            # failed page still gets a delay and nothing sleeps after the
            # last page.
            time.sleep(random.uniform(min_delay, max_delay))

        print(f"Scraping page {page_number}...")
        page_data = fetch_page_data(page_number)

        if not page_data:
            print(f"No data found on page {page_number}, skipping.")
            continue

        all_schools.extend(page_data)

    return all_schools

# Number of result pages to request (pages 1 through total_pages).
total_pages = 5
scraped_data = scrape_with_pagination(total_pages)

# Print every scraped record, one dict per line, for a quick visual check.
for school in scraped_data:
    print(school)


In [None]:
import pandas as pd

# Load the CSV, coerce both coordinate columns to numbers (anything that is
# not numeric becomes NaN), then drop rows missing either coordinate.
df = (
    pd.read_csv('output1.csv')
    .assign(
        Latitude=lambda frame: pd.to_numeric(frame['Latitude'], errors='coerce'),
        Longitude=lambda frame: pd.to_numeric(frame['Longitude'], errors='coerce'),
    )
    .dropna(subset=['Latitude', 'Longitude'])
)

# The cleaned rows replace the contents of the file they were read from.
df.to_csv('output1.csv', index=False)

print("Data cleaned and saved back to the CSV.")


Data cleaned and saved back to the CSV.


In [None]:
!pip install shapely
import pandas as pd
from shapely.geometry import Point, Polygon

# Load the CSV and list its column names before filtering.
df = pd.read_csv('output1.csv') 
print(df.columns)

# Rectangular region used by is_in_LA. Vertices are (longitude, latitude)
# pairs; the last vertex repeats the first to close the ring.
LA_POLYGON = Polygon([
    (-118.67, 34.34),  # north-west corner
    (-118.15, 34.34),  # north-east corner
    (-118.15, 33.7),   # south-east corner
    (-118.67, 33.7),   # south-west corner
    (-118.67, 34.34)   # back to the north-west corner
])

def is_in_LA(lat, lon):
    """Report whether the coordinate is contained in ``LA_POLYGON``.

    Args:
        lat: Latitude of the location.
        lon: Longitude of the location.

    Returns:
        The result of ``LA_POLYGON.contains`` for the point.
    """
    # The polygon's vertices are (longitude, latitude), so the point is
    # built with longitude first as well.
    location = Point(lon, lat)
    return LA_POLYGON.contains(location)

# Keep only the rows whose coordinates is_in_LA accepts, checked row by row.
filtered_df = df[
    df.apply(
        lambda record: is_in_LA(lat=record['Latitude'], lon=record['Longitude']),
        axis=1,
    )
]

# Write the surviving rows to a separate file without the index column.
filtered_df.to_csv('filtered_output1.csv', index=False)

print("Filtered CSV saved.")


Index(['Record Type', 'County', 'School', 'Latitude', 'Longitude',
       'Street Address', 'Street City', 'Street State', 'Street Zip',
       'Mailing Address', 'Phone'],
      dtype='object')
Filtered CSV saved.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [57]:
# Show the first 50 latitude/longitude pairs as plain text for inspection.
print(df[['Latitude', 'Longitude']].head(50))


                Latitude             Longitude
0              34.050103           -118.260519
1              34.031953           -118.266279
2                No Data               No Data
3              33.878924           -118.071286
4                No Data               No Data
5              33.878552           -118.071161
6              33.875048           -118.078203
7   Information Redacted  Information Redacted
8              33.835176           -118.083725
9              33.845962           -118.074404
10               No Data               No Data
11             33.855106           -118.089062
12  Information Redacted  Information Redacted
13             33.868839           -118.086673
14               No Data               No Data
15               No Data               No Data
16             33.880705           -118.045567
17             33.856100           -118.068540
18             33.865877           -118.041993
19             33.865464           -118.064791
20  Informati