In [1]:
!pip install requests beautifulsoup4

import requests
import pandas as pd
import json

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time



# Select Schools and Start Scraping

In [2]:
def scrape_school_data(csv_url=None, timeout=30):
    """Download the school directory CSV and load it into a DataFrame.

    Args:
        csv_url: URL of the CSV file to download. When ``None`` (the default)
            the school directory snapshot hosted on catalogue.data.govt.nz is
            used.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A ``pandas.DataFrame`` parsed from the downloaded CSV, or ``None`` when
        the server answers with a non-200 status or any exception is raised
        while downloading or parsing. Failures are reported on stdout rather
        than raised, so callers must check for ``None``.
    """
    from io import StringIO

    print("🔍 Scraping school data from catalogue.data.govt.nz...")

    if csv_url is None:
        # Default: the directory snapshot this notebook was written against.
        csv_url = "https://catalogue.data.govt.nz/dataset/c1923d33-e781-46c9-9ea1-d9b850082be4/resource/4b292323-9fcc-41f8-814b-3c7b19cf14b3/download/schooldirectory-08-06-2025-074524.csv"

    try:
        print("📥 Downloading CSV data...")
        response = requests.get(csv_url, timeout=timeout)

        # Guard clause: anything other than HTTP 200 is treated as a failure.
        if response.status_code != 200:
            print(f"❌ Failed to download. Status code: {response.status_code}")
            return None

        print(" Successfully downloaded!")

        # NOTE(review): response.text relies on requests' charset detection.
        # If macronised names (e.g. "Māori") come out garbled, decode
        # response.content explicitly instead — confirm the file's encoding.
        df = pd.read_csv(StringIO(response.text))

        print(f"📊 Total schools in database: {len(df)}")
        print(f"📋 Available columns: {len(df.columns)}")

        return df

    except Exception as e:
        # Best-effort by design: network and parse errors are reported and
        # turned into a None result so the notebook keeps running.
        print(f"❌ Error: {str(e)}")
        return None

In [3]:
def find_our_schools(df, target_schools=None):
    """Find the target schools for uniform forecasting in the directory.

    For each target name, an exact (case-insensitive, whitespace-trimmed)
    match on ``Org_Name`` is preferred. Only when there is no exact match does
    the search fall back to a case-insensitive *literal* substring match, so
    that a longer name containing the target does not shadow the real school
    and characters such as ``(`` or ``.`` are not interpreted as regex syntax.

    Args:
        df: DataFrame with at least the columns ``Org_Name``, ``Telephone``
            and ``Org_Type``.
        target_schools: Optional iterable of school names to look for. When
            ``None`` the three default schools are used.

    Returns:
        A list with one row (``pandas.Series``) per target that was found, in
        the order of the targets. Targets without a match are skipped.
    """
    print("\n🎯 Searching for target schools...")

    if target_schools is None:
        # Names written with the apostrophes used in the directory CSV.
        target_schools = [
            'Auckland Grammar School',
            "Wellington Girls' College",
            "Christchurch Boys' High School",
        ]

    names = df['Org_Name']
    # Normalised once, outside the loop; missing names become '' so the
    # equality test below always yields a plain boolean mask.
    normalized_names = names.str.strip().str.casefold().fillna('')

    found_schools = []

    for school_name in target_schools:
        print(f"\n🔍 Looking for: {school_name}")

        key = school_name.strip().casefold()
        exact_mask = normalized_names == key

        if key and exact_mask.any():
            matches = df[exact_mask]
        else:
            # regex=False: treat the name as literal text, not a pattern.
            substring_mask = names.str.contains(
                school_name, case=False, na=False, regex=False
            )
            matches = df[substring_mask]

        if len(matches) > 0:
            print(f"✅ Found it!")
            school_data = matches.iloc[0]  # Take first match
            found_schools.append(school_data)

            # Show basic info
            print(f"   📋 Official Name: {school_data['Org_Name']}")
            print(f"   📞 Phone: {school_data['Telephone']}")
            print(f"   🏫 Type: {school_data['Org_Type']}")

        else:
            print(f"❌ Not found")

    return found_schools


In [4]:
def extract_school_details(schools_data):
    """Build one flat record per school with the fields used for forecasting.

    Each item of ``schools_data`` only needs a ``.get(key, default)`` method.
    Columns that are absent fall back to ``''`` for text fields and ``0`` for
    student counts. The result is a list of dicts, one per input school, and
    a short summary of every record is printed along the way.
    """
    # (output key, source column, fallback value) — listed in output order.
    field_table = (
        ('School_Name', 'Org_Name', ''),
        ('Phone', 'Telephone', ''),
        ('Email', 'Email', ''),
        ('Website', 'URL', ''),
        ('Contact_Person', 'Contact1_Name', ''),
        ('School_Type', 'Org_Type', ''),
        ('Authority', 'Authority', ''),
        ('Gender_Type', 'CoEd_Status', ''),
        ('Address', 'Add1_Line1', ''),
        ('Suburb', 'Add1_Suburb', ''),
        ('City', 'Add1_City', ''),
        ('Current_Roll', 'Total', 0),
        ('European_Students', 'European', 0),
        ('Maori_Students', 'Māori', 0),
        ('Pacific_Students', 'Pacific', 0),
        ('Asian_Students', 'Asian', 0),
        ('Roll_Date', 'Roll_Date', ''),
    )

    print("\n📊 Extracting school details...")

    school_info = []

    for record in schools_data:
        info = {
            out_key: record.get(column, fallback)
            for out_key, column, fallback in field_table
        }

        # A zero count from the macronised column triggers a retry with the
        # plain-ASCII spelling of the column name.
        if info['Maori_Students'] == 0:
            info['Maori_Students'] = record.get('Maori', 0)

        school_info.append(info)

        # One write per school; the text is the same as line-by-line printing.
        print("\n".join([
            f"\n🏫 {info['School_Name']}",
            f"   📞 Phone: {info['Phone']}",
            f"   📧 Email: {info['Email']}",
            f"   🌐 Website: {info['Website']}",
            f"   👨‍💼 Contact: {info['Contact_Person']}",
            f"   🏫 Type: {info['School_Type']}",
            f"   👫 Gender: {info['Gender_Type']}",
            f"   👥 Total Students: {info['Current_Roll']}",
            f"   📍 Location: {info['Address']}, {info['City']}",
        ]))

    return school_info

In [5]:
def _roll_as_number(roll):
    """Return ``roll`` as a finite float, or ``None`` if missing/non-numeric."""
    if isinstance(roll, str):
        # Tolerate rolls that arrive as text, e.g. "2755" or "2,755".
        roll = roll.replace(",", "").strip()
    try:
        value = float(roll)
    except (TypeError, ValueError):
        return None
    # value != value is the NaN test; infinities cannot be shown as an int.
    if value != value or value in (float("inf"), float("-inf")):
        return None
    return value


def _size_band(roll):
    """Map a numeric roll to its (size label, demand multiplier) pair."""
    if roll > 2000:
        return "Very Large", 1.8
    if roll > 1200:
        return "Large", 1.4
    if roll > 600:
        return "Medium", 1.0
    return "Small", 0.7


def _product_mix(gender):
    """Classify a gender status as (kind, boys %, girls %).

    ``kind`` is ``"boys"``, ``"girls"`` or ``"coed"``. The check is
    case-insensitive. A status that mentions co-education, mentions both
    boys and girls, or mentions neither is treated as co-ed.
    """
    text = str(gender).casefold()
    mentions_coed = any(tag in text for tag in ("co-ed", "coed", "co ed"))
    mentions_boys = "boys" in text
    mentions_girls = "girls" in text

    if mentions_boys and not (mentions_girls or mentions_coed):
        return "boys", 100, 0
    if mentions_girls and not (mentions_boys or mentions_coed):
        return "girls", 0, 100
    return "coed", 50, 50


def analyze_for_forecasting(school_info):
    """Print the demand-forecasting implications of each school's data.

    Args:
        school_info: Iterable of mappings that each provide the keys
            ``School_Name``, ``Current_Roll`` and ``Gender_Type``.

    For every school this prints the roll size band with its demand
    multiplier (skipped when the roll is missing or not numeric) and the
    boys/girls product mix derived from the gender status. Nothing is
    returned.
    """
    print(f"\n🔗 FORECASTING ANALYSIS:")
    print("="*30)

    for school in school_info:
        print(f"\n📊 {school['School_Name']}:")

        # Student roll analysis
        roll = _roll_as_number(school['Current_Roll'])
        if roll is not None:
            size_cat, demand_multiplier = _size_band(roll)
            print(f"   👥 Roll: {int(roll):,} students ({size_cat})")
            print(f"   📈 Demand Multiplier: {demand_multiplier}x")

        # Gender analysis for product filtering
        kind, boys_products, girls_products = _product_mix(school['Gender_Type'])
        if kind == "boys":
            print(f"   👔 Boys School: Need boys uniforms only")
        elif kind == "girls":
            print(f"   👗 Girls School: Need girls uniforms only")
        else:
            print(f"   👫 Co-ed School: Need both boys and girls uniforms")

        print(f"   📊 Product Mix: {boys_products}% boys, {girls_products}% girls")

In [6]:
# Driver cell: download the directory, pick the target schools, summarise
# them and write the results to disk. Each stage reports its own failure.
print("🚀 Starting school data scraping...")

# Step 1: Download school data (None signals a failed download)
schools_df = scrape_school_data()

if schools_df is not None:

    # Step 2: Find our target schools
    target_schools = find_our_schools(schools_df)

    if target_schools:

        # Step 3: Extract detailed information
        school_details = extract_school_details(target_schools)

        # Step 4: Analyze for forecasting
        analyze_for_forecasting(school_details)

        # Step 5: Save the results
        print(f"\n💾 Saving results...")

        # Convert to DataFrame and save
        result_df = pd.DataFrame(school_details)

        # Save to CSV
        result_df.to_csv('nz_school_data.csv', index=False)
        print("✅ Saved to: nz_school_data.csv")

        # Save to Excel. pandas needs an optional Excel writer engine for
        # this, and the install cell does not install one, so a missing
        # engine is reported instead of aborting the cell after the CSV has
        # already been written.
        try:
            result_df.to_excel('nz_school_data.xlsx', index=False)
        except ImportError as exc:
            print(f"⚠️ Skipped Excel export (missing optional dependency): {exc}")
        else:
            print("✅ Saved to: nz_school_data.xlsx")

        # Summary
        print(f"\n🎉 SCRAPING COMPLETED!")
        print(f"📊 Successfully scraped {len(school_details)} schools")
        print(f"📋 Data includes: roll numbers, contact info, gender type")

    else:
        print("❌ No target schools found")

else:
    print("❌ Failed to download school data")


🚀 Starting school data scraping...
🔍 Scraping school data from catalogue.data.govt.nz...
📥 Downloading CSV data...
 Successfully downloaded!
📊 Total schools in database: 2569
📋 Available columns: 52

🎯 Searching for target schools...

🔍 Looking for: Auckland Grammar School
✅ Found it!
   📋 Official Name: Auckland Grammar School
   📞 Phone: 09-6235400
   🏫 Type: Secondary (Year 9-15)

🔍 Looking for: Wellington Girls' College
✅ Found it!
   📋 Official Name: Wellington Girls' College
   📞 Phone: 04-4725743
   🏫 Type: Secondary (Year 9-15)

🔍 Looking for: Christchurch Boys' High School
✅ Found it!
   📋 Official Name: Christchurch Boys' High School
   📞 Phone: 03-3485003
   🏫 Type: Secondary (Year 9-15)

📊 Extracting school details...

🏫 Auckland Grammar School
   📞 Phone: 09-6235400
   📧 Email: admin@ags.school.nz
   🌐 Website: http://www.ags.school.nz
   👨‍💼 Contact: Timothy O'Connor
   🏫 Type: Secondary (Year 9-15)
   👫 Gender: Boys School
   👥 Total Students: 2755.0
   📍 Location: Mount

In [7]:
# Closing notes on how the scraped fields feed the Part 1 forecast, emitted
# with a single print call (one argument per output line).
print(
    "\n🔗 HOW TO USE THIS DATA WITH PART 1:",
    "=" * 40,
    "1. STUDENT ROLL → DEMAND SCALING:",
    "   - Large schools (1200+ students) = 1.4x demand",
    "   - Very Large schools (2000+ students) = 1.8x demand",
    "\n2. GENDER TYPE → PRODUCT FILTERING:",
    "   - Boys schools = 0% girls products",
    "   - Girls schools = 0% boys products",
    "   - Co-ed schools = 50/50 split",
    "\n3. CONTACT INFO → CUSTOMER RELATIONS:",
    "   - Direct phone/email for order coordination",
    "   - Website for uniform policy research",
    sep="\n",
)




🔗 HOW TO USE THIS DATA WITH PART 1:
1. STUDENT ROLL → DEMAND SCALING:
   - Large schools (1200+ students) = 1.4x demand
   - Very Large schools (2000+ students) = 1.8x demand

2. GENDER TYPE → PRODUCT FILTERING:
   - Boys schools = 0% girls products
   - Girls schools = 0% boys products
   - Co-ed schools = 50/50 split

3. CONTACT INFO → CUSTOMER RELATIONS:
   - Direct phone/email for order coordination
   - Website for uniform policy research
