In [1]:
#import libraries

import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as soup

In [2]:
# Set URL as a variable
url = 'https://nces.ed.gov/programs/digest/d05/tables/dt05_077.asp'

# Fetch the page with a plain GET. `requests` only downloads the raw HTML the
# server sends back -- it never runs JavaScript or waits for a page to "load" --
# so POST gives no advantage and is the wrong verb for a read-only fetch.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)

# Fail loudly on a 4xx/5xx response instead of silently parsing an error page
r.raise_for_status()

# create a "soup" object
data = soup(r.text, 'html.parser')

In [3]:
# Identify element tag with target data: the table is published as
# preformatted text inside a <pre> nested in a table cell, so take the first
# element that matches the CSS selector 'table td pre'
data.select('table td pre')[0]

<pre style="text-align:left; padding-left:4px">_____________________________________________________________________________________________________________________________________________
                      |                                                         |                                                 |   Percent
                      |                                                         |                                                 |   change,
                      |                          Current dollars                |             Constant 2004-05 dollars\1\         |1989-90 to
                      |_________________________________________________________|_________________________________________________|2004-05 in
                      |       |       |       |         |       |       |       |       |       |       |         |       |       |  constant
State                 |1969-70|1979-80|1989-90|1999-2000|2002-03|2003-04|2004-05|1969-70|1979-80|1989

In [4]:
# Our data of interest is the text that is formatted like a table
# Select that data and return as text
# (.text drops the <pre> tag itself and keeps only the pipe-delimited contents)
doi= data.select('table td pre')[0].text
print(doi)

_____________________________________________________________________________________________________________________________________________
                      |                                                         |                                                 |   Percent
                      |                                                         |                                                 |   change,
                      |                          Current dollars                |             Constant 2004-05 dollars\1\         |1989-90 to
                      |_________________________________________________________|_________________________________________________|2004-05 in
                      |       |       |       |         |       |       |       |       |       |       |         |       |       |  constant
State                 |1969-70|1979-80|1989-90|1999-2000|2002-03|2003-04|2004-05|1969-70|1979-80|1989-90|1999-2000|2002-03|2003-04|   dollars
______

In [5]:
# pd.read_csv treats a plain string argument as a file path, so the scraped
# text is wrapped in io.StringIO to hand it over as a file-like object instead.

import io

salary_df = pd.read_csv(io.StringIO(doi),sep='|', #separate columns by |-symbol
            skiprows=6,  # skip the 6 banner lines above the "State |1969-70|..." header row
            usecols=list(range(0,8))  # State + the seven "Current dollars" year columns
            )
salary_df

Unnamed: 0,State,1969-70,1979-80,1989-90,1999-2000,2002-03,2003-04,2004-05
0,______________________,_______,_______,_______,_________,_______,_______,_______
1,1,2,3,4,5,6,7,8
2,______________________,_______,_______,_______,_________,_______,_______,_______
3,United States .....,"$8,626","$15,970","$31,367","$41,807","$45,776","$46,752","$47,750"
4,,_______,_______,_______,_________,_______,_______,_______
...,...,...,...,...,...,...,...,...
61,Washington ...........,9225,18820,30457,41043,44949,45434,45712
62,West Virginia ........,7650,13710,22842,35009,38508,38461,38360
63,Wisconsin ............,8963,16006,31921,41153,42871,42882,43466
64,Wyoming ..............,8232,16012,28141,34127,37876,39532,40392


### Clean the df

In [6]:
# Give the first column (the state labels) a clean, fixed name
state_column = salary_df.columns[0]
salary_df.rename(columns={state_column: 'State'}, inplace=True)

salary_df.columns

Index(['State', '1969-70', '1979-80', '1989-90', '1999-2000', '2002-03',
       '2003-04', '2004-05'],
      dtype='object')

In [7]:
# Remove white space from other column names

# Map every current column label to its whitespace-trimmed form
new_column_names = {label: label.strip() for label in salary_df.columns}

salary_df.rename(columns=new_column_names, inplace=True)

salary_df.head()

Unnamed: 0,State,1969-70,1979-80,1989-90,1999-2000,2002-03,2003-04,2004-05
0,______________________,_______,_______,_______,_________,_______,_______,_______
1,1,2,3,4,5,6,7,8
2,______________________,_______,_______,_______,_________,_______,_______,_______
3,United States .....,"$8,626","$15,970","$31,367","$41,807","$45,776","$46,752","$47,750"
4,,_______,_______,_______,_________,_______,_______,_______


In [8]:
# Remove the '....' dot leaders from the state-name column
def period_remover(value):
    """Return the bare state name from a label cell, or NaN for filler cells.

    Parameters
    ----------
    value : object
        Raw cell from the first table column, e.g. ``'Alabama ..............'``.

    Returns
    -------
    str or float
        The text before the first ``'.'`` with surrounding whitespace removed
        when the cell (after trimming) is longer than one character and starts
        with an ASCII letter; ``np.nan`` otherwise (separator rules, column
        numbers, blank cells, non-string values).
    """
    # A missing cell may arrive as a float NaN (or None) rather than a str;
    # calling .strip() on it would raise, so treat it as a filler cell.
    if not isinstance(value, str):
        return np.nan

    temp = value.strip()

    if len(temp) > 1 and temp[0].lower() in ('abcdefghijklmnopqrstuvwxyz'):
        return temp.split('.')[0].strip()
    else:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan

# Work on a copy so the raw parse held in salary_df stays untouched
salary_df_clean = salary_df.copy()

# Clean every label; rows that are not states end up with NaN in 'State'
cleaned_state_names = salary_df_clean['State'].apply(period_remover)
salary_df_clean['State'] = cleaned_state_names

salary_df_clean

Unnamed: 0,State,1969-70,1979-80,1989-90,1999-2000,2002-03,2003-04,2004-05
0,,_______,_______,_______,_________,_______,_______,_______
1,,2,3,4,5,6,7,8
2,,_______,_______,_______,_________,_______,_______,_______
3,United States,"$8,626","$15,970","$31,367","$41,807","$45,776","$46,752","$47,750"
4,,_______,_______,_______,_________,_______,_______,_______
...,...,...,...,...,...,...,...,...
61,Washington,9225,18820,30457,41043,44949,45434,45712
62,West Virginia,7650,13710,22842,35009,38508,38461,38360
63,Wisconsin,8963,16006,31921,41153,42871,42882,43466
64,Wyoming,8232,16012,28141,34127,37876,39532,40392


In [9]:
# Keep only rows in which every column holds a value; the separator and
# filler rows were given a NaN 'State' above, so this removes them
cleaner_salary_df = salary_df_clean.dropna(axis='index', how='any')
cleaner_salary_df.head(10)

Unnamed: 0,State,1969-70,1979-80,1989-90,1999-2000,2002-03,2003-04,2004-05
3,United States,"$8,626","$15,970","$31,367","$41,807","$45,776","$46,752","$47,750"
5,Alabama,6818,13060,24828,36689,38246,38325,38863
6,Alaska,10560,27210,43153,46462,49685,51736,52424
7,Arizona,8711,15054,29402,36902,40894,41843,42905
8,Arkansas,6307,12299,22352,33386,37753,39314,40495
9,California,10315,18020,37998,47680,56283,56444,57876
11,Colorado,7761,16205,30758,38163,41275,43319,44161
12,Connecticut,9262,16229,40461,51780,54362,57337,58688
13,Delaware,9015,16148,33377,44435,50772,49366,50869
14,District of Columbia,10285,22190,38402,47076,50763,57009,58456


In [10]:
def data_to_int(val):
    """Turn a salary cell such as '$8,626' into the integer 8626.

    Only the text after the last backslash in `val` is considered, and every
    non-digit character in that tail is discarded before the conversion.
    """
    tail = val.rsplit('\\', 1)[-1]
    digits_only = re.sub(r'\D', '', tail)
    return int(digits_only)

In [11]:
# Quick sanity check: the thousands separator should be dropped
data_to_int('11,001')

11001

In [12]:
# Convert every salary column (everything after 'State') from text to int,
# working on a copy so cleaner_salary_df is left as it was
salary_df_final = cleaner_salary_df.copy()

for year_column in salary_df_final.columns[1:]:
    salary_df_final[year_column] = salary_df_final[year_column].apply(data_to_int)

In [13]:
# Inspect the converted frame (salary columns should now display as plain integers)
salary_df_final

Unnamed: 0,State,1969-70,1979-80,1989-90,1999-2000,2002-03,2003-04,2004-05
3,United States,8626,15970,31367,41807,45776,46752,47750
5,Alabama,6818,13060,24828,36689,38246,38325,38863
6,Alaska,10560,27210,43153,46462,49685,51736,52424
7,Arizona,8711,15054,29402,36902,40894,41843,42905
8,Arkansas,6307,12299,22352,33386,37753,39314,40495
9,California,10315,18020,37998,47680,56283,56444,57876
11,Colorado,7761,16205,30758,38163,41275,43319,44161
12,Connecticut,9262,16229,40461,51780,54362,57337,58688
13,Delaware,9015,16148,33377,44435,50772,49366,50869
14,District of Columbia,10285,22190,38402,47076,50763,57009,58456


In [14]:
# Persist the cleaned table; index=False leaves the row labels out of the file.
# NOTE: the relative 'Data/' directory must already exist -- to_csv does not
# create missing folders.
salary_df_final.to_csv('Data/2005_avg_teacher_salaries.csv', index=False)

In [None]:
# 2006 and on can use pd.read_html 

# For everything before that: pd.read_csv

# 1995 has a unique url ('https://nces.ed.gov/programs/digest/d95/dtab077.asp')

# Table 77 for 1996 ('https://nces.ed.gov/programs/digest/d96/d96t077.asp')
# Table 77 for 2005 ('https://nces.ed.gov/programs/digest/d05/tables/dt05_077.asp')

# Table 78 for 1997 ('https://nces.ed.gov/programs/digest/d97/d97t078.asp')
# Table 78 for 2000-04 ('https://nces.ed.gov/programs/digest/d00/dt078.asp')

#Table 79 in 1998,99 ('https://nces.ed.gov/programs/digest/d98/d98t079.asp')

# Probe which Digest years serve a page under the 'd<YY>/d<YY>t077.asp' URL
# pattern, and how many <pre> text tables each of those pages contains
for year in range(1995,2013):
    print(year)

    # Build the URL outside the try block so it is always defined when reported
    two_digit_year = str(year)[2:]
    url = f'https://nces.ed.gov/programs/digest/d{two_digit_year}/d{two_digit_year}t077.asp'

    try:
        # Plain GET with a timeout; raise_for_status turns 4xx/5xx into errors
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Catch only network/HTTP failures -- a bare `except:` would also hide
        # programming errors and swallow KeyboardInterrupt
        print(f'No data in this link: {url}')
        continue

    data = soup(r.text, 'html.parser')

    table_list = data.select('table td pre')

    print(len(table_list))

In [352]:
#import libraries

import io
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as soup

def _clean_state_name(value):
    """Strip dot leaders from a state label; return NaN for filler cells."""
    # Missing cells may arrive as float NaN / None instead of str
    if not isinstance(value, str):
        return np.nan

    label = value.strip()

    if len(label) > 1 and label[0].lower() in 'abcdefghijklmnopqrstuvwxyz':
        # Trailing strip removes the space left between the name and the dots
        return label.split('.')[0].strip()

    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
    return np.nan


def _salary_to_int(val):
    """Convert a salary cell such as '$8,626' to int.

    Only the text after the last backslash is used; non-digits are dropped.
    """
    return int(''.join(re.findall(r'\d', val.split('\\')[-1])))


def table_scraper(url, skiprows=6, n_columns=8, timeout=30):
    """Download an NCES Digest text table and return it as a cleaned DataFrame.

    The page is expected to hold the table as pipe-delimited preformatted text
    inside the first element matching ``table td pre``.

    Parameters
    ----------
    url : str
        Address of the Digest table page.
    skiprows : int, default 6
        Number of banner lines above the header row of the text table.
    n_columns : int, default 8
        How many leading columns to keep (the label column plus the salary
        columns that follow it).
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        First column named ``'State'`` with dot leaders removed; remaining
        columns converted to ``int``. Rows whose label is not a name, or that
        contain any missing value, are dropped. The original row labels from
        the raw parse are kept.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a 4xx/5xx response.
    IndexError
        If the page has no element matching ``table td pre``.
    ValueError
        If a kept salary cell contains no digits.
    """
    # Plain GET: requests only returns the HTML the server sends, so POST adds
    # nothing for a read-only fetch.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # create a "soup" object
    page = soup(response.text, 'html.parser')

    # Our data of interest is the text that is formatted like a table
    pre_blocks = page.select('table td pre')
    if not pre_blocks:
        raise IndexError(f'No <pre> text table found at {url}')
    doi = pre_blocks[0].text

    # read_csv treats a bare string as a path, hence the StringIO wrapper
    salary_df = pd.read_csv(
        io.StringIO(doi),
        sep='|',  # separate columns by |-symbol
        skiprows=skiprows,
        usecols=list(range(n_columns)),
    )

    # Name the label column, then trim the padding from every column name
    salary_df = salary_df.rename(columns={salary_df.columns[0]: 'State'})
    salary_df = salary_df.rename(columns=str.strip)

    # Clean the labels; separator / filler rows become NaN and are dropped
    salary_df['State'] = salary_df['State'].apply(_clean_state_name)
    salary_df_final = salary_df.dropna(axis=0).copy()

    # Convert every column after 'State' from text to int
    for column in salary_df_final.columns[1:]:
        salary_df_final[column] = salary_df_final[column].apply(_salary_to_int)

    return salary_df_final