In [1]:
#import libraries

import re
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup

In [2]:
# Set URL as a variable
url = 'https://nces.ed.gov/programs/digest/d98/d98t079.asp'

# Fetch the page with a plain GET request. requests only downloads the HTML the
# server sends back (it does not run JavaScript or wait for the page to "load"),
# so GET is the appropriate verb for retrieving it. The timeout keeps the cell
# from hanging forever if the server does not answer.
r = requests.get(url, timeout=30)

# Stop here on an HTTP error (404, 500, ...) instead of parsing an error page
r.raise_for_status()

# create a "soup" object
data = soup(r.text, 'html.parser')

In [3]:
# Identify the element that holds the target data: the first <pre> tag nested
# inside a table cell (CSS selector 'table td pre')
data.select('table td pre')[0]

<pre style="text-align:left; padding-left:4px">
___________________________________________________________________________________________________________________________________________________________
                        |                                                                |                                                        | Percent
                        |                    Current dollars                             |                 Constant 1997-98 dollars\1\            | change,
                        |                                                                |                                                        |1979-80 to
          State         |________________________________________________________________|________________________________________________________|1997-98 in
                        |       |        |        |        |         |         |         |        |        |        |         |         |         |constant
            

In [4]:
# The data of interest is plain text laid out like a table inside the <pre> tag.
# Grab that tag, keep only its text content, and print it for inspection.
pre_tag = data.select('table td pre')[0]
doi = pre_tag.text
print(doi)


___________________________________________________________________________________________________________________________________________________________
                        |                                                                |                                                        | Percent
                        |                    Current dollars                             |                 Constant 1997-98 dollars\1\            | change,
                        |                                                                |                                                        |1979-80 to
          State         |________________________________________________________________|________________________________________________________|1997-98 in
                        |       |        |        |        |         |         |         |        |        |        |         |         |         |constant
                        |1969-70|1979-80 |1989-90 |1994-95 

In [5]:
# pd.read_csv treats a plain string as a file name, so the scraped text is
# wrapped in an in-memory text buffer (io.StringIO) before it is parsed.

import io

table_buffer = io.StringIO(doi)

salary_df = pd.read_csv(
    table_buffer,
    sep='|',                 # columns are separated by the |-symbol
    skiprows=7,              # start at the row holding the school-year labels
    usecols=list(range(8)),  # keep only the first eight columns
)
salary_df

Unnamed: 0,Unnamed: 1,1969-70,1979-80,1989-90,1994-95,1995-96,1996-97,1997-98
0,______________________,_______,________,________,________,_________,_________,_________
1,1,2,3,4,5,6,7,8
2,______________________,_______,________,________,________,_________,_________,_________
3,United States .....,"$8,626","$15,970","$31,367","$36,685","$37,716","$38,554","$39,385"
4,,_______,________,________,________,_________,_________,_________
...,...,...,...,...,...,...,...,...
61,Washington ...........,9225,18820,30457,36151,37853,37815,38788
62,West Virginia.........,7650,13710,22842,31944,32155,33257,33398
63,Wisconsin ............,8963,16006,31921,37746,38182,39057,39899
64,Wyoming ..............,8232,16012,28141,31285,31571,31715,32022


### Clean the DataFrame

In [6]:
# Give the first column a meaningful name
first_column = salary_df.columns[0]
salary_df.rename(columns={first_column: 'State'}, inplace=True)

salary_df.columns

Index(['State', '1969-70', '1979-80 ', '1989-90 ', '1994-95 ', ' 1995-96 ',
       ' 1996-97 ', ' 1997-98 '],
      dtype='object')

In [7]:
# Strip the padding whitespace that surrounds the column names

# Map every current column name to its whitespace-trimmed version
new_column_names = {name: name.strip() for name in salary_df.columns}

salary_df.rename(columns=new_column_names, inplace=True)

salary_df.head()

Unnamed: 0,State,1969-70,1979-80,1989-90,1994-95,1995-96,1996-97,1997-98
0,______________________,_______,________,________,________,_________,_________,_________
1,1,2,3,4,5,6,7,8
2,______________________,_______,________,________,________,_________,_________,_________
3,United States .....,"$8,626","$15,970","$31,367","$36,685","$37,716","$38,554","$39,385"
4,,_______,________,________,________,_________,_________,_________


In [8]:
# Strip the dot leaders ('...') from the state-name column; rows that are not a
# state/national entry are marked as NaN so they can be dropped afterwards
def period_remover(value):
    """Return a clean state name, or NaN when the cell is not a state name.

    Parameters
    ----------
    value : str
        Raw cell from the first table column, e.g. ``'Alabama ......'``.

    Returns
    -------
    str or float
        The text before the first period with surrounding whitespace removed,
        when the stripped cell is longer than one character and starts with an
        ASCII letter. ``np.nan`` otherwise (separator rows, blank cells, the
        column-number row, or any non-string value).
    """
    # Non-string cells (e.g. NaN) have no .strip(); treat them as "not a state"
    if not isinstance(value, str):
        return np.nan

    temp = value.strip()

    if len(temp) > 1 and temp[0].lower() in 'abcdefghijklmnopqrstuvwxyz':
        return temp.split('.')[0].strip()

    # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
    return np.nan

# Clean the 'State' values first, then store them on a copy so that the
# frame produced by the previous cells stays untouched.
cleaned_states = salary_df['State'].apply(period_remover)

salary_df_clean = salary_df.copy()
salary_df_clean['State'] = cleaned_states

salary_df_clean

Unnamed: 0,State,1969-70,1979-80,1989-90,1994-95,1995-96,1996-97,1997-98
0,,_______,________,________,________,_________,_________,_________
1,,2,3,4,5,6,7,8
2,,_______,________,________,________,_________,_________,_________
3,United States,"$8,626","$15,970","$31,367","$36,685","$37,716","$38,554","$39,385"
4,,_______,________,________,________,_________,_________,_________
...,...,...,...,...,...,...,...,...
61,Washington,9225,18820,30457,36151,37853,37815,38788
62,West Virginia,7650,13710,22842,31944,32155,33257,33398
63,Wisconsin,8963,16006,31921,37746,38182,39057,39899
64,Wyoming,8232,16012,28141,31285,31571,31715,32022


In [9]:
# Drop every row that contains a missing value in any column, then preview
# the first ten remaining rows.
cleaner_salary_df = salary_df_clean.dropna(axis='index', how='any')
cleaner_salary_df.head(10)

Unnamed: 0,State,1969-70,1979-80,1989-90,1994-95,1995-96,1996-97,1997-98
3,United States,"$8,626","$15,970","$31,367","$36,685","$37,716","$38,554","$39,385"
5,Alabama,6818,13060,24828,31144,31313,32549,32818
6,Alaska,10560,27210,43153,47951,49171,50647,51738
7,Arizona,8711,15054,29402,32574,33300,33300,33850
8,Arkansas,6307,12299,22352,28934,29533,30319,30578
9,California,10315,18020,37998,41078,42259,42992,43725
11,Colorado,7761,16205,30758,34571,35364,36271,37052
12,Connecticut,9262,16229,40461,50045,50254,50426,50730
13,Delaware,9015,16148,33377,39076,40533,41436,42439
14,District of Columbia,10285,22190,38402,43700,43700,45012,46350


In [10]:
def data_to_int(val):
    """Convert a formatted number string such as '$8,626' to an int.

    Only the text after the last backslash in ``val`` is considered; every
    digit found in that text is concatenated and converted. Raises
    ``ValueError`` when that text contains no digits.
    """
    # Keep whatever follows the final backslash (the whole string if none)
    tail = val.split('\\')[-1]

    # Collect the digits, discarding '$', ',', spaces and any other character
    digits = re.findall(r'\d', tail)

    return int(''.join(digits))

In [11]:
# Quick check: the thousands separator is discarded and an int comes back
data_to_int('11,001')

11001

In [12]:
# Convert every column after the first ('State') from text to int,
# working on a copy of the cleaned frame.
salary_df_final = cleaner_salary_df.copy()

for column_name in salary_df_final.columns[1:]:
    salary_df_final[column_name] = salary_df_final[column_name].apply(data_to_int)

In [13]:
# Inspect the frame after the salary columns were converted to integers
salary_df_final

Unnamed: 0,State,1969-70,1979-80,1989-90,1994-95,1995-96,1996-97,1997-98
3,United States,8626,15970,31367,36685,37716,38554,39385
5,Alabama,6818,13060,24828,31144,31313,32549,32818
6,Alaska,10560,27210,43153,47951,49171,50647,51738
7,Arizona,8711,15054,29402,32574,33300,33300,33850
8,Arkansas,6307,12299,22352,28934,29533,30319,30578
9,California,10315,18020,37998,41078,42259,42992,43725
11,Colorado,7761,16205,30758,34571,35364,36271,37052
12,Connecticut,9262,16229,40461,50045,50254,50426,50730
13,Delaware,9015,16148,33377,39076,40533,41436,42439
14,District of Columbia,10285,22190,38402,43700,43700,45012,46350


In [14]:
# Make sure the output folder exists before writing: to_csv raises an OSError
# when the parent directory is missing (e.g. on a fresh checkout).
output_path = Path('Data/1998_avg_teacher_salaries.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the leftover row labels from the raw table are not data
salary_df_final.to_csv(output_path, index=False)

In [None]:
# Notes on where the table lives in each Digest edition
#
# 2006 and later can use pd.read_html.
# For everything before that: pd.read_csv.
#
# 1995 has a unique url ('https://nces.ed.gov/programs/digest/d95/dtab077.asp')
#
# Table 77 for 1996 ('https://nces.ed.gov/programs/digest/d96/d96t077.asp')
# Table 77 for 2005 ('https://nces.ed.gov/programs/digest/d05/tables/dt05_077.asp')
#
# Table 78 for 1997 ('https://nces.ed.gov/programs/digest/d97/d97t078.asp')
# Table 78 for 2000-04 ('https://nces.ed.gov/programs/digest/d00/dt078.asp')
#
# Table 79 in 1998,99 ('https://nces.ed.gov/programs/digest/d98/d98t079.asp')

# Probe the 'd{yy}/d{yy}t077.asp' URL pattern for every edition and report how
# many <pre> tables each page contains. The probe_* names keep this loop from
# overwriting the `url`, `r` and `data` variables used by the cells above.
for year in range(1995, 2013):
    print(year)

    two_digit_year = str(year)[2:]
    probe_url = (
        f'https://nces.ed.gov/programs/digest/d{two_digit_year}/d{two_digit_year}t077.asp'
    )

    try:
        probe_response = requests.get(probe_url, timeout=30)
        # requests does not raise on 404/500 by itself; make missing pages
        # land in the except branch below
        probe_response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are expected here; anything else
        # (including KeyboardInterrupt) should still surface
        print(f'No data in this link: {probe_url}')
        continue

    probe_page = soup(probe_response.text, 'html.parser')
    table_list = probe_page.select('table td pre')

    print(len(table_list))

In [15]:
#import libraries

import io
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as soup

def _clean_state_name(value):
    """Return the state name without its dot leaders, or NaN for a non-data row."""
    # Non-string cells (e.g. NaN) cannot be a state name
    if not isinstance(value, str):
        return np.nan

    text = value.strip()

    if len(text) > 1 and text[0].lower() in 'abcdefghijklmnopqrstuvwxyz':
        # The trailing strip removes the space left between the name and the dots
        return text.split('.')[0].strip()

    # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
    return np.nan


def _salary_to_int(val):
    """Convert a formatted number string (e.g. '$8,626') to an int.

    Only the text after the last backslash is used, and only its digits are kept.
    """
    tail = val.split('\\')[-1]
    return int(''.join(re.findall(r'\d', tail)))


def _parse_salary_table(table_text, skiprows=7, n_columns=8):
    """Parse the pipe-delimited text of a salary table into a cleaned DataFrame."""
    raw = pd.read_csv(
        io.StringIO(table_text),         # read_csv needs a file-like object
        sep='|',                         # columns are separated by the |-symbol
        skiprows=skiprows,
        usecols=list(range(n_columns)),
    )

    # Name the first column, then strip the padding around all column names
    named = raw.rename(columns={raw.columns[0]: 'State'})
    named = named.rename(columns={name: name.strip() for name in named.columns})

    # Rows whose first cell is not a state name become NaN and are dropped,
    # together with any other row that has a missing value
    named['State'] = named['State'].apply(_clean_state_name)
    data_rows = named.dropna(axis=0)

    # Convert every column after 'State' from text to int
    salary_df_final = data_rows.copy()
    for column_name in salary_df_final.columns[1:]:
        salary_df_final[column_name] = salary_df_final[column_name].apply(_salary_to_int)

    return salary_df_final


def table_scraper(url, skiprows=7, n_columns=8, timeout=30):
    """Download a Digest page and return its first <pre> table as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page holding the table.
    skiprows : int, default 7
        Number of leading text lines to skip so the next line is the header row.
    n_columns : int, default 8
        Number of leading columns to keep (the first one becomes 'State').
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per kept entry, with a 'State' column followed by integer columns.

    Raises
    ------
    requests.RequestException
        If the request fails or the server answers with an HTTP error status.
    IndexError
        If the page has no element matching 'table td pre'.
    ValueError
        If a kept cell contains no digits.
    """
    # A plain GET is enough: requests only downloads the HTML the server sends
    # and does not wait for, or execute, anything on the page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    page = soup(response.text, 'html.parser')

    # The data of interest is the text laid out like a table inside a <pre> tag
    pre_blocks = page.select('table td pre')
    if not pre_blocks:
        raise IndexError(f"no 'table td pre' element found at {url}")

    return _parse_salary_table(pre_blocks[0].text, skiprows=skiprows, n_columns=n_columns)

In [17]:
# Try the scraper on the 1999 edition's page (d99t079.asp)
table_scraper('https://nces.ed.gov/programs/digest/d99/d99t079.asp')

Unnamed: 0,State,1969-70,1979-80,1989-90,1994-95,1995-96,1996-97,1997-98
3,United States,8626,15970,31367,36685,37716,38554,39385
5,Alabama,6818,13060,24828,31144,31313,32549,32818
6,Alaska,10560,27210,43153,47951,49171,50647,51738
7,Arizona,8711,15054,29402,32574,33300,33300,33850
8,Arkansas,6307,12299,22352,28934,29533,30319,30578
9,California,10315,18020,37998,41078,42259,42992,43725
11,Colorado,7761,16205,30758,34571,35364,36271,37052
12,Connecticut,9262,16229,40461,50045,50254,50426,50730
13,Delaware,9015,16148,33377,39076,40533,41436,42439
14,District of Columbia,10285,22190,38402,43700,43700,45012,46350
