In [26]:
#import libraries

import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as soup

In [27]:
# Set URL as a variable
url = 'https://nces.ed.gov/programs/digest/d01/dt078.asp'

# Request the page. The timeout keeps the cell from hanging forever if the
# server never answers, and raise_for_status() stops here on an HTTP error
# (4xx/5xx) instead of silently parsing an error page further down.
# NOTE(review): the page is requested with POST; a plain GET is the usual way
# to retrieve a page -- confirm whether this server actually needs POST.
r = requests.post(url, timeout=30)
r.raise_for_status()

# create a "soup" object from the returned HTML
data = soup(r.text, 'html.parser')

In [28]:
# Identify element tag with target data:
# take the first <pre> element nested inside a table cell and display it.
data.select('table td pre')[0]

<pre style="text-align:left; padding-left:4px">
_________________________________________________________________________________________________________________________________________________________
                      |                                                               |                                                       | Percent
                      |                    Current dollars                            |                 Constant 2000-01 dollars              | change,
                      |                                                               |                                                       |1989-90 to
        State         |_______________________________________________________________|_______________________________________________________|2000-01 in
                      |       |        |        |        |        |         |         |        |        |        |        |        |          |constant
                      |1969-70|197

In [29]:
# The data of interest is plain text laid out to look like a table.
# Take the first matching <pre> element and keep only its text content.
doi = data.select('table td pre')[0].get_text()
print(doi)


_________________________________________________________________________________________________________________________________________________________
                      |                                                               |                                                       | Percent
                      |                    Current dollars                            |                 Constant 2000-01 dollars              | change,
                      |                                                               |                                                       |1989-90 to
        State         |_______________________________________________________________|_______________________________________________________|2000-01 in
                      |       |        |        |        |        |         |         |        |        |        |        |        |          |constant
                      |1969-70|1979-80 |1989-90 |1995-96 |1998-99 |1999-2000| 200

In [30]:
# read_csv treated the scraped string as a file name, so the text is wrapped
# in io.StringIO and handed over as an in-memory file instead.

import io

salary_df = pd.read_csv(
    io.StringIO(doi),
    sep='|',                 # separate columns by |-symbol
    skiprows=7,
    usecols=list(range(8)),  # keep only the first eight columns
)
salary_df

Unnamed: 0,Unnamed: 1,1969-70,1979-80,1989-90,1995-96,1998-99,1999-2000,2000-01
0,______________________,_______,________,________,________,________,_________,_________
1,1,2,3,4,5,6,7,8
2,______________________,_______,________,________,________,________,_________,_________
3,United States .....,"$8,626","$15,970","$31,367","$37,642","$40,580","$41,724","$42,898"
4,,_______,________,________,________,________,_________,_________
...,...,...,...,...,...,...,...,...
61,Washington ...........,9225,18820,30457,37853,38687,41013,42101
62,West Virginia.........,7650,13710,22842,32155,34244,35008,35764
63,Wisconsin ............,8963,16006,31921,38182,40657,41153,41646
64,Wyoming ..............,8232,16012,28141,31571,33500,34140,34189


### Clean the df

In [31]:
# Give the first column the label 'State'
salary_df = salary_df.rename(columns={salary_df.columns[0]: 'State'})

salary_df.columns

Index(['State', '1969-70', '1979-80 ', '1989-90 ', '1995-96 ', '1998-99 ',
       '1999-2000', ' 2000-01 '],
      dtype='object')

In [32]:
# Remove white space from the column names

# Map every current column name to its whitespace-trimmed version
new_column_names = {name: name.strip() for name in salary_df.columns}

salary_df = salary_df.rename(columns=new_column_names)

salary_df.head()

Unnamed: 0,State,1969-70,1979-80,1989-90,1995-96,1998-99,1999-2000,2000-01
0,______________________,_______,________,________,________,________,_________,_________
1,1,2,3,4,5,6,7,8
2,______________________,_______,________,________,________,________,_________,_________
3,United States .....,"$8,626","$15,970","$31,367","$37,642","$40,580","$41,724","$42,898"
4,,_______,________,________,________,________,_________,_________


In [33]:
# remove the trailing '...' filler from state names; anything that is not a name becomes NaN
def period_remover(value):
    """Return the cleaned state name held in ``value``, or NaN if there is none.

    Parameters
    ----------
    value : object
        Raw cell from the first column, e.g. ``'Alabama ..............'``.

    Returns
    -------
    str or float
        The text before the first period, stripped of surrounding whitespace,
        when the stripped cell is longer than one character and starts with an
        ASCII letter. Otherwise ``np.nan`` (separator rows, numbering rows,
        blank cells, non-string cells).
    """
    # Non-string cells (e.g. a NaN produced by the parser) cannot be names.
    if not isinstance(value, str):
        return np.nan

    temp = value.strip()

    if len(temp) > 1 and temp[0].lower() in 'abcdefghijklmnopqrstuvwxyz':
        # Everything from the first '.' onward is dot-leader filler.
        return temp.split('.')[0].strip()

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan

# Build a separate frame whose 'State' column holds the cleaned names,
# leaving salary_df itself unchanged
salary_df_clean = salary_df.assign(State=salary_df['State'].apply(period_remover))

salary_df_clean

Unnamed: 0,State,1969-70,1979-80,1989-90,1995-96,1998-99,1999-2000,2000-01
0,,_______,________,________,________,________,_________,_________
1,,2,3,4,5,6,7,8
2,,_______,________,________,________,________,_________,_________
3,United States,"$8,626","$15,970","$31,367","$37,642","$40,580","$41,724","$42,898"
4,,_______,________,________,________,________,_________,_________
...,...,...,...,...,...,...,...,...
61,Washington,9225,18820,30457,37853,38687,41013,42101
62,West Virginia,7650,13710,22842,32155,34244,35008,35764
63,Wisconsin,8963,16006,31921,38182,40657,41153,41646
64,Wyoming,8232,16012,28141,31571,33500,34140,34189


In [34]:
# Drop every row that contains a missing value in any column
cleaner_salary_df = salary_df_clean.dropna(axis='index', how='any')
cleaner_salary_df.head(10)

Unnamed: 0,State,1969-70,1979-80,1989-90,1995-96,1998-99,1999-2000,2000-01
3,United States,"$8,626","$15,970","$31,367","$37,642","$40,580","$41,724","$42,898"
5,Alabama,6818,13060,24828,31313,35820,36689,37956
6,Alaska,10560,27210,43153,49620,46845,47262,46986
7,Arizona,8711,15054,29402,32483,35025,35650,36302
8,Arkansas,6307,12299,22352,29533,32330,33386,34476
9,California,10315,18020,37998,42259,46129,47680,48923
11,Colorado,7761,16205,30758,35364,38157,38163,39284
12,Connecticut,9262,16229,40461,50254,51424,51780,52100
13,Delaware,9015,16148,33377,40533,43164,44435,47047
14,District of Columbia,10285,22190,38402,43700,47076,47076,48651


In [35]:
# Show the last five rows (tail() returns five rows by default)
cleaner_salary_df.tail()

Unnamed: 0,State,1969-70,1979-80,1989-90,1995-96,1998-99,1999-2000,2000-01
60,Virginia,8070,14060,30938,34792,37266,38123,40197
61,Washington,9225,18820,30457,37853,38687,41013,42101
62,West Virginia,7650,13710,22842,32155,34244,35008,35764
63,Wisconsin,8963,16006,31921,38182,40657,41153,41646
64,Wyoming,8232,16012,28141,31571,33500,34140,34189


In [36]:
def data_to_int(val):
    """Convert a salary cell such as ``'$8,626'`` to an integer.

    Parameters
    ----------
    val : str
        Raw cell text. Only the part after the last backslash is used
        (presumably to skip footnote markers like ``\\1\\`` -- confirm against
        the source tables); every non-digit character in it is discarded.

    Returns
    -------
    int or float
        The digits joined into an ``int``. ``np.nan`` when the cell contains no
        digits at all, which covers the ``'---'`` placeholder with or without
        surrounding whitespace.
    """
    digits = re.findall(r'\d', val.split('\\')[-1])

    # No digits means there is nothing to convert ('---', blanks, rules).
    # Previously only an exact '---' was recognised, so a padded ' --- ' cell
    # reached int('') and raised ValueError.
    if not digits:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    return int(''.join(digits))

In [37]:
# Quick manual check of data_to_int on a comma-formatted number
data_to_int('11,001')

11001

In [38]:
salary_df_final = cleaner_salary_df.copy()

# Convert every column after the first one with data_to_int
for column_name in salary_df_final.columns[1:]:
    salary_df_final[column_name] = salary_df_final[column_name].apply(data_to_int)

In [39]:
# Display the converted frame
salary_df_final

Unnamed: 0,State,1969-70,1979-80,1989-90,1995-96,1998-99,1999-2000,2000-01
3,United States,8626,15970,31367,37642,40580,41724,42898
5,Alabama,6818,13060,24828,31313,35820,36689,37956
6,Alaska,10560,27210,43153,49620,46845,47262,46986
7,Arizona,8711,15054,29402,32483,35025,35650,36302
8,Arkansas,6307,12299,22352,29533,32330,33386,34476
9,California,10315,18020,37998,42259,46129,47680,48923
11,Colorado,7761,16205,30758,35364,38157,38163,39284
12,Connecticut,9262,16229,40461,50254,51424,51780,52100
13,Delaware,9015,16148,33377,40533,43164,44435,47047
14,District of Columbia,10285,22190,38402,43700,47076,47076,48651


In [40]:
# Write the frame to CSV without the index column
salary_df_final.to_csv(path_or_buf='Data/2001_avg_teacher_salaries.csv', index=False)

In [None]:
# 2006 and on can use pd.read_html 

# For everything before that: pd.read_csv

# 1995 has a unique url ('https://nces.ed.gov/programs/digest/d95/dtab077.asp')

# Table 77 for 1996 ('https://nces.ed.gov/programs/digest/d96/d96t077.asp')
# Table 77 for 2005 ('https://nces.ed.gov/programs/digest/d05/tables/dt05_077.asp')

# Table 78 for 1997 ('https://nces.ed.gov/programs/digest/d97/d97t078.asp')
# Table 78 for 2000-04 ('https://nces.ed.gov/programs/digest/d00/dt078.asp')

#Table 79 in 1998,99 ('https://nces.ed.gov/programs/digest/d98/d98t079.asp')

# Probe one URL pattern per digest year and report how many <pre> tables each page has
for year in range(1995, 2013):
    print(year)

    # Two-digit year used twice in the URL, e.g. 1995 -> '95'
    yy = str(year)[2:]
    url = f'https://nces.ed.gov/programs/digest/d{yy}/d{yy}t077.asp'

    try:
        r = requests.post(url, timeout=30)
        # HTTP errors (404, 500, ...) do not raise on their own; without this
        # call a missing page would be parsed as if it were real data.
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no data"; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        print(f'No data in this link: {url}')
        continue

    data = soup(r.text, 'html.parser')

    table_list = data.select('table td pre')

    print(len(table_list))

In [41]:
#import libraries

import io
import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as soup

def _clean_state_name(value):
    """Return the text before the first '.' of a state cell, or NaN if the cell is not a name."""
    # Non-string cells (e.g. NaN from the parser) cannot be names.
    if not isinstance(value, str):
        return np.nan

    name = value.strip()

    if len(name) > 1 and name[0].lower() in 'abcdefghijklmnopqrstuvwxyz':
        return name.split('.')[0].strip()

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan


def _salary_to_number(val):
    """Join the digits found after the last backslash of ``val`` into an int; NaN if there are none."""
    digits = re.findall(r'\d', val.split('\\')[-1])

    # Cells without digits (e.g. a '---' placeholder) used to raise ValueError
    # from int(''); they are reported as missing instead.
    if not digits:
        return np.nan

    return int(''.join(digits))


def table_scraper(url, skiprows=6, n_columns=8, timeout=30):
    """Download a pipe-delimited text table from ``url`` and return it as a DataFrame.

    The page is requested, the text of the first ``<pre>`` element nested in a
    table cell is parsed with ``pd.read_csv(sep='|')``, the first column is
    renamed to ``'State'``, column names are stripped of whitespace, rows
    without a usable state name (or with any missing value) are dropped, and
    every remaining column is converted to numbers.

    Parameters
    ----------
    url : str
        Address of the page holding the table.
    skiprows : int, default 6
        Number of leading text lines passed to ``read_csv`` as ``skiprows``.
    n_columns : int, default 8
        How many leading columns of the parsed text to keep.
    timeout : float, default 30
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per retained state; ``'State'`` plus ``n_columns - 1`` numeric
        columns. A column is float instead of int if it contains a cell
        without digits.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or an HTTP error status.
    IndexError
        If the page contains no ``table td pre`` element.
    """
    # ----------------------------------------------------------------------------------------------------------------------

    # NOTE(review): POST is kept from the original code; a plain GET is the
    # usual way to retrieve a page -- confirm whether the server needs POST.
    r = requests.post(url, timeout=timeout)

    # Fail here on 4xx/5xx instead of parsing an error page below.
    r.raise_for_status()

    # create a "soup" object
    data = soup(r.text, 'html.parser')

    # ----------------------------------------------------------------------------------------------------------------------

    # Our data of interest is the text that is formatted like a table
    pre_blocks = data.select('table td pre')
    if not pre_blocks:
        # Same exception type as indexing an empty list, but with a usable message.
        raise IndexError(f'No <pre> table found at {url}')
    doi = pre_blocks[0].text

    # ----------------------------------------------------------------------------------------------------------------------

    salary_df = pd.read_csv(
        io.StringIO(doi),
        sep='|',  # separate columns by |-symbol
        skiprows=skiprows,
        usecols=list(range(n_columns)),
    )

    # ----------------------------------------------------------------------------------------------------------------------

    # Rename first column
    salary_df = salary_df.rename(columns={salary_df.columns[0]: 'State'})

    # Remove white space from the column names
    salary_df = salary_df.rename(
        columns={name: name.strip() for name in salary_df.columns}
    )

    # ----------------------------------------------------------------------------------------------------------------------

    # Clean the state names; rows that do not hold a name get NaN ...
    salary_df_clean = salary_df.assign(
        State=salary_df['State'].apply(_clean_state_name)
    )

    # ... and are removed here, together with any row that has a missing cell.
    salary_df_final = salary_df_clean.dropna(axis=0).copy()

    # ----------------------------------------------------------------------------------------------------------------------

    # Convert every column after 'State' from text to numbers
    for column_name in salary_df_final.columns[1:]:
        salary_df_final[column_name] = salary_df_final[column_name].apply(_salary_to_number)

    return salary_df_final

In [42]:
# Scrape the table at the d02 digest URL and display the result
df_2002 = table_scraper('https://nces.ed.gov/programs/digest/d02/dt078.asp')
df_2002

Unnamed: 0,State,1969-70,1979-80,1989-90,1995-96,1999-2000,2000-01,2001-02
3,United States,8626,15970,31367,37642,41754,43335,44604
5,Alabama,6818,13060,24828,31313,36689,37956,39268
6,Alaska,10560,27210,43153,49620,46462,48123,49418
7,Arizona,8711,15054,29402,32483,35650,36302,36966
8,Arkansas,6307,12299,22352,29533,33386,34641,35389
9,California,10315,18020,37998,42259,47680,52480,53870
11,Colorado,7761,16205,30758,35364,38163,39184,40222
12,Connecticut,9262,16229,40461,50254,51780,52693,54300
13,Delaware,9015,16148,33377,40533,44435,47047,48363
14,District of Columbia,10285,22190,38402,43700,47076,48704,47049


In [43]:
# Write the frame to CSV without the index column
df_2002.to_csv(path_or_buf='Data/2002_avg_teacher_salaries.csv', index=False)

In [44]:
# Scrape the table at the d03 digest URL and display the result
df_2003 = table_scraper('https://nces.ed.gov/programs/digest/d03/tables/dt078.asp')
df_2003

Unnamed: 0,State,1969-70,1979-80,1989-90,1999-2000,2000-01,2001-02,2002-03
3,United States,8626,15970,31367,41827,43400,44683,45822
5,Alabama,6818,13060,24828,36689,37069,37194,38246
6,Alaska,10560,27210,43153,46462,48123,49418,49685
7,Arizona,8711,15054,29402,36902,37167,39973,40894
8,Arkansas,6307,12299,22352,33386,34641,36962,37753
9,California,10315,18020,37998,47680,52480,54348,56283
11,Colorado,7761,16205,30758,38163,39184,40659,41275
12,Connecticut,9262,16229,40461,51780,52693,53551,54362
13,Delaware,9015,16148,33377,44435,47047,48363,50772
14,District of Columbia,10285,22190,38402,47076,48704,47049,50763


In [45]:
# Write the frame to CSV without the index column
df_2003.to_csv(path_or_buf='Data/2003_avg_teacher_salaries.csv', index=False)

In [46]:
# Scrape the table at the d04 digest URL and display the result
df_2004 = table_scraper('https://nces.ed.gov/programs/digest/d04/tables/dt04_078.asp')
df_2004

Unnamed: 0,State,1969-70,1979-80,1989-90,1999-200,2000-01,2001-02,2002-03
3,United States,8626,15970,31367,41827,43400,44683,45822
5,Alabama,6818,13060,24828,36689,37069,37194,38246
6,Alaska,10560,27210,43153,46462,48123,49418,49685
7,Arizona,8711,15054,29402,36902,37167,39973,40894
8,Arkansas,6307,12299,22352,33386,34641,36962,37753
9,California,10315,18020,37998,47680,52480,54348,56283
11,Colorado,7761,16205,30758,38163,39184,40659,41275
12,Connecticut,9262,16229,40461,51780,52693,53551,54362
13,Delaware,9015,16148,33377,44435,47047,48363,50772
14,District of Columbia,10285,22190,38402,47076,48704,47049,50763


In [47]:
# Write the frame to CSV without the index column
df_2004.to_csv(path_or_buf='Data/2004_avg_teacher_salaries.csv', index=False)