In [1]:
#import libraries

import numpy as np
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as soup

In [2]:
# Set URL as a variable
url = 'https://nces.ed.gov/programs/digest/d00/dt078.asp'

# Fetch the page with a plain GET: nothing is being submitted to the server, and
# requests only returns the HTML the server sends back (it does not run
# JavaScript or wait for the page to finish loading).
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail here on an HTTP error instead of parsing an error page

# create a "soup" object
data = soup(r.text, 'html.parser')

In [3]:
# Identify the element holding the target data: the first <pre> nested inside a table cell
data.select('table td pre')[0]

<pre style="text-align:left; padding-left:4px">
_______________________________________________________________________________________________________________________________________________________________
                      |                         Current dollars                                                   |     Constant 1998-99 dollars\1\
                      |___________________________________________________________________________________________|____________________________________________
 State or other area  |       |       |          |       |        |           |           |           |           |        |        |        |        |
                      |1939-40|1949-50| 1959-60  |1969-70|1979-80 |   1989-90 |   1996-97 |   1997-98 |   1998-99 |1969-70 |1979-80 |1989-90 |1996-97 |1997-98
______________________|_______|_______|__________|_______|________|___________|___________|___________|___________|________|________|________|________|________
          1  

In [4]:
# The data of interest is plain text laid out like a table, held in the first
# <pre> found under a table cell. Pull that text out and print it.
pre_tag = data.select('table td pre')[0]
doi = pre_tag.text
print(doi)


_______________________________________________________________________________________________________________________________________________________________
                      |                         Current dollars                                                   |     Constant 1998-99 dollars\1\
                      |___________________________________________________________________________________________|____________________________________________
 State or other area  |       |       |          |       |        |           |           |           |           |        |        |        |        |
                      |1939-40|1949-50| 1959-60  |1969-70|1979-80 |   1989-90 |   1996-97 |   1997-98 |   1998-99 |1969-70 |1979-80 |1989-90 |1996-97 |1997-98
______________________|_______|_______|__________|_______|________|___________|___________|___________|___________|________|________|________|________|________
          1           |   2   |   3   |     4    |   5   |  

In [5]:
# pd.read_csv was treating the raw text as a file name, so the text is wrapped
# in an in-memory io.StringIO buffer first.

import io

table_buffer = io.StringIO(doi)

salary_df = pd.read_csv(
    table_buffer,
    sep='|',                   # columns are delimited by the | symbol
    skiprows=5,                # skip the first five lines so the school-year row is the header
    usecols=list(range(10)),   # keep only the first ten columns
)
salary_df

Unnamed: 0,Unnamed: 1,1939-40,1949-50,1959-60,1969-70,1979-80,1989-90,1996-97,1997-98,1998-99
0,______________________,_______,_______,__________,_______,________,___________,___________,___________,___________
1,1,2,3,4,5,6,7,8,9,10
2,______________________,_______,_______,__________,_______,________,___________,___________,___________,___________
3,United States .....,"$1,441","$3,010","$5,174","$9,047","$16,715","$32,638","\2\$40,435","\2\$41,272","$42,459"
4,,_______,_______,__________,_______,________,___________,___________,___________,___________
...,...,...,...,...,...,...,...,...,...,...
67,American Samoa .......,---,---,852,5130,---,---,---,---,---
68,Guam .................,---,---,4107,7800,---,---,---,---,---
69,Puerto Rico ..........,---,---,"\8\ 2,360",---,---,---,---,---,---
70,Virgin Islands .......,---,---,3407,---,---,---,---,---,---


### Clean the DataFrame

In [6]:
# Label the first column 'State'
first_label = salary_df.columns[0]
salary_df.rename(columns={first_label: 'State'}, inplace=True)

salary_df.columns

Index(['State', '1939-40', '1949-50', ' 1959-60  ', '1969-70', '1979-80 ',
       '   1989-90 ', '   1996-97 ', '   1997-98 ', '   1998-99 '],
      dtype='object')

In [7]:
# Trim the surrounding whitespace from every column label

# Map each current label to its stripped form in a single pass
new_column_names = {label: label.strip() for label in salary_df.columns}

salary_df.rename(columns=new_column_names, inplace=True)

salary_df.head()

Unnamed: 0,State,1939-40,1949-50,1959-60,1969-70,1979-80,1989-90,1996-97,1997-98,1998-99
0,______________________,_______,_______,__________,_______,________,___________,___________,___________,___________
1,1,2,3,4,5,6,7,8,9,10
2,______________________,_______,_______,__________,_______,________,___________,___________,___________,___________
3,United States .....,"$1,441","$3,010","$5,174","$9,047","$16,715","$32,638","\2\$40,435","\2\$41,272","$42,459"
4,,_______,_______,__________,_______,________,___________,___________,___________,___________


In [8]:
# Strip the dot leaders (" .....") that follow each state name; cells that do
# not hold a name become NaN.
def period_remover(value):
    """Return the state name from a raw first-column cell, or NaN.

    Parameters
    ----------
    value : str
        Raw cell text, e.g. ``'United States .....'``.

    Returns
    -------
    str or float
        The text before the first period, with surrounding whitespace removed,
        when the stripped cell is longer than one character and starts with an
        ASCII letter. Otherwise ``np.nan`` (ruler rows, single-character cells,
        blank cells, and non-string values such as an existing NaN).
    """
    # A cell that is not text (e.g. already NaN) cannot hold a name.
    if not isinstance(value, str):
        return np.nan

    temp = value.strip()

    if len(temp) > 1 and temp[0].lower() in 'abcdefghijklmnopqrstuvwxyz':
        # Everything from the first '.' onwards is discarded.
        return temp.split('.')[0].strip()

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan

# Build a cleaned copy of the frame whose 'State' column has been passed
# through period_remover; salary_df itself is left untouched.
salary_df_clean = salary_df.assign(
    State=salary_df['State'].apply(period_remover)
)

salary_df_clean

Unnamed: 0,State,1939-40,1949-50,1959-60,1969-70,1979-80,1989-90,1996-97,1997-98,1998-99
0,,_______,_______,__________,_______,________,___________,___________,___________,___________
1,,2,3,4,5,6,7,8,9,10
2,,_______,_______,__________,_______,________,___________,___________,___________,___________
3,United States,"$1,441","$3,010","$5,174","$9,047","$16,715","$32,638","\2\$40,435","\2\$41,272","$42,459"
4,,_______,_______,__________,_______,________,___________,___________,___________,___________
...,...,...,...,...,...,...,...,...,...,...
67,American Samoa,---,---,852,5130,---,---,---,---,---
68,Guam,---,---,4107,7800,---,---,---,---,---
69,Puerto Rico,---,---,"\8\ 2,360",---,---,---,---,---,---
70,Virgin Islands,---,---,3407,---,---,---,---,---,---


In [9]:
# Drop every row that contains a NaN in any column, then preview the first ten rows
cleaner_salary_df = salary_df_clean.dropna(axis='index', how='any')
cleaner_salary_df.head(10)

Unnamed: 0,State,1939-40,1949-50,1959-60,1969-70,1979-80,1989-90,1996-97,1997-98,1998-99
3,United States,"$1,441","$3,010","$5,174","$9,047","$16,715","$32,638","\2\$40,435","\2\$41,272","$42,459"
5,Alabama,744,2111,4002,6954,13338,26200,33744,34040,36740
6,Alaska,---,---,6859,10993,27697,"\2\ 43,161","\2\ 52,033","\2\ 48,760",48085
7,Arizona,1544,3556,5590,8975,16180,33592,"\2\ 44,157","\2\ 44,819","\2\ 45,785"
8,Arkansas,584,1801,3295,6461,12704,23296,"\2\ 31,852",32186,"\2\ 32,879"
9,California,2351,---,"\2\ 6,600",10950,18626,"\2\ 39,309","\2\ 45,349","\2\ 45,610","\2\ 46,593"
11,Colorado,1393,2821,4997,8105,16840,31832,37473,"\2\ 38,590","\2\ 39,421"
12,Connecticut,1861,3558,6008,9597,16989,41888,52067,52480,53429
13,Delaware,1684,3273,"\2\ 5,800",9387,16845,34620,43085,44169,44916
14,District of Columbia,2350,3920,6280,10700,23027,43637,"\2\ 40,854","\2\ 42,068","\2\ 42,974"


In [10]:
# Remove the leftover row labelled 66. errors='ignore' keeps this cell
# re-runnable: the frame is reassigned to itself, so without it a second run
# raises KeyError because the row is already gone.
# NOTE(review): 66 is a hard-coded row label tied to the current page layout —
# confirm it still points at the intended row if the source table changes.
cleaner_salary_df = cleaner_salary_df.drop(index=66, errors='ignore')

cleaner_salary_df.tail(5)

Unnamed: 0,State,1939-40,1949-50,1959-60,1969-70,1979-80,1989-90,1996-97,1997-98,1998-99
64,Wyoming,1169,2798,4937,8496,16830,29047,32620,32979,34683
67,American Samoa,---,---,852,5130,---,---,---,---,---
68,Guam,---,---,4107,7800,---,---,---,---,---
69,Puerto Rico,---,---,"\8\ 2,360",---,---,---,---,---,---
70,Virgin Islands,---,---,3407,---,---,---,---,---,---


In [11]:
def data_to_int(val):
    """Convert a raw salary cell to an integer, or NaN when it holds no number.

    Parameters
    ----------
    val : str
        Raw cell text such as ``'$1,441'``, ``'\\2\\ 43,161'`` or ``'---'``.
        Non-string values are converted with ``int()`` when possible.

    Returns
    -------
    int or float
        The digits found after the last backslash joined into one integer, or
        ``np.nan`` when the cell is the ``'---'`` placeholder, contains no
        digits, or is a non-string value that cannot be converted.
    """
    # Values that are not text (already-numeric cells, NaN, None) have nothing
    # to parse; pass numbers through and treat the rest as missing.
    if not isinstance(val, str):
        try:
            return int(val)
        except (TypeError, ValueError, OverflowError):
            return np.nan

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    if val.strip() == '---':
        return np.nan

    # Only the text after the last backslash is parsed, which discards a
    # leading footnote marker such as \2\ along with its digit.
    digits = re.findall(r'\d', val.split('\\')[-1])

    # int('') raises ValueError, so a cell without digits is reported as missing.
    if not digits:
        return np.nan

    return int(''.join(digits))

In [12]:
# Spot-check the converter on a value that contains a thousands separator
data_to_int('11,001')

11001

In [13]:
# Show the cleaned frame before the salary columns are converted to numbers
cleaner_salary_df

Unnamed: 0,State,1939-40,1949-50,1959-60,1969-70,1979-80,1989-90,1996-97,1997-98,1998-99
3,United States,"$1,441","$3,010","$5,174","$9,047","$16,715","$32,638","\2\$40,435","\2\$41,272","$42,459"
5,Alabama,744,2111,4002,6954,13338,26200,33744,34040,36740
6,Alaska,---,---,6859,10993,27697,"\2\ 43,161","\2\ 52,033","\2\ 48,760",48085
7,Arizona,1544,3556,5590,8975,16180,33592,"\2\ 44,157","\2\ 44,819","\2\ 45,785"
8,Arkansas,584,1801,3295,6461,12704,23296,"\2\ 31,852",32186,"\2\ 32,879"
9,California,2351,---,"\2\ 6,600",10950,18626,"\2\ 39,309","\2\ 45,349","\2\ 45,610","\2\ 46,593"
11,Colorado,1393,2821,4997,8105,16840,31832,37473,"\2\ 38,590","\2\ 39,421"
12,Connecticut,1861,3558,6008,9597,16989,41888,52067,52480,53429
13,Delaware,1684,3273,"\2\ 5,800",9387,16845,34620,43085,44169,44916
14,District of Columbia,2350,3920,6280,10700,23027,43637,"\2\ 40,854","\2\ 42,068","\2\ 42,974"


In [14]:
# Work on a copy so cleaner_salary_df keeps the raw salary text
salary_df_final = cleaner_salary_df.copy()

# Every column after the first is run through data_to_int, one column at a time
for column in salary_df_final.columns[1:]:
    salary_df_final[column] = salary_df_final[column].apply(data_to_int)

In [15]:
# Display the frame after the salary columns have been converted
salary_df_final

Unnamed: 0,State,1939-40,1949-50,1959-60,1969-70,1979-80,1989-90,1996-97,1997-98,1998-99
3,United States,1441.0,3010.0,5174,9047.0,16715.0,32638.0,40435.0,41272.0,42459.0
5,Alabama,744.0,2111.0,4002,6954.0,13338.0,26200.0,33744.0,34040.0,36740.0
6,Alaska,,,6859,10993.0,27697.0,43161.0,52033.0,48760.0,48085.0
7,Arizona,1544.0,3556.0,5590,8975.0,16180.0,33592.0,44157.0,44819.0,45785.0
8,Arkansas,584.0,1801.0,3295,6461.0,12704.0,23296.0,31852.0,32186.0,32879.0
9,California,2351.0,,6600,10950.0,18626.0,39309.0,45349.0,45610.0,46593.0
11,Colorado,1393.0,2821.0,4997,8105.0,16840.0,31832.0,37473.0,38590.0,39421.0
12,Connecticut,1861.0,3558.0,6008,9597.0,16989.0,41888.0,52067.0,52480.0,53429.0
13,Delaware,1684.0,3273.0,5800,9387.0,16845.0,34620.0,43085.0,44169.0,44916.0
14,District of Columbia,2350.0,3920.0,6280,10700.0,23027.0,43637.0,40854.0,42068.0,42974.0


In [16]:
# Save the final table as CSV, leaving out the row index
# NOTE(review): presumably the Data/ directory must already exist — confirm
salary_df_final.to_csv('Data/2000_avg_teacher_salaries.csv', index=False)