In [1]:
from io import StringIO

import pandas as pd
import requests

# Lift pandas' display limits on both axes so frames are rendered without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Project folders, all expressed relative to the notebook's parent directory.
GLOBAL_PATH = '../'
DATA_PATH = GLOBAL_PATH + 'Data/'
OUTPUT_PATH = GLOBAL_PATH + 'Output/'

## 1. Minimum Wage Data

In [38]:
# Download the state minimum-wage page and parse every HTML <table> it contains.
url = 'https://www.laborlawcenter.com/state-minimum-wage-rates'

# Keep the URL and the response in separate names (the URL used to be overwritten
# with the page body), bound the wait with a timeout, and stop on an HTTP error
# status instead of handing an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()

# read_html is given a file-like object: passing literal HTML text is deprecated
# in recent pandas releases.
tables = pd.read_html(StringIO(response.text))
print(len(tables))

tables[0].head(3)

2


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,State,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,
1,Alabama,$7.25,$7.25,$7.25,$7.25,$7.25,$7.25,$7.25,$7.25,$7.25,$7.25,$7.25,TBD,Order Poster
2,Alaska,$7.75,$7.75,$8.75,$9.75,$9.80,$9.84,$9.89,$10.19,$10.34,$10.34,$10.85,TBD,Order Poster


In [31]:
# Build the cleaned minimum-wage table from the first parsed HTML table.
# Drop the first row (it holds the page's own header: State, 2013, ...) and the
# last three columns (2023, 2024 and the trailing "Order Poster" column), keeping
# State plus 2013-2022. The explicit copy leaves tables[0] untouched, so the cell
# can be re-run safely, and lets the column assignments below write to an
# independent frame rather than a slice (no SettingWithCopyWarning).
df = tables[0].iloc[1:, :-3].copy()

# rename the columns (this replaces the default integer labels, so promoting the
# first row to a header beforehand is unnecessary)
df.columns = ['State', 'MinWage13', 'MinWage14', 'MinWage15', 'MinWage16', 'MinWage17', 
              'MinWage18', 'MinWage19', 'MinWage20', 'MinWage21', 'MinWage22']

# remove the dollar sign and convert to float
# regex=False makes '$' a literal character. As a regular expression '$' is the
# end-of-string anchor, so with regex=True current pandas removes nothing and the
# float conversion fails on values such as '$7.25'.
for col in df.columns[1:]:
    df[col] = df[col].str.replace('$', '', regex=False).astype(float)

df

Unnamed: 0,State,MinWage13,MinWage14,MinWage15,MinWage16,MinWage17,MinWage18,MinWage19,MinWage20,MinWage21,MinWage22
1,Alabama,7.25,7.25,7.25,7.25,7.25,7.25,7.25,7.25,7.25,7.25
2,Alaska,7.75,7.75,8.75,9.75,9.8,9.84,9.89,10.19,10.34,10.34
3,Arizona,7.8,7.9,8.05,8.05,10.0,10.5,11.0,12.0,12.15,12.8
4,Arkansas,7.25,7.25,7.5,8.0,8.5,8.5,9.25,10.0,11.0,11.0
5,California,8.0,9.0,9.0,10.0,10.5,11.0,12.0,13.0,14.0,15.0
6,Colorado,7.78,8.0,8.23,8.31,9.3,10.2,11.1,12.0,12.32,12.56
7,Connecticut,8.25,8.7,9.15,9.6,10.1,10.1,10.1,12.0,13.0,14.0
8,Delaware,7.25,7.75,8.25,8.25,8.25,8.25,8.75,9.25,9.25,10.5
9,District of Columbia,8.25,9.5,10.5,11.5,12.5,13.25,14.0,15.0,15.2,16.1
10,Florida,7.79,7.93,8.05,8.05,8.1,8.25,8.46,8.56,8.65,10.0


In [32]:
# Write the cleaned minimum-wage table into the data folder, leaving out the index.
minwage_csv = DATA_PATH + 'minwage.csv'
df.to_csv(minwage_csv, index=False)

## 2. GDP Data

In [2]:
# Load the SAGDP1 all-areas file and reduce it to one row per area with the
# area name followed by the yearly value columns.
file = DATA_PATH + 'SAGDP/' + 'SAGDP1__ALL_AREAS_1997_2021.csv'
df = pd.read_csv(file)

# drop the first 8 rows and the first column (both selected by position);
# the first row kept has index 8, which is Alabama in the output below
# NOTE(review): the 8 dropped rows are presumably a national aggregate - confirm against the file
df = df.iloc[8:, 1:]
# keep LineCode == 1
df = df[df['LineCode'] == 1]
# keep position 0 (GeoName) and everything from position 10 onward (the 2000-2021
# year columns, per the output below); positions 1-9 are discarded.
# The copy makes df an independent frame, so later cells can assign to its
# columns without raising SettingWithCopyWarning.
df = df.iloc[:, [0] + list(range(10, len(df.columns)))].copy()

df.head(3)

Unnamed: 0,GeoName,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
8,Alabama,157221.3,156853.2,160422.4,165134.7,176625.0,184369.5,187270.8,189002.5,186946.7,180707.2,184702.4,187605.8,189245.5,191369.8,189886.3,191335.2,194283.8,196974.9,200372.6,203432.7,199880.8,209979.3
16,Alaska,39406.6,40958.7,42979.0,42355.3,44055.0,45657.1,49190.2,51721.0,51252.1,56215.1,54601.5,55280.3,58283.6,55354.3,54188.2,54740.8,54246.6,54278.7,53327.0,53433.8,50705.2,50869.4
24,Arizona,208439.5,213166.2,220696.7,234065.9,244317.3,262326.0,274179.4,283250.7,280810.1,257498.5,260307.1,266101.5,271440.0,273481.9,276948.9,282577.0,291275.2,303606.1,314827.5,325395.3,327178.0,347656.0


In [3]:
# Replace the headers with short labels: 'State' first, then one GDP<yy> label
# per yearly column, grouped by decade for readability.
gdp_labels = ['State']
gdp_labels += ['GDP00', 'GDP01', 'GDP02', 'GDP03', 'GDP04',
               'GDP05', 'GDP06', 'GDP07', 'GDP08', 'GDP09']
gdp_labels += ['GDP10', 'GDP11', 'GDP12', 'GDP13', 'GDP14',
               'GDP15', 'GDP16', 'GDP17', 'GDP18', 'GDP19']
gdp_labels += ['GDP20', 'GDP21']
df.columns = gdp_labels

In [4]:
# Rescale every column after the first from millions to single units.
gdp_columns = df.columns[1:]
for gdp_col in gdp_columns:
    df[gdp_col] = df[gdp_col].mul(1000000)
    

In [5]:
# Write the GDP table into the data folder, leaving out the index.
gdp_csv = DATA_PATH + 'gdp.csv'
df.to_csv(gdp_csv, index=False)