# Scraping the Gas Price table
### Price is updated daily

In [1]:
#dependencies
import getpass
import os
from io import StringIO
from urllib.parse import quote
from urllib.request import urlopen, Request

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Get Abbreviations by state

In [2]:
# The state abbreviations are not hard-coded: they are scraped from this
# reference page and parsed in the cells below.
url_abv = "https://abbreviations.yourdictionary.com/articles/state-abbrev.html"

In [3]:
# Download the abbreviations page and parse it into a BeautifulSoup tree.
# An explicit timeout is set because requests otherwise waits indefinitely,
# which would hang the notebook if the site stalls.
response = requests.get(url_abv, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# Inspecting the page's HTML showed that the values live in <li> tags,
# so collect every <li> element of the parsed page.
results = soup.find_all('li')

In [5]:
# Total number of <li> elements found on the page (199 in the recorded run);
# only the first 50 are used by the cells below.
len(results)

199

In [6]:
# Accumulator for the raw text of the first 50 <li> tags (the ones that
# contain the abbreviations); it is filled by the loop in the next cell.
ABV = []

In [7]:
# Keep the raw text of the first 50 <li> tags.
# Slice assignment refills the existing list in place, so re-running this cell
# does not append the same 50 entries a second time. Indexing (rather than
# slicing `results`) still raises IndexError if fewer than 50 tags were found.
ABV[:] = [results[i].text for i in range(50)]

In [8]:
# Two parallel lists that the next cells fill from ABV: one for the
# abbreviations and one for the state names.
ABV_new, State_name = [], []

In [9]:
# Abbreviation list: the part after ' - ' in each entry, with spaces removed.
ABV_new.extend(
    ABV[pos].split(' - ')[1].replace(' ', '')
    for pos in range(50)
)

In [10]:
# State name list: the part before ' - ' in each entry.
# Only surrounding whitespace is trimmed. Removing every space would turn
# multi-word names such as "New York" into "NewYork".
for i in range(0, 50):
    ABV_clean = ABV[i].split(' - ')[0]
    State_name.append(ABV_clean.strip())

In [11]:
# Combine the two parallel lists into one lookup table: a 'State' column and
# an 'Abbreviation' column, in that order.
All_States_df = pd.DataFrame(
    dict(State=State_name, Abbreviation=ABV_new)
)

In [12]:
# Preview the first rows to check that names and abbreviations line up.
All_States_df.head()

Unnamed: 0,State,Abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


# Getting the Gas Prices per State

In [13]:
# Append each state abbreviation to the base URL to get all 50 per-state URLs.
url = "https://gasprices.aaa.com/?state="

GasPrice_State_urls = [url + ABV_new[k] for k in range(50)]

In [14]:
# Containers for the scrape: a list that receives one table per state, and an
# empty frame that the combined result is built into afterwards.
Scraped_df, Final_df = [], pd.DataFrame()

In [15]:
# Download every state's page and keep its first HTML table, tagged with the
# state's abbreviation.

# Requests are sent with a browser-like User-Agent. The string is built once,
# outside the loop, from adjacent literals: a backslash continuation inside a
# string literal would embed the next line's indentation in the header value.
agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36')

# Start from an empty list so that re-running this cell does not store every
# table twice.
Scraped_df.clear()

# state_url is a separate name so the base `url` is not overwritten.
for state_abv, state_url in zip(ABV_new, GasPrice_State_urls):
    request = Request(state_url, headers={'User-Agent': agent})
    # The context manager closes the HTTP response even if reading fails.
    with urlopen(request) as page:
        html = page.read().decode()
    # Wrapped in StringIO because passing literal HTML to read_html is
    # deprecated in recent pandas versions.
    tables = pd.read_html(StringIO(html))

    df = tables[0]
    df["state"] = state_abv

    Scraped_df.append(df)
    # Progress indicator: one line per scraped state.
    print(state_abv)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [16]:
# Stack the per-state tables into one frame.
# - Concatenating the whole list includes the last scraped state; a loop over
#   range(0, 49) stops one table short of the 50 that were collected.
# - DataFrame.append was removed in pandas 2.0, and growing a frame inside a
#   loop re-copies it on every iteration.
# - Assigning the result (instead of appending to the old Final_df) makes the
#   cell safe to re-run.
# Each table keeps its own row index (no ignore_index), as append did.
# The guard returns an empty frame instead of letting pd.concat raise on an
# empty list.
Final_df = pd.concat(Scraped_df) if Scraped_df else pd.DataFrame()

In [17]:
# Name the unlabelled first column 'Averages' and rename 'Mid-Grade' to
# 'MidGrade'; all other columns are left untouched.
Final_df = Final_df.rename(
    {'Unnamed: 0': 'Averages', 'Mid-Grade': 'MidGrade'},
    axis='columns',
)

In [19]:
# Preview only: the result of set_index is not assigned, so Final_df itself
# keeps 'state' as a regular column.
Final_df.set_index('state').head()

Unnamed: 0_level_0,Averages,Regular,MidGrade,Premium,Diesel
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AL,Current Avg.,$2.204,$2.487,$2.804,$2.833
AL,Yesterday Avg.,$2.171,$2.463,$2.782,$2.829
AL,Week Ago Avg.,$2.090,$2.389,$2.702,$2.815
AL,Month Ago Avg.,$1.990,$2.297,$2.616,$2.790
AL,Year Ago Avg.,$2.281,$2.589,$2.890,$2.777


# SQL connection

In [20]:
# install_as_MySQLdb() registers PyMySQL under the MySQLdb module name, which
# is the driver a plain 'mysql://' SQLAlchemy URL loads by default.
import pymysql
pymysql.install_as_MySQLdb()

# Declarative base class for ORM models.
# NOTE(review): no model classes deriving from Base are visible in this
# notebook; confirm it is still needed.
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

# NOTE(review): these column types are not referenced in the visible cells.
from sqlalchemy import Column, Integer, String, Float

from sqlalchemy import create_engine

In [21]:
# Connection settings for the local MySQL database.
dbuser = 'root'
# Never store the password in the notebook: read it from the ETL_DB_PASSWORD
# environment variable, or prompt for it when the variable is not set.
# NOTE(review): any password that was previously committed in this cell should
# be treated as compromised and rotated.
dbpassword = os.environ.get('ETL_DB_PASSWORD') or getpass.getpass('MySQL password: ')
dbhost = 'localhost'
dbport = '3306'
dbname= 'etl_db'

# Percent-encode the password so characters such as '@', '/' or ':' cannot be
# mistaken for URL delimiters.
dbpassword_quoted = quote(dbpassword, safe='')

engine = create_engine(f"mysql://{dbuser}:{dbpassword_quoted}@{dbhost}:{dbport}/{dbname}")
# Creates the tables of any models registered on Base.
Base.metadata.create_all(engine)

In [22]:
# ORM session bound to the engine.
# NOTE(review): `session` is not used in the visible cells (the to_sql calls
# below take the engine directly); confirm it is needed.
from sqlalchemy.orm import Session
session = Session(bind=engine)

In [23]:
# Write the combined price table to the 'gasprices' table.
# if_exists='replace' drops and recreates the table on every run;
# index=True also stores the frame's row index as a column.
Final_df.to_sql('gasprices', engine, index=True, if_exists='replace')

In [24]:
# Write the state/abbreviation lookup to the 'states' table.
# if_exists='replace' drops and recreates the table on every run;
# index=True also stores the frame's row index as a column.
All_States_df.to_sql('states', engine, index=True, if_exists='replace')