# Importing Pertinent Libraries

In [3]:
import pandas as pd
from sodapy import Socrata
from urllib.request import urlopen
from zipfile import ZipFile
import json, requests,  time
from bs4 import BeautifulSoup
from io import StringIO, BytesIO
import numpy as np
import io
import re

In [4]:
#importing API credentials

# Each line of the credentials file is parsed as its own JSON document,
# so `data` ends up as a list with one parsed object per line.
# NOTE(review): `data` is not referenced by any other cell visible here — confirm it is still needed.
with open("/Users/Desmond/.secret/CDC_api.json") as f:
    data = list(map(json.loads, f))

#### NOTE: Be sure to change the file path above as necessary to run Socrata properly 

In [5]:
#assigning API client per instructions
# The client targets the "chronicdata.cdc.gov" domain; the second positional argument is None.
# NOTE(review): presumably that argument is the app token (i.e. unauthenticated access) — confirm against the sodapy docs.
client = Socrata("chronicdata.cdc.gov", None)



## Scraping Historical Questionnaire Dictionaries

In [6]:
#pulling BRFSS questions: fetch dataset 'iuq5-y9ct' through the Socrata client (limit of 7000 records)

results = client.get('iuq5-y9ct', limit=7000)

# build a DataFrame from the returned records
results_df = pd.DataFrame.from_records(results)

# printed explicitly: a notebook cell only auto-displays its LAST expression,
# so a bare `results_df.shape` in the middle of the cell would show nothing
print(results_df.shape)

# preview of the first rows (last expression, so it is rendered as the cell output)
results_df.head()

# (the table is written to a CSV file in the following cell)

Unnamed: 0,topic,question,variablename,responses,year,type,displayorder
0,Health Status,Would you say that in general your health is:,GENHLTH,1=Excellent 2=Very good 3=Good 4=Fair 5=Poor 7...,2018,Core Question,1
1,Healthy Days — Health Related Quality of Life,"Now thinking about your physical health, which...",PHYSHLTH,1-30=Number of days 88=None 77=Don’t know/Not ...,2018,Core Question,2
2,Healthy Days — Health Related Quality of Life,"Now thinking about your mental health, which i...",MENTHLTH,1-30=Number of days 88=None 77=Don’t know/Not ...,2018,Core Question,3
3,Healthy Days — Health Related Quality of Life,"During the past 30 days, for about how many da...",POORHLTH,1-30=Number of days 88=None 77=Don’t know/Not ...,2018,Core Question,4
4,Health Care Access,"Do you have any kind of health care coverage, ...",HLTHPLN1,1=Yes 2=No 7=Don’t know/Not Sure 9=Refused,2018,Core Question,5


In [7]:
# write the questions table to 'BRFSS_Hist_Qs.csv' in the working directory, omitting the DataFrame index
results_df.to_csv('BRFSS_Hist_Qs.csv', index=False)

## Scraping Historical Questionnaire Data

In [8]:
#building the list of annual BRFSS page URLs needed for data scraping

# Survey years before 2012 are published under the older '.htm' extension;
# 2012 and later use '.html', so the URL template is chosen per year.
urls = [
    ('http://www.cdc.gov/brfss/annual_data/annual_{}.htm' if year < 2012
     else 'http://www.cdc.gov/brfss/annual_data/annual_{}.html').format(year)
    for year in range(2009, 2019)
]

urls

['http://www.cdc.gov/brfss/annual_data/annual_2009.htm',
 'http://www.cdc.gov/brfss/annual_data/annual_2010.htm',
 'http://www.cdc.gov/brfss/annual_data/annual_2011.htm',
 'http://www.cdc.gov/brfss/annual_data/annual_2012.html',
 'http://www.cdc.gov/brfss/annual_data/annual_2013.html',
 'http://www.cdc.gov/brfss/annual_data/annual_2014.html',
 'http://www.cdc.gov/brfss/annual_data/annual_2015.html',
 'http://www.cdc.gov/brfss/annual_data/annual_2016.html',
 'http://www.cdc.gov/brfss/annual_data/annual_2017.html',
 'http://www.cdc.gov/brfss/annual_data/annual_2018.html']

Now that we have our list of web addresses, we can extract the needed data in the form of zip files.

#### Isolating .zip files

In [9]:
# prospective zip files: collect candidate download links from every annual page in `urls`

options = []

for url in urls:

    response = requests.get(url)

    # fail loudly on an HTTP error instead of scraping an error page
    response.raise_for_status()

    # parse the page once; both layouts below work from the same soup
    soup = BeautifulSoup(response.content, 'html.parser')

    if '.html' not in url:

        # older '.htm' layout: pick one anchor, by position, among those whose href contains 'http:'
        links = soup.find_all('a', attrs={'href': re.compile('http:')})

        # the 2011 page uses the third such anchor; the other '.htm' pages use the second
        position = 2 if url == 'http://www.cdc.gov/brfss/annual_data/annual_2011.htm' else 1

        options.append(links[position].get('href'))

    else:

        # newer '.html' layout: take every anchor inside the third 'col-md-6' div
        columns = soup.find_all('div', class_='col-md-6')

        for x in columns[2].find_all('a'):

            options.append(x.get('href'))

        # nothing else is appended here: re-adding the link found on a previous
        # page would be redundant, and would raise NameError if the first page were '.html'

# drop duplicate links (a set does not keep order, so the list order is arbitrary)
options = list(set(options))

options

['http://www.cdc.gov/brfss/annual_data/2012/files/LLCP2012XPT.ZIP',
 'http://www.cdc.gov/brfss/annual_data/2014/files/LLCP2014XPT.ZIP',
 '/brfss/annual_data/2017/files/LLCP2017ASC.zip',
 '/brfss/annual_data/2017/files/LLCP2017XPT.zip',
 'https://www.cdc.gov/brfss/annual_data/2015/files/LLCP2015ASC.zip',
 '/brfss/annual_data/2016/files/LLCP2016XPT.zip',
 '/brfss/annual_data/2012/LLCP_VarLayout_12_OneColumn.HTML',
 'http://www.cdc.gov/brfss/annual_data/2012/files/LLCP2012ASC.ZIP',
 'http://www.cdc.gov/brfss/annual_data/2013/files/LLCP2013ASC.ZIP',
 '/brfss/annual_data/2016/llcp_varlayout_16_onecolumn.html',
 'https://www.cdc.gov/brfss/annual_data/2015/llcp_varlayout_15_onecolumn.html',
 'http://www.cdc.gov/brfss/annual_data/2011/files/LLCP2011XPT.ZIP',
 '/brfss/annual_data/2016/files/LLCP2016ASC.zip',
 'http://www.cdc.gov/brfss/annual_data/2009/files/CDBRFS09XPT.ZIP',
 'http://www.cdc.gov/brfss/annual_data/2013/files/LLCP2013XPT.ZIP',
 'http://www.cdc.gov/brfss/annual_data/2010/files/CDB

As you can see, our list of links includes a lot of useless entries. We need to isolate the pertinent zip files.

In [10]:
#target XPT files to download
z_files = []

#keep only XPT archive links and normalise each one to an absolute 'http://www.cdc.gov' URL
for candidate in options:

    # discard anything that is not an XPT link, and anything containing 'out'
    if 'XPT' not in candidate or 'out' in candidate:
        continue

    if 'https' in candidate:
        # rewrite https links to the http form
        candidate = candidate.replace('https', 'http')

    elif 'http://www.cdc.gov' not in candidate:
        # site-relative link: prepend the host
        candidate = 'http://www.cdc.gov' + candidate

    z_files.append(candidate)

z_files

['http://www.cdc.gov/brfss/annual_data/2012/files/LLCP2012XPT.ZIP',
 'http://www.cdc.gov/brfss/annual_data/2014/files/LLCP2014XPT.ZIP',
 'http://www.cdc.gov/brfss/annual_data/2017/files/LLCP2017XPT.zip',
 'http://www.cdc.gov/brfss/annual_data/2016/files/LLCP2016XPT.zip',
 'http://www.cdc.gov/brfss/annual_data/2011/files/LLCP2011XPT.ZIP',
 'http://www.cdc.gov/brfss/annual_data/2009/files/CDBRFS09XPT.ZIP',
 'http://www.cdc.gov/brfss/annual_data/2013/files/LLCP2013XPT.ZIP',
 'http://www.cdc.gov/brfss/annual_data/2010/files/CDBRFS10XPT.zip',
 'http://www.cdc.gov/brfss/annual_data/2018/files/LLCP2018XPT.zip',
 'http://www.cdc.gov/brfss/annual_data/2015/files/LLCP2015XPT.zip']

Now that the appropriate zip file URLs have been generated, we can download each archive and extract its contents.

In [None]:
#downloading each zip archive in `z_files` and extracting its contents locally

# NOTE: change this to the folder where the extracted files should be saved
extract_dir = 'C:/Users/Desmond/Course-Work/Capstone_Project/dsc-capstone-project-v2-online-ds-pt-051319'

for url in z_files:

    # open the connection exactly once; the `with` block closes it when done
    # (a second, unmanaged urlopen(url) would download twice and leak the response)
    with urlopen(url) as zipresp:

        # the whole archive is read into memory and wrapped in BytesIO
        # so that ZipFile gets a seekable file-like object
        with ZipFile(BytesIO(zipresp.read())) as zfile:

            zfile.extractall(extract_dir)

#### 

Now that we have our raw data, we need to create csv files to store them.

#### Saving raw data and CSV conversion

In [12]:
#generating raw data file names (the archive name without its 'XPT.<ext>' suffix)
local_files = []

for url in z_files:

    # last path component of the URL, e.g. 'LLCP2012XPT.ZIP'
    archive_name = url.rsplit('/', 1)[-1]

    # cut at the last 'XPT' (case-insensitive) rather than at fixed character offsets,
    # so the result does not depend on the URL prefix length or the length of the stem
    marker = archive_name.upper().rfind('XPT')

    if marker != -1:
        stem = archive_name[:marker]
    else:
        # no 'XPT' in the file name itself: fall back to dropping the extension
        stem = archive_name.rsplit('.', 1)[0]

    local_files.append(stem)

local_files

['LLCP2012',
 'LLCP2014',
 'LLCP2017',
 'LLCP2016',
 'LLCP2011',
 'CDBRFS09',
 'LLCP2013',
 'CDBRFS10',
 'LLCP2018',
 'LLCP2015']

In [13]:
#generating csv file names: one 'BRFSS_<year>' base name per survey year
study_years = range(2009, 2019)

call_files = ['BRFSS_{}'.format(year) for year in study_years]

In [14]:
# display the generated CSV base names
call_files

['BRFSS_2009',
 'BRFSS_2010',
 'BRFSS_2011',
 'BRFSS_2012',
 'BRFSS_2013',
 'BRFSS_2014',
 'BRFSS_2015',
 'BRFSS_2016',
 'BRFSS_2017',
 'BRFSS_2018']

In [None]:
# Convert each extracted .XPT file to a CSV named after its survey year.
# `counter` ends up holding the number of files converted.
counter = 0

for l_file in local_files:

    for file in call_files:

        # file[8:10] is the two-digit year of the CSV base name (e.g. 'BRFSS_2009' -> '09');
        # match it against the END of the raw file name so that digits appearing
        # elsewhere in the name can never produce a false match
        if l_file.endswith(file[8:10]):

            # replace 'Your_File_Path' with the folder holding the extracted .XPT files
            holder = pd.read_sas(('Your_File_Path/{}.XPT').format(l_file))

            holder.to_csv(('{}.csv').format(file), index=None)

            # `+=` increments the tally (`=+ 1` would just assign +1 every time)
            counter += 1

            # each raw file belongs to exactly one year, so stop scanning once it is found
            break

#### NOTE: Be sure to designate your desired file path for saving .XPT files.

All done! Raw data has been acquired and is ready for processing.