In [1]:
import numpy as np
import pandas as pd
# import datetime as datetime
from sklearn.impute import SimpleImputer

### Load and clean Polling data and zip code list

In [2]:
## Load data
poll_data = pd.read_csv('data/polling/texaspoll_aug21_q13.csv')
zip_codes = pd.read_csv('data/tx_zip_codes.csv', index_col=0)

In [3]:
## Rename columns
name_map = {'FAMECON':'date', 
            'A lot better off':'tx_is_a_lot_better', 
            'Somewhat better off':'tx_is_a_little_better', 
            'About the same ':'tx_is_abt_same',
            'Somewhat worse off':'tx_is_a_little_worse', 
            'A lot worse off':'tx_is_a_lot_worse', 
            "Don't know":'tx_is_dont_know', 
            'TOTAL BETTER':'tx_is_better',
            'TOTAL WORSE':'tx_is_worse'}
poll_data = poll_data.rename(columns=name_map)

In [4]:
## Drop useless data
poll_data = poll_data.drop(index=[34,35,36,37])
poll_data.tail()

Unnamed: 0,date,tx_is_a_lot_better,tx_is_a_little_better,tx_is_abt_same,tx_is_a_little_worse,tx_is_a_lot_worse,tx_is_dont_know,tx_is_better,tx_is_worse
29,May-12,4.0,15.0,48.0,19.0,13.0,2.0,19.0,32.0
30,Feb-12,3.0,16.0,45.0,23.0,11.0,1.0,20.0,34.0
31,Oct-11,2.0,13.0,40.0,27.0,15.0,1.0,16.0,42.0
32,May-11,3.0,16.0,40.0,25.0,16.0,2.0,18.0,41.0
33,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0


### Add month and year values  for all time from earliest entry to latest

In [5]:
## Convert column to date time 
poll_data['Time'] = pd.to_datetime(poll_data['date'], format='%b-%y')


In [6]:
# Create list of all months and years to fill in missing date values
dates_list = []
new_date = poll_data['Time'].min()
end_date = poll_data['Time'].max()

while new_date != end_date:
    dates_list.append(new_date)
    new_date += pd.DateOffset(months=1)
dates = pd.DataFrame(dates_list)
dates.columns = ['Time']

### Merge polling data to dates list to add in-between dates to polling data

In [7]:
## Merge sets and impute missing values
poll_data = dates.merge(poll_data, how='left', on='Time').sort_values('Time')

In [8]:
# interpolate values by padding using the previous value
merged_interp = poll_data[['tx_is_a_lot_better', 'tx_is_a_little_better',
       'tx_is_abt_same', 'tx_is_a_little_worse', 'tx_is_a_lot_worse',
       'tx_is_dont_know', 'tx_is_better', 'tx_is_worse']].interpolate(method='pad')
poll_data = pd.concat([poll_data[['Time', 'date']], merged_interp], axis=1)

### Add each zip code to all dataframe entries, merge.

In [9]:
## add a location column to both frames for merging, and merge
poll_data['state'] = 'TX'
zip_codes['state'] = 'TX'

poll_data = poll_data.merge(zip_codes, how='left', on='state')
poll_data.rename(columns={'zip_codes':'zip_code'})

Unnamed: 0,Time,date,tx_is_a_lot_better,tx_is_a_little_better,tx_is_abt_same,tx_is_a_little_worse,tx_is_a_lot_worse,tx_is_dont_know,tx_is_better,tx_is_worse,state,zip_code
0,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77494
1,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77449
2,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77084
3,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,78130
4,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77573
...,...,...,...,...,...,...,...,...,...,...,...,...
26870,2021-05-01,,5.0,16.0,53.0,16.0,7.0,3.0,21.0,23.0,TX,78756
26871,2021-05-01,,5.0,16.0,53.0,16.0,7.0,3.0,21.0,23.0,TX,78721
26872,2021-05-01,,5.0,16.0,53.0,16.0,7.0,3.0,21.0,23.0,TX,78252
26873,2021-05-01,,5.0,16.0,53.0,16.0,7.0,3.0,21.0,23.0,TX,79936


### Save.

In [10]:

# ## Save
# poll_data.to_csv('poll_clean.csv')