In [1]:
import numpy as np
import pandas as pd
# import datetime as datetime
from sklearn.impute import SimpleImputer

In [2]:
## Load data
poll_data = pd.read_csv('data/polling/texaspoll_aug21_q13.csv')
zip_codes = pd.read_csv('data/tx_zip_codes.csv', index_col=0)

In [3]:
## Rename columns
name_map = {'FAMECON':'date', 
            'A lot better off':'tx_is_a_lot_better', 
            'Somewhat better off':'tx_is_a_little_better', 
            'About the same ':'tx_is_abt_same',
            'Somewhat worse off':'tx_is_a_little_worse', 
            'A lot worse off':'tx_is_a_lot_worse', 
            "Don't know":'tx_is_dont_know', 
            'TOTAL BETTER':'tx_is_better',
            'TOTAL WORSE':'tx_is_worse'}
poll_data = poll_data.rename(columns=name_map)

In [4]:
## Drop useless data
poll_data = poll_data.drop(index=[34,35,36,37])
poll_data.tail()

Unnamed: 0,date,tx_is_a_lot_better,tx_is_a_little_better,tx_is_abt_same,tx_is_a_little_worse,tx_is_a_lot_worse,tx_is_dont_know,tx_is_better,tx_is_worse
29,May-12,4.0,15.0,48.0,19.0,13.0,2.0,19.0,32.0
30,Feb-12,3.0,16.0,45.0,23.0,11.0,1.0,20.0,34.0
31,Oct-11,2.0,13.0,40.0,27.0,15.0,1.0,16.0,42.0
32,May-11,3.0,16.0,40.0,25.0,16.0,2.0,18.0,41.0
33,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0


In [5]:
## Convert column to date time and add month and year column
poll_data['Time'] = pd.to_datetime(poll_data['date'], format='%b-%y')
# poll_data['month'] = pd.DatetimeIndex(poll_data['date']).month
# poll_data['year'] = pd.DatetimeIndex(poll_data['date']).year


In [6]:
# poll_data

In [7]:
dates_list = []
new_date = poll_data['Time'].min()
end_date = poll_data['Time'].max()

while new_date != end_date:
    dates_list.append(new_date)
    new_date += pd.DateOffset(months=1)
dates = pd.DataFrame(dates_list)
dates.columns = ['Time']

In [8]:
# merged = pd.concat([merged.iloc[:,0:1], imp], axis=1)
# imp

In [9]:
poll_data = dates.merge(poll_data, how='left', on='Time').sort_values('Time')
imp = SimpleImputer(strategy="mean")
imp = imp.fit_transform(poll_data.iloc[:,2:])
merged_imputed = pd.concat([poll_data.iloc[:,0:2], pd.DataFrame(imp)], axis=1)
merged_imputed.columns = list(poll_data.columns)

In [10]:
poll_data.columns

Index(['Time', 'date', 'tx_is_a_lot_better', 'tx_is_a_little_better',
       'tx_is_abt_same', 'tx_is_a_little_worse', 'tx_is_a_lot_worse',
       'tx_is_dont_know', 'tx_is_better', 'tx_is_worse'],
      dtype='object')

In [11]:
poll_data[['Time', 'date']]

Unnamed: 0,Time,date
0,2011-02-01,Feb-11
1,2011-03-01,
2,2011-04-01,
3,2011-05-01,May-11
4,2011-06-01,
...,...,...
120,2021-01-01,
121,2021-02-01,Feb-21
122,2021-03-01,Mar-21
123,2021-04-01,Apr-21


In [12]:
merged_interp = poll_data[['tx_is_a_lot_better', 'tx_is_a_little_better',
       'tx_is_abt_same', 'tx_is_a_little_worse', 'tx_is_a_lot_worse',
       'tx_is_dont_know', 'tx_is_better', 'tx_is_worse']].interpolate(method='pad')
# bluh.head(20)
poll_data = pd.concat([poll_data[['Time', 'date']], merged_interp], axis=1)
poll_data

Unnamed: 0,Time,date,tx_is_a_lot_better,tx_is_a_little_better,tx_is_abt_same,tx_is_a_little_worse,tx_is_a_lot_worse,tx_is_dont_know,tx_is_better,tx_is_worse
0,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0
1,2011-03-01,,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0
2,2011-04-01,,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0
3,2011-05-01,May-11,3.0,16.0,40.0,25.0,16.0,2.0,18.0,41.0
4,2011-06-01,,3.0,16.0,40.0,25.0,16.0,2.0,18.0,41.0
...,...,...,...,...,...,...,...,...,...,...
120,2021-01-01,,9.0,14.0,44.0,20.0,11.0,2.0,23.0,31.0
121,2021-02-01,Feb-21,5.0,13.0,49.0,19.0,10.0,5.0,18.0,29.0
122,2021-03-01,Mar-21,5.0,17.0,49.0,20.0,8.0,2.0,22.0,28.0
123,2021-04-01,Apr-21,5.0,16.0,53.0,16.0,7.0,3.0,21.0,23.0


In [13]:
# STOP!

In [14]:
# merged_imputed = pd.DataFrame(imp, columns = list(merged.columns))
# merged_imputed

In [15]:
## add a location column to both frames for merging, and merge
poll_data['state'] = 'TX'
zip_codes['state'] = 'TX'

poll_data = poll_data.merge(zip_codes, how='left', on='state')
poll_data.rename(columns={'zip_codes':'zip_code'})

Unnamed: 0,Time,date,tx_is_a_lot_better,tx_is_a_little_better,tx_is_abt_same,tx_is_a_little_worse,tx_is_a_lot_worse,tx_is_dont_know,tx_is_better,tx_is_worse,state,zip_code
0,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77494
1,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77449
2,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77084
3,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,78130
4,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77573
...,...,...,...,...,...,...,...,...,...,...,...,...
26870,2021-05-01,,5.0,16.0,53.0,16.0,7.0,3.0,21.0,23.0,TX,78756
26871,2021-05-01,,5.0,16.0,53.0,16.0,7.0,3.0,21.0,23.0,TX,78721
26872,2021-05-01,,5.0,16.0,53.0,16.0,7.0,3.0,21.0,23.0,TX,78252
26873,2021-05-01,,5.0,16.0,53.0,16.0,7.0,3.0,21.0,23.0,TX,79936


In [16]:
poll_data.head(50)

Unnamed: 0,Time,date,tx_is_a_lot_better,tx_is_a_little_better,tx_is_abt_same,tx_is_a_little_worse,tx_is_a_lot_worse,tx_is_dont_know,tx_is_better,tx_is_worse,state,zip_codes
0,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77494
1,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77449
2,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77084
3,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,78130
4,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77573
5,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77584
6,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,78613
7,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77433
8,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,75052
9,2011-02-01,Feb-11,3.0,17.0,45.0,24.0,11.0,1.0,20.0,35.0,TX,77429


In [17]:

## Save
poll_data.to_csv('poll_clean.csv')

In [18]:
# county_zip_map = pd.read_csv('data/zip_county_map.csv', index_col=0)
# county_zip_map['county'] = county_zip_map['county'].str.lower()
# county_zip_map['state'] = 'TX'

In [19]:
# county_zip_map

In [20]:
# polling_mapped = pd.merge(left=poll_data, 
#                              right = county_zip_map, 
#                              how='left', 
#                              left_on='state', 
#                              right_on = 'state')
# # covid_copy_mapped[covid_copy_mapped['county'] == 'el paso']
# # covid_copy_mapped = covid_copy_mapped.dropna(subset=['zip_code']).reset_index(drop=True)

In [21]:
polling_mapped

NameError: name 'polling_mapped' is not defined

In [None]:
# dates_list = []
# new_date = polling_mapped['Time'].min()
# end_date = polling_mapped['Time'].max()

# while new_date != end_date:
#     new_date += pd.DateOffset(months=1)
#     dates_list.append(new_date)
# #     print(new_date)
    
# dates = pd.DataFrame(dates_list)
# dates.columns = ['date']
# dates