In [250]:
import pandas as pd
import numpy as np

In [251]:
"""
This module uses data from the Yale Climate Opinions Maps 2018 (YCOM)
and the 2015 Census to create a joint data set matching Climate opinions
to Census data at the County Level.
"""

'\nThis module uses data from the Yale Climate Opinions Maps 2018 (YCOM)\nand the 2015 Census to create a joint data set matching Climate opinions\nto Census data at the County Level.\n'

In [252]:
def get_data(filepath):
    """
    Function to read in data and return a dataframe.
    Takes in a string that is a filepath to csv file
    Returns a dataframe
    """
    df = pd.read_csv(filepath, encoding='latin-1')
    return df
    
     

In [253]:
ycom['GeoType'].unique()


array(['National', 'State', 'County', 'cd115', 'CBSA'], dtype=object)

In [254]:
def select_geography(ycom_df, geography):
    """
    Helper function for get_ycom_counties
    Takes in Data frame of YCOM 2018 data and filters
    the rows of data within the given geography: 
    'National', 'State', 'County', 'cd115', 'CBSA'
    """
    # selecting only the county rows and reseting the index.
    ycom_county = ycom_df.loc[ycom_df['GeoType'] == geography]
    ycom_county = ycom_county.reset_index(drop=True)
    return ycom_county

   

In [274]:
def fix_ycom_county_names(ycom_county):    
    """
    Function to fix county names to match census.
    Takes in Data frame of YCOM 2018 data. Returns data frame with
    names of counties separate from states with names in correct format.
    """
    # Separate the county name from the state nme
    county_state_sep = pd.DataFrame(ycom_county.GeoName.str.split(',').tolist())
    #keep state name in separate column
    ycom_county['State'] = county_state_sep[1]
    #Keep county names in separate columns
    ycom_county['County'] = county_state_sep[0]
    #Remove the words county, parish
    ycom_county['County'] = ycom_county['County'].str.replace('County', '')
    ycom_county['County'] = ycom_county['County'].str.replace('Parish', '')
    #remove any extra spaces
    ycom_county['County'] = ycom_county['County'].str.strip()
    #fix name with special character
    ycom_county.iloc[1802,60] = 'Dona Ana'
    return ycom_county

In [275]:
def get_ycom_counties(ycom_df):
    """
    Function to clean data to obtain only counties in ycom data
    and fix names to match census.
    Takes in Data frame of YCOM 2018 data 
    """
    ycom_county= select_geography(ycom_df, 'County')
    ycom_county= fix_ycom_county_names(ycom_county)
    return ycom_county


In [276]:
def get_census_counties(census_df):
    """
    Filter out counties not included in the YCOM DATA
    [all areas in the Puerto Rico Territory]
    """
    census = census_df[census_df['State'] != 'Puerto Rico']
    #fix name with special character
    census.iloc[1802,2] = 'Dona Ana'
    return census

In [277]:
def join_data(ycom_county, census_county):
    """
    getting one dataframe from the two datasets 
    """

    data = pd.concat(([ycom_county,census_county]),axis = 1)
    
    return data


In [278]:
ycom = get_data('YCOM_2018_Data.csv')
ycom.head()

Unnamed: 0,GeoType,GeoName,TotalPop,happening,happeningOppose,human,humanOppose,consensus,consensusOppose,affectweather,...,governor,governorOppose,localofficials,localofficialsOppose,prienv,prienvOppose,discuss,discussOppose,mediaweekly,mediaweeklyOppose
0,National,US,213649147,70.172,13.633,56.902,31.861,49.454,27.632,61.586,...,55.994,14.826,56.911,14.744,69.651,27.598,35.617,64.24,22.093,76.745
1,State,Alabama,3261408,63.154,16.532,50.774,37.149,39.295,30.611,52.66,...,52.681,15.861,53.229,15.246,67.488,29.13,28.664,71.261,17.414,81.142
2,State,Alaska,470699,68.62,16.346,52.169,36.086,47.342,32.421,57.479,...,47.414,17.783,52.812,15.644,69.707,26.013,38.979,60.179,23.234,75.335
3,State,Arizona,4440635,69.955,15.062,55.654,33.059,49.135,29.271,60.315,...,54.242,15.775,56.81,15.991,69.31,28.514,38.936,60.838,24.705,74.127
4,State,Arkansas,1973591,63.213,17.783,50.788,36.373,40.364,32.307,53.169,...,51.651,16.544,53.59,16.301,64.031,30.337,30.294,69.547,18.869,79.808


In [279]:
census = get_data('us-census-demographic-data/acs2015_county_data.csv')

In [280]:
ycom_county = get_ycom_counties(ycom)





    
    

In [281]:
census_county = get_census_counties(census)
census_county.columns[2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


'County'

In [282]:
data = join_data(ycom_county, census_county)

In [283]:
len(census_county['County'])
len(data)

3142

In [284]:
# # test if the counties are the same and in order in two datasets.

ycom_c  = ycom_county['County'] 
census_c = census_county['County']

for i in range(len(ycom_c)):
    if ycom_c[i] != census_c[i]:
        print(ycom_c[i] , census_c[i], i)

In [285]:
len(census_county['County'])


3142

In [286]:
# test if the counties are the same and in order in two datasets.
if np.all(ycom_county['County'] == census_county['County']):
    print('Yaaaaay!')

Yaaaaay!


In [287]:
ycom_county= select_geography(ycom, 'County')

In [288]:
ycom_county.head()


Unnamed: 0,GeoType,GeoName,TotalPop,happening,happeningOppose,human,humanOppose,consensus,consensusOppose,affectweather,...,governor,governorOppose,localofficials,localofficialsOppose,prienv,prienvOppose,discuss,discussOppose,mediaweekly,mediaweeklyOppose
0,County,"Autauga County, Alabama",36466,59.476,19.778,46.971,40.041,36.184,34.858,48.943,...,49.58,18.175,49.963,18.062,64.844,31.935,27.253,72.704,17.977,80.654
1,County,"Baldwin County, Alabama",139946,60.062,21.933,46.129,39.811,35.712,37.552,51.123,...,49.989,18.996,49.431,18.707,64.19,32.827,30.199,69.77,20.694,77.999
2,County,"Barbour County, Alabama",18387,67.862,11.588,51.93,37.128,41.492,25.576,57.897,...,57.231,12.873,58.252,12.874,69.24,27.044,28.029,71.832,16.276,82.001
3,County,"Bibb County, Alabama",15750,58.232,20.045,45.592,41.583,34.476,34.132,49.326,...,48.855,18.132,49.933,17.688,63.812,32.726,25.73,74.147,17.284,81.341
4,County,"Blount County, Alabama",39472,52.956,26.374,44.09,41.153,32.142,38.182,43.864,...,45.248,21.647,45.527,21.559,61.112,35.662,26.064,73.849,18.272,80.696


In [289]:
ycom_county= select_geography(ycom, 'County')

In [80]:
ycom_county.head()