In [1]:
import pandas as pd
import numpy as np
import math
import os
import glob

In [None]:
# wy = pd.read_csv('qwi_wy_sa_f_gc_ns_op_u.csv.gz', compression='gzip')
# wy.head()
# list(wy.columns.values)
# wy.geography.unique()
# wy.year.unique()

In [None]:
# Load every 2014 QCEW "by area" CSV into a single frame.
# The files are addressed by path instead of calling os.chdir(): changing the
# kernel's working directory would redirect every relative path used by later
# cells and would make this cell fail when it is run a second time.
qcew_dir = '2014.q1-q4.by_area'

extension = 'csv'
# bare file names, sorted so the concatenation order is reproducible
all_filenames = sorted(os.path.basename(p)
                       for p in glob.glob(os.path.join(qcew_dir, '*.{}'.format(extension))))
if not all_filenames:
    raise FileNotFoundError('no .{} files found in {!r}'.format(extension, qcew_dir))

# read area_fips as text so the codes are kept exactly as written in the files
QCEW14 = pd.concat([pd.read_csv(os.path.join(qcew_dir, f), dtype={'area_fips': str})
                    for f in all_filenames])

# set output string length
width = 5
# left-pad with zeroes so every code has the same length
QCEW14['area_fips'] = QCEW14['area_fips'].astype(str).str.zfill(width)
QCEW14.dtypes

In [None]:
cp_ur = pd.read_csv('CountyPairsUnemploymentRate.csv')

## take characters 1-2 of 'Period' as the 'Month' column
## (assumes Period looks like 'M07' -- TODO confirm against the file)
cp_ur['Month'] = cp_ur['Period'].str.slice(1, 3)

## combine 'Year' and 'Month' into a new 'YYYY-MM' column
cp_ur['YearMonth'] = pd.to_datetime(cp_ur[['Year', 'Month']].assign(Day=1)).dt.strftime('%Y-%m')

## .copy() makes the subset an independent frame, so the assignment below
## neither raises SettingWithCopyWarning nor depends on view/copy semantics
cp_ur = cp_ur[['county_id', 'YearMonth', 'Value']].copy()

## some rates are recorded as "-"; blank them by value rather than by
## hard-coded row labels so the cell keeps working if the file changes
cp_ur.loc[cp_ur['Value'] == '-', 'Value'] = np.nan
cp_ur['Value'] = pd.to_numeric(cp_ur['Value'])

## reshape dataframe: one row per county, one column per month
monthly_ur = cp_ur.pivot(index='county_id', columns='YearMonth')['Value']
monthly_ur = monthly_ur.rename_axis(columns=None)
## drop the last month column
## NOTE(review): the reason for discarding the final month is not recorded here
monthly_ur = monthly_ur.iloc[:, :-1]

## calculate quarterly unemployment rate as the mean of each quarter's months.
## The frame is transposed before grouping because DataFrame.groupby(axis=1)
## is deprecated in recent pandas releases.
quarters = pd.PeriodIndex(monthly_ur.columns, freq='Q')
quarterly_ur = monthly_ur.T.groupby(quarters).mean().T

## reshape dataframe to long format: one row per (county_id, Period)
quarterly_ur = quarterly_ur.reset_index()
quarterly_ur = (
    quarterly_ur
    .melt(id_vars=['county_id'], var_name='Period', value_name='UnemploymentRate')
    .sort_values(by=['county_id', 'Period'])
    .reset_index(drop=True)
)

## quarterly_ur.loc[quarterly_ur['UnemploymentRate'].idxmax()]
### county_id              6025
### Period               2011Q3
### UnemploymentRate    31.5333
### Name: 5486, dtype: object

quarterly_ur.to_csv('QuarterlyUnemploymentRate', index=False)

In [None]:
## calculate potential maximum benefit duration for all states during all periods
## missing cells are treated as 0 in every column
bd = pd.read_excel('MaxBD.xlsx', index_col=None).fillna(0)

## every column after the first three gets 26 weeks added and is stored as whole weeks
## (presumably 26 is the regular-program baseline -- TODO confirm)
base_weeks = 26
cols = list(bd.columns)[3:]
bd[cols] = (bd[cols] + base_weeks).astype(int)

## the first three columns form the state lookup table
state_info = bd.iloc[:, 0:3]

state_info.to_csv('StateInfo.csv', index=False)
bd.to_csv('MaxBenefitDuration.csv', index=False)

In [None]:
mbd = pd.read_csv('MaxBenefitDuration.csv')
qur = pd.read_csv('QuarterlyUnemploymentRate')
sinfo = pd.read_csv('StateInfo.csv')

## maximum benefit duration in long format: drop the first two columns,
## then one row per (state_id, Period) with the duration in 'Weeks'
mbd = (
    mbd.drop(columns=mbd.columns[0:2])
       .melt(id_vars=['state_id'], var_name='Period', value_name='Weeks')
       .sort_values(by=['state_id', 'Period'])
       .reset_index(drop=True)
)

## state_id is county_id without its last three digits;
## Year is the first four characters of Period
qur = qur.assign(
    state_id=qur['county_id'].astype(str).str[:-3].astype(np.int64),
    Year=qur['Period'].str.slice(0, 4).astype(np.int64),
)
## keep 2006-2016 only (both ends inclusive)
qur = qur[qur['Year'].between(2006, 2016)]

## attach benefit duration, then state names, to every county-quarter
output_cols = ['state_id', 'State', 'Abbr.', 'county_id', 'Period', 'UnemploymentRate', 'Weeks']
qur_mbd = (
    qur.merge(mbd, how='left', on=['state_id', 'Period'])
       .merge(sinfo, how='left', on='state_id')
)
qur_mbd = qur_mbd[output_cols]

qur_mbd.to_csv('CountyURandMBD.csv', index=False)

In [2]:
# Joining dataframes to one with information on county-pairs, unemployment rate and maximum benefit durations.

df = pd.read_stata('QCEWindustry_minwage_contig.dta')

## extract county-pair ids; blank ids become NaN and are dropped.
## replace() is called on the frame and its result is assigned: calling
## .replace(..., inplace=True) on a column selected from a slice of `df`
## raises SettingWithCopyWarning and does nothing under pandas copy-on-write,
## which would leave '' ids in place and break the int conversion below.
countypairs = df[['pair_id']].replace('', np.nan).dropna()

## only contains unique pairs
pairID = pd.DataFrame(countypairs['pair_id'].unique(), columns=['pair_id'])

## split 'front-back' at the first '-' into the two integer county ids
new = pairID['pair_id'].str.split('-', n=1, expand=True)
pairID['front'] = new[0].astype(int)
pairID['back'] = new[1].astype(int)

pairID = pairID.sort_values(by='pair_id').reset_index(drop=True)

## the same county table is used twice: 'f.'-prefixed for the front county
## and 'b.'-prefixed for the back county; the join keys keep plain names
county_info = pd.read_csv('CountyURandMBD.csv')
df_f = county_info.add_prefix('f.').rename(columns={'f.county_id': 'front', 'f.Period': 'Period'})
df_b = county_info.add_prefix('b.').rename(columns={'b.county_id': 'back', 'b.Period': 'Period'})

## the first merge brings in one row per (pair, Period) from the front county;
## the second matches the back county on the same Period
info = pd.merge(pairID, df_f, how='left', on='front')
info = pd.merge(info, df_b, how='left', on=['back', 'Period'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [3]:
# Display the dtype of every column in the joined county-pair frame `info`.
info.dtypes

pair_id                object
front                   int64
back                    int64
f.state_id              int64
f.State                object
f.Abbr.                object
Period                 object
f.UnemploymentRate    float64
f.Weeks                 int64
b.state_id              int64
b.State                object
b.Abbr.                object
b.UnemploymentRate    float64
b.Weeks                 int64
dtype: object