In [1]:
import psycopg2
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Select seaborn's 'darkgrid' style for subsequent plots.
sns.set_style('darkgrid')
import warnings
# NOTE(review): this suppresses every warning for the whole session (pandas
# chained-assignment warnings and library deprecation notices included) —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Parse the JSON config; the file handle is only needed for the load itself.
with open('form5500_data/config.json') as f:
    conf = json.load(f)

# Connection settings read from the parsed config.
host = conf['host']
database = conf['database']
user = conf['user']
passw = conf['passw']

In [3]:
# Build the connection string with psycopg2's own DSN builder so values that
# contain spaces, quotes or backslashes (e.g. a password) are escaped
# correctly instead of corrupting a hand-formatted "key=value" string.
conn_str = psycopg2.extensions.make_dsn(host=host, dbname=database, user=user, password=passw)

In [4]:
# Open the database connection described by conn_str; `conn` is what the
# pd.read_sql cell below queries through.
conn = psycopg2.connect(conn_str)

<h3>Query - selected fields for 2017 only</h3>

In [None]:
# Selects participant count, effective interest rate, funding-shortfall
# indicator, prior-year funding percent, plan type code and funding target
# percent from sb_full, plus business code and the SB-attached indicator from
# f5500_full (left-joined on EIN and plan number), for rows whose plan year
# begins in 2017.
# The backslash ending the first line is a line continuation inside the string
# literal, so the first two lines of the SELECT list are joined without a newline.
# NOTE(review): the join carries no year condition on f5500_full; if that table
# holds more than one filing per EIN/PN, sb rows will be duplicated — confirm.
query = '''SELECT sb.SB_EIN, sb.SB_PN, sb.SB_TOT_PARTCP_CNT, sb.SB_EFF_INT_RATE_PRCNT, sb.SB_FNDNG_SHORT_IND, \
       sb.SB_PR_YR_FNDNG_PRCNT, sb.SB_PLAN_TYPE_CODE, 
       f.BUSINESS_CODE, f.SCH_SB_ATTACHED_IND, sb.SB_FNDNG_TGT_PRCNT 
FROM sb_full sb 
LEFT JOIN f5500_full f 
ON sb.SB_EIN = f.SPONS_DFE_EIN AND sb.SB_PN = f.SPONS_DFE_PN 
WHERE sb.SB_PLAN_YEAR_BEGIN_DATE BETWEEN '2017-01-01' AND '2017-12-31';'''

In [None]:
# Execute the query over the open connection and load the result set into a DataFrame.
eda_df = pd.read_sql(query, con=conn)

<h3>Check null count by column</h3>

In [None]:
# Print the row count followed by the number of missing values in each column.
print('Total records: {},\nNull distribution:\n{}'.format(len(eda_df), eda_df.isna().sum()))


In [None]:
# Per-column count of populated (non-null) entries in the raw query result.
eda_df.notna().sum()

<h3>Drop all nulls for now </h3>

In [None]:
# Keep only fully-populated rows. The explicit copy makes eda_df2 an
# independent frame, so the column assignments in later cells never act on
# (or warn about) a derived view of eda_df.
eda_df2 = eda_df.dropna().copy()

In [None]:
# Per-column non-null counts after dropping incomplete rows; every column
# should now report the same number.
eda_df2.notna().sum()

In [None]:
# Per-column non-null counts of eda_df2 (this cell repeats the check from the cell before it).
pd.notnull(eda_df2).sum()

In [None]:
# Show column dtypes and non-null counts to see which columns still need conversion.
eda_df2.info()

<h3>Create integer versions of feature columns</h3>

<h3>Participant count</h3>

In [None]:
# Replace the participant-count column with its numeric conversion.
eda_df2['sb_tot_partcp_cnt'] = pd.to_numeric(eda_df2['sb_tot_partcp_cnt'])

<h3>Prior year shortfall indicator</h3>

In [None]:
# List the distinct values of the shortfall indicator before converting it.
eda_df2['sb_fndng_short_ind'].unique()

In [None]:
# Replace the shortfall-indicator column with its numeric conversion.
eda_df2['sb_fndng_short_ind'] = pd.to_numeric(eda_df2['sb_fndng_short_ind'])

<h3>Create new column for left 2 digits of business code
<br>Make dummies</h3>


In [None]:
# Derive `sector` from the first two characters of business_code, converted to a number.
eda_df2['sector'] = pd.to_numeric(eda_df2['business_code'].str[:2])

In [None]:
# Build the indicator columns from the Series itself rather than its bare
# .values, so they carry eda_df2's index. After dropna() that index may have
# gaps; dummies built on a fresh 0..n-1 index would be misaligned by the
# column-wise concat, adding NaN-filled rows and shifting indicators.
eda_df2 = pd.concat([eda_df2, pd.get_dummies(eda_df2['sector'], prefix_sep='_')], axis=1)

In [None]:
# Remove the identifier columns and the columns that are no longer needed in
# a single call: either all five are dropped or, if one is missing, the frame
# is left untouched (the five separate in-place drops could stop half-way).
eda_df2 = eda_df2.drop(
    columns=[
        'sch_sb_attached_ind',
        'sb_ein',
        'sb_pn',
        'business_code',
        'sb_plan_type_code',
    ]
)

In [None]:
# Re-inspect dtypes and columns after the conversions, dummy columns and drops.
eda_df2.info()

<h3>Combine dummies and other features into X featureset</h3>

In [None]:
# Target vector: the funding-target-percent column taken out as an array.
y_fs = eda_df2['sb_fndng_tgt_prcnt'].values

In [None]:
# Remove the target column from the frame now that it has been copied into y_fs.
# This mutates eda_df2 in place, so re-running the cell raises a KeyError.
eda_df2.drop('sb_fndng_tgt_prcnt', inplace=True, axis=1)

In [None]:
def clean_data(df, train=False):
    '''Apply the preliminary cleaning steps to a raw query-result frame.

    Rows containing any null are dropped, ``sb_tot_partcp_cnt`` and
    ``sb_fndng_short_ind`` are converted to numeric dtypes, the
    ``sch_sb_attached_ind``, ``sb_ein`` and ``sb_pn`` columns are removed,
    and a numeric ``sector`` column is derived from the first two characters
    of ``business_code``. The input frame is not modified.

    This is all the data "cleaning" for now...will revisit after further EDA.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named above (plus ``sb_fndng_tgt_prcnt``
        when ``train`` is True).
    train : bool, default False
        When True, split the target column ``sb_fndng_tgt_prcnt`` off the
        cleaned frame and return it separately.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, pandas.Series)
        The cleaned frame; with ``train=True`` a ``(features, target)`` pair
        in which the target column has been removed from ``features``.
    '''
    # Explicit copy: the assignments below must act on an independent frame,
    # not on something derived from the caller's data.
    df_clean = df.dropna().copy()

    # convert string values to numbers where appropriate
    for col in ('sb_tot_partcp_cnt', 'sb_fndng_short_ind'):
        df_clean[col] = pd.to_numeric(df_clean[col])

    # drop un-needed columns
    df_clean = df_clean.drop(columns=['sch_sb_attached_ind', 'sb_ein', 'sb_pn'])

    # create new column for sector
    df_clean['sector'] = pd.to_numeric(df_clean['business_code'].str[:2])

    if train:
        # DataFrame.pop removes the column and returns it. The previous code
        # referenced Series.pop without calling it, so `y` was a bound method
        # and the target stayed inside the feature frame.
        y = df_clean.pop('sb_fndng_tgt_prcnt')
        return df_clean, y
    return df_clean

<h1>Features EDA</h1>

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
# Columns used by the feature EDA plots in the following cells.
# NOTE(review): prelim_feat_df is not created in the cells visible above —
# confirm it is defined before this cell on a fresh kernel.
X2 = prelim_feat_df[[
    'ptp_cnt',
    'sb_eff_int_rate_prcnt',
    'sf_ind',
    'sb_pr_yr_fndng_prcnt',
    'sb_fndng_tgt_prcnt',
    'sb_plan_type_code',
]]

In [None]:
# Switch matplotlib to the dark theme (this stays in effect for later plots),
# then draw the pairwise scatter matrix with 100-bin histograms on the diagonal.
plt.style.use('dark_background')
scatter_matrix(
    X2,
    alpha=0.6,
    figsize=(12, 12),
    diagonal='hist',
    hist_kwds=dict(bins=100),
)
plt.show()

<h3>EIR - restrict to values between 1 and 10 (values outside this range appear to be invalid data entries)</h3>

In [None]:
# Summary statistics for rows with a participant count of at most 100.
X2.loc[X2['ptp_cnt'] <= 100].describe()

In [None]:
# Histogram of participant counts strictly between 100 and 4000.
plt.hist(
    X2.loc[(X2['ptp_cnt'] > 100) & (X2['ptp_cnt'] < 4000), 'ptp_cnt'],
    bins=100,
)
plt.title('Distribution of Participant Count (100 < PC < 4000)')
plt.show()

In [None]:
# Keep rows with 1 < EIR < 10 and 0 < funding target percent < 400 whose
# plan type code is the string '1'.
eir_nonzero = X2.loc[
    (X2['sb_eff_int_rate_prcnt'] > 1)
    & (X2['sb_eff_int_rate_prcnt'] < 10)
    & (X2['sb_fndng_tgt_prcnt'] > 0)
    & (X2['sb_fndng_tgt_prcnt'] < 400)
    & (X2['sb_plan_type_code'] == '1')
]
# Matching target values for the filtered rows.
y_eirnonzero = eir_nonzero['sb_fndng_tgt_prcnt'].values

In [None]:
# Sanity check: the filtered frame, its target array and the EIR column should share the same row count.
eir_nonzero.shape, y_eirnonzero.shape, eir_nonzero['sb_eff_int_rate_prcnt'].shape

In [None]:
# Summary statistics of the effective interest rate in the filtered subset.
eir_nonzero['sb_eff_int_rate_prcnt'].describe()

In [None]:
# 100-bin histogram of the effective interest rate in the filtered subset.
plt.hist(eir_nonzero['sb_eff_int_rate_prcnt'],bins=100)
plt.title('Distribution of EIR (1.0 < EIR < 10.0)')
plt.show()

In [None]:
# Funded status against effective interest rate for the filtered subset,
# drawn through the Axes object created here.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
ax.scatter(eir_nonzero['sb_eff_int_rate_prcnt'], y_eirnonzero)
ax.set_xlabel('Effective Interest Rate')
ax.set_ylabel('Funded Status (AVA basis)')
ax.set_title('FS vs EIR')
plt.show()

In [None]:
# Boxplot of the effective interest rate via pandas.
# The earlier call ended in a dangling `sym=` keyword with no value, which is
# a SyntaxError; the incomplete keyword is removed so the default outlier
# markers are drawn.
eir_nonzero[['sb_eff_int_rate_prcnt']].boxplot()
plt.show()

In [None]:
# Pass the column by keyword: the meaning of sns.boxplot's first positional
# argument differs between seaborn releases (x in older ones, data in newer
# ones), so a positional call is version-dependent.
sns.boxplot(x=eir_nonzero['sb_eff_int_rate_prcnt'])
plt.title('Boxplot of EIR')
plt.show()

In [None]:
# Summary statistics of the filtered EIR column (same call as the earlier describe cell), shown next to the boxplots.
eir_nonzero['sb_eff_int_rate_prcnt'].describe()

In [None]:
# Funded-status boxplot over every plan in prelim_feat_df.
# The column is passed as x= because the meaning of sns.boxplot's first
# positional argument differs between seaborn releases.
sns.boxplot(x=prelim_feat_df['sb_fndng_tgt_prcnt'])
plt.title('Boxplot of FS - all plans')
plt.show()

In [None]:
# Funded-status boxplot restricted to the filtered subset (0 < FS < 400).
# The column is passed as x= because the meaning of sns.boxplot's first
# positional argument differs between seaborn releases.
sns.boxplot(x=eir_nonzero['sb_fndng_tgt_prcnt'])
plt.title('Boxplot of FS (0<FS<400%)')
plt.show()