In [None]:

# Data handling and plotting libraries used by the cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns
# Apply seaborn's 'whitegrid' style to subsequent plots.
sns.set_style('whitegrid')
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas warnings such as SettingWithCopyWarning; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
from get_clean_data_BT import get_data, clean_data

In [None]:
# Load the working frame through the project helper.
# The argument 2016 is presumably the filing/plan year — TODO confirm against get_data.
df = get_data(2016)

In [None]:
# Preview only the first rows rather than rendering the whole frame.
df.head()

In [None]:
# Summarise the columns of the freshly loaded frame (dtypes and non-null counts).
df.info()

<h1>Data Clean Up and Parsing</h1>

<h3>Convert object counts to numeric</h3>

In [None]:
# Run the project cleaning helper on the loaded frame.
# NOTE(review): `df_clean` is not referenced again in the visible cells — everything
# below keeps operating on `df`. Confirm whether clean_data mutates `df` in place or
# whether the later cells were meant to use `df_clean`.
df_clean = clean_data(df)

In [None]:
# Count columns that have to be parsed into numeric dtype.
# (Not currently converted: 'sb_act_partcp_cnt', 'sb_term_partcp_cnt', 'sb_rtd_partcp_cnt'.)
clean_list = ['part_cnt_PY', 'part_cnt_CY']

for col_name in clean_list:
    df[col_name] = pd.to_numeric(df[col_name])

# Values of 'eir' above 100 are divided by 100; every other value is kept unchanged.
eir_raw = df['eir']
df['eir'] = np.where(eir_raw > 100, eir_raw / 100, eir_raw)

In [None]:
# Re-check column dtypes after the numeric conversion above.
df.info()

<h3>Add informational columns to parse data</h3>

In [None]:
# Derive boolean flag columns from 'type_pension_bnft_code': each flag is True when
# the code string contains the given feature code.
df['pay_related'] = df['type_pension_bnft_code'].str.contains('1A')
df['cash_bal'] = df['type_pension_bnft_code'].str.contains('1C')
df['frozen'] = df['type_pension_bnft_code'].str.contains('1I')
df['pbgc_takeover'] = df['type_pension_bnft_code'].str.contains('1H')
# Match either code with a regex alternation. A second positional argument to
# str.contains binds to `case`, so passing '3C' that way would search for '3B' only.
df['not_qual'] = df['type_pension_bnft_code'].str.contains('3B|3C')

In [None]:
# Count the non-null 'ein' values within each 'not_qual' group.
df.groupby('not_qual')['ein'].count()

<h3>Restrict analysis to: </h3>
<h5><ul><li>active participant count between 100 and 300,000</li>
        <li>funding target non-zero</li>
        <li>filing as tax qualified</li>
    </ul></h5>

In [None]:
# Column summary before the row filter is applied.
df.info()

In [None]:
# Keep rows with positive funding targets in both years, a 2017 participant count
# strictly between 100 and 300000, and a 'not_qual' flag equal to False.
keep_rows = (
    df['fndng_tgt_2016'].gt(0)
    & df['fndng_tgt_2017'].gt(0)
    & df['part_cnt_2017'].lt(300000)
    & df['part_cnt_2017'].gt(100)
    & df['not_qual'].eq(False)
)
df = df[keep_rows]

In [None]:
# Drop every row that has a missing value in any column. Rebinding the name avoids
# mutating, in place, a frame that was produced by boolean indexing.
df = df.dropna()

In [None]:
# Summary statistics of the filtered frame.
df.describe()

In [None]:
# Columns used as model inputs.
feature_cols = [
    'eir',
    'part_cnt_2016',
    'fndng_tgt_2016',
    'tgt_nrml_cost_2016',
    'distrib_drt_partcp_amt',
]
X = df[feature_cols]

In [None]:
# Regression target: the 'fndng_tgt_2017' column.
y = df['fndng_tgt_2017']

In [None]:
# Summary statistics of the selected feature columns.
X.describe()

<h1>Regression Tree</h1>

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Two regression trees of different depth for comparison.
# random_state is pinned so that ties between equally good splits are broken the same
# way on every run and the fitted trees are reproducible.
dtr1 = DecisionTreeRegressor(max_depth=2, random_state=0)
dtr2 = DecisionTreeRegressor(max_depth=3, random_state=0)

In [None]:
# Fit both trees on the full X, y; no hold-out split is made in the visible cells.
dtr1.fit(X,y)
dtr2.fit(X,y)

In [None]:
# Predictions of each tree on the same rows it was fitted on.
y_1, y_2 = (tree.predict(X) for tree in (dtr1, dtr2))

In [None]:
# Sanity check: feature-matrix shape, target shape and row count.
X.shape, y.shape, len(X)

In [None]:
# Plot the observed targets together with both trees' predictions, by row position.
fig, ax = plt.subplots(figsize=(20, 8))
# Integer row positions 0..n-1. np.linspace(0, n, num=n) would instead produce
# fractional positions with step n/(n-1), ending at n.
xx = np.arange(len(X))
ax.scatter(xx, y, s=20, edgecolor="black", c="darkorange", label="data")
ax.plot(xx, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2)
ax.plot(xx, y_2, color="yellowgreen", label="max_depth=3", linewidth=2)
ax.set_xlabel("data")
ax.set_ylabel("target")
ax.set_title("Decision Tree Regression")
ax.legend()
plt.show()

In [None]:
# Score of the depth-2 tree on the same X, y it was fitted on (an in-sample figure;
# for scikit-learn regressors `score` is R^2).
dtr1.score(X,y)

In [None]:
# Score of the depth-3 tree on the same X, y it was fitted on (an in-sample figure;
# for scikit-learn regressors `score` is R^2).
dtr2.score(X,y)