In [1]:
## General Imports

import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (summarize,
poly,
ModelSpec as MS)
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols

## Lab Specific Imports
from pygam import (s as s_gam,
l as l_gam,
f as f_gam,
LinearGAM,
LogisticGAM)
from ISLP.transforms import (BSpline,
NaturalSpline)
from ISLP.models import bs, ns
from ISLP.pygam import (approx_lam,
degrees_of_freedom,
plot as plot_gam,
anova as anova_gam)

## sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import SplineTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [2]:
## Load the Wage dataset through the ISLP helper
wage = load_data('Wage')

# Pull out the response column and the age column in a single step
y, age = wage['wage'], wage['age']

# Render the loaded table for a quick visual check
display(wage)

Unnamed: 0,year,age,maritl,race,education,region,jobclass,health,health_ins,logwage,wage
0,2006,18,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.318063,75.043154
1,2004,24,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No,4.255273,70.476020
2,2003,45,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes,4.875061,130.982177
3,2003,43,2. Married,3. Asian,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,1. Yes,5.041393,154.685293
4,2005,50,4. Divorced,1. White,2. HS Grad,2. Middle Atlantic,2. Information,1. <=Good,1. Yes,4.318063,75.043154
...,...,...,...,...,...,...,...,...,...,...,...
2995,2008,44,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,2. >=Very Good,1. Yes,5.041393,154.685293
2996,2007,30,2. Married,1. White,2. HS Grad,2. Middle Atlantic,1. Industrial,2. >=Very Good,2. No,4.602060,99.689464
2997,2005,27,2. Married,2. Black,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.193125,66.229408
2998,2005,27,1. Never Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,2. >=Very Good,1. Yes,4.477121,87.981033


In [8]:
# Expand the non-numeric columns of `wage` into indicator (dummy) columns.
one_hot = pd.get_dummies(data=wage)

# Pairwise Pearson correlations across every column of the encoded table.
# NOTE(review): the region dummy shows up as NaN in the rendered output,
# presumably because region takes a single value in this data -- confirm.
one_hot.corr(method='pearson', numeric_only=False)

Unnamed: 0,year,age,logwage,wage,maritl_1. Never Married,maritl_2. Married,maritl_3. Widowed,maritl_4. Divorced,maritl_5. Separated,race_1. White,...,education_3. Some College,education_4. College Grad,education_5. Advanced Degree,region_2. Middle Atlantic,jobclass_1. Industrial,jobclass_2. Information,health_1. <=Good,health_2. >=Very Good,health_ins_1. Yes,health_ins_2. No
year,1.0,0.038425,0.076239,0.065544,0.016966,-0.008739,-0.02495,-0.010042,0.011646,-0.036374,...,-0.024823,0.001241,0.024057,,0.006155,-0.006155,0.001938,-0.001938,-0.008091,0.008091
age,0.038425,1.0,0.217889,0.195637,-0.447154,0.317322,0.057936,0.12155,0.01662,-0.003767,...,-0.069587,0.016924,0.091385,,-0.090691,0.090691,0.138907,-0.138907,0.142589,-0.142589
logwage,0.076239,0.217889,1.0,0.950683,-0.267208,0.279236,-0.01888,-0.053638,-0.030043,0.048192,...,-0.027077,0.181614,0.351706,,-0.205408,0.205408,-0.15828,0.15828,0.369733,-0.369733
wage,0.065544,0.195637,0.950683,1.0,-0.238644,0.256713,-0.023278,-0.055318,-0.034353,0.045018,...,-0.049767,0.165898,0.382369,,-0.206897,0.206897,-0.152337,0.152337,0.30831,-0.30831
maritl_1. Never Married,0.016966,-0.447154,-0.267208,-0.238644,1.0,-0.785539,-0.041905,-0.14178,-0.071731,-0.069931,...,0.046402,-0.009571,-0.074297,,0.038081,-0.038081,0.001205,-0.001205,-0.096578,0.096578
maritl_2. Married,-0.008739,0.317322,0.279236,0.256713,-0.785539,1.0,-0.11948,-0.404246,-0.204521,0.058128,...,-0.049685,0.023098,0.096113,,-0.030924,0.030924,-0.040183,0.040183,0.075109,-0.075109
maritl_3. Widowed,-0.02495,0.057936,-0.01888,-0.023278,-0.041905,-0.11948,1.0,-0.021565,-0.01091,-0.041145,...,-0.021589,0.006623,-0.008403,,0.018676,-0.018676,0.014561,-0.014561,0.007367,-0.007367
maritl_4. Divorced,-0.010042,0.12155,-0.053638,-0.055318,-0.14178,-0.404246,-0.021565,1.0,-0.036913,0.032741,...,0.025069,-0.017602,-0.026432,,-0.005277,0.005277,0.060524,-0.060524,0.02689,-0.02689
maritl_5. Separated,0.011646,0.01662,-0.030043,-0.034353,-0.071731,-0.204521,-0.01091,-0.036913,1.0,-0.022755,...,-0.005529,-0.021063,-0.048477,,-0.011468,0.011468,0.012482,-0.012482,-0.017196,0.017196
race_1. White,-0.036374,-0.003767,0.048192,0.045018,-0.069931,0.058128,-0.041145,0.032741,-0.022755,1.0,...,-0.0114,0.020419,-0.0332,,0.085677,-0.085677,-0.025878,0.025878,0.034508,-0.034508
