## References

* [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult) (UCI)

In [None]:
import numpy as np
import pandas as pd

from common import Dataset, column_variants, encode_categorical, load_data
from collections import OrderedDict
from scipy import stats
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random generator so repeated runs of the notebook
# draw the same pseudo-random numbers.
np.random.seed(0)

In [None]:
# Columns removed from the frame before modelling.
unused = [
    'CapitalGain',
    'CapitalLoss',
    'EducationNumber',
    'FinalSamplingWeight',
]

# Columns handed one at a time to `encode_categorical`.
categorical = [
    'Education',
    'MaritalStatus',
    'NativeCountry',
    'Occupation',
    'Race',
    'Relationship',
    'Sex',
    'WorkClass',
]

# Columns standardised with `StandardScaler`.
numerical = [
    'Age',
    'HoursPerWeek',
]

# Load the training file, discard rows containing missing values, remove the
# unused columns and renumber the remaining rows from zero.
data = (
    load_data('data/train.csv')
    .dropna()
    .drop(unused, axis=1)
    .reset_index(drop=True)
)

# Each call returns the frame with one more categorical column encoded.
for column in categorical:
    data = encode_categorical(data, column)

# Fit the scaler on the numerical columns and overwrite them with the
# transformed values in a single step.
scaler = StandardScaler()
data[numerical] = scaler.fit_transform(data[numerical])

data.info(verbose=True)

# `x` is the same object as `data`; popping the target removes the 'Income'
# column from both names and leaves only the features behind.
x = data
y = x.pop('Income')

In [None]:
# Hand-edited switch: leave the condition as True to fit a logistic
# regression, change it to False to fit a linear regression instead.
if True:
    model = LogisticRegression()
else:
    model = LinearRegression()

# `fit` returns the estimator itself, so the in-sample predictions can be
# taken straight from the fitted model.
y_hat = model.fit(x, y).predict(x)

In [None]:
def compute_significance(x, y, y_hat, coefficients, coefficients_null=0):
    """Tabulate standard errors, t-values and p-values for coefficients.

    The standard errors come from the classical least-squares covariance
    estimate ``s^2 * inv(X'X)``, where ``s^2`` is the residual sum of squares
    divided by the residual degrees of freedom (rows minus columns of ``x``).
    The two-sided p-values use a Student t distribution with those same
    degrees of freedom.

    Args:
        x: Design matrix as a DataFrame, one column per coefficient (any
            intercept column must already be included). Its column labels
            become the index of the result.
        y: Observed target values, one per row of ``x``.
        y_hat: Predicted target values, one per row of ``x``.
        coefficients: Estimated coefficients, ordered like the columns of
            ``x``.
        coefficients_null: Value(s) of the coefficients under the null
            hypothesis; a scalar or an array broadcastable against
            ``coefficients``.

    Returns:
        A DataFrame indexed by ``x.columns`` with the columns
        'Coefficient', 'Standard error', 'T-value' and 'P-value'.

    Raises:
        ValueError: If ``x`` does not have more rows than columns, because
            the residual variance is then undefined.
        numpy.linalg.LinAlgError: If ``X'X`` is singular.
    """
    n_samples, n_parameters = x.shape
    degrees_of_freedom = n_samples - n_parameters
    if degrees_of_freedom <= 0:
        raise ValueError(
            'x must have more rows than columns to estimate the residual '
            'variance (got {} rows and {} columns)'.format(
                n_samples, n_parameters))

    # Work on flat float arrays so the subtraction is purely positional.
    residuals = (np.ravel(np.asarray(y, dtype=float))
                 - np.ravel(np.asarray(y_hat, dtype=float)))
    residual_variance = np.sum(residuals ** 2) / degrees_of_freedom

    design = np.asarray(x, dtype=float)
    covariance_matrix = residual_variance * np.linalg.inv(
        np.dot(design.T, design))
    standard_errors = np.sqrt(np.diagonal(covariance_matrix))

    coefficients = np.asarray(coefficients, dtype=float)
    t_values = (coefficients - coefficients_null) / standard_errors

    # The survival function keeps precision for large |t|, where
    # 1 - cdf(|t|) would round to zero. The degrees of freedom match the
    # ones used for the residual variance above.
    p_values = 2 * stats.t.sf(np.abs(t_values), degrees_of_freedom)

    return pd.DataFrame(index=x.columns, data=OrderedDict([
        ('Coefficient', coefficients),
        ('Standard error', standard_errors),
        ('T-value', t_values),
        ('P-value', p_values),
    ]))

# Flatten the fitted intercept and slopes into one vector, intercept first.
coefficients = np.concatenate([
    np.ravel(model.intercept_),
    np.ravel(model.coef_),
])

# Put a matching column of ones in front of the features so the columns of
# the design matrix line up with `coefficients`. The join aligns on the row
# index, so this relies on `x` carrying the default 0..n-1 index.
intercept_column = pd.DataFrame({'Intercept': np.ones(len(x))})
x = intercept_column.join(pd.DataFrame(x))

significance = compute_significance(x, y, y_hat, coefficients)

In [None]:
# Print floats with four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)
# Allow every row of the table to be shown. The former 'display.height'
# option is intentionally not set: it no longer exists in current pandas
# (setting it raises OptionError) and 'display.max_rows' is its replacement.
pd.set_option('display.max_rows', len(significance))

# Bare expression so the notebook renders the table as the cell output.
significance