## References

* [Adult Data Set](https://archive.ics.uci.edu/ml/datasets/Adult) (UCI)
* [Find p-value (significance) in scikit-learn LinearRegression](https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression) (Stack Overflow)
* [How to compute the standard errors of a logistic regression's coefficients](https://stats.stackexchange.com/questions/89484/how-to-compute-the-standard-errors-of-a-logistic-regressions-coefficients) (Cross Validated)
* [Multiple linear regression](http://docs.roguewave.com/legacy-hpp/anaug/3-2.html) (Rogue Wave)
* [Logistic regression](http://docs.roguewave.com/legacy-hpp/anaug/3-3.html) (Rogue Wave)

In [None]:
import numpy as np
import pandas as pd

from collections import OrderedDict
from problem import Dataset, column_variants, encode_categorical, load_data
from scipy import stats
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# Seed numpy's global RNG so that reruns of the notebook are reproducible.
np.random.seed(0)
# Print floats with four decimals in every displayed table.
pd.set_option('display.float_format', '{:.4f}'.format)
# NOTE: the legacy 'display.height' option is deliberately not set. It was
# deprecated in favour of 'display.max_rows' and later removed from pandas,
# where trying to set it raises an OptionError and aborts the cell.
pd.set_option('display.max_rows', 100)

In [None]:
def test_linear(x, y, y_hat, coefficients, coefficients_null=0):
    variance = sum((y - y_hat)**2) / (x.shape[0] - x.shape[1])
    covariance = variance * np.linalg.inv(np.dot(x.T, x))
    standard_errors = np.sqrt(np.diag(covariance))
    t_values = (coefficients - coefficients_null) / standard_errors
    p_values = [2 * (1 - stats.t.cdf(np.abs(t), (x.shape[0] - 1))) for t in t_values]
    return pd.DataFrame(index=x.columns, data=OrderedDict([
        ('Coefficient', coefficients),
        ('Standard error', standard_errors),
        ('T-value', t_values),
        ('P-value', p_values),
    ]))

def test_logistic(x, y, y_score, coefficients, coefficients_null=0):
    variance = np.prod(y_score, axis=1)
    covariance = np.linalg.inv(np.dot(x.T, np.multiply(variance[:, None], x)))
    standard_errors = np.sqrt(np.diag(covariance))
    t_values = (coefficients - coefficients_null) / standard_errors
    p_values = [2 * (1 - stats.t.cdf(np.abs(t), (x.shape[0] - 1))) for t in t_values]
    return pd.DataFrame(index=x.columns, data=OrderedDict([
        ('Coefficient', coefficients),
        ('Standard error', standard_errors),
        ('T-value', t_values),
        ('P-value', p_values),
    ]))

In [None]:
# Columns dropped before modelling.
unused = ['CapitalGain', 'CapitalLoss', 'EducationNumber', 'FinalSamplingWeight']

# Columns passed one at a time through the project's `encode_categorical`
# helper (presumably some form of indicator encoding -- confirm in `problem`).
categorical = [
    'Education', 'MaritalStatus', 'NativeCountry', 'Occupation',
    'Race', 'Relationship', 'Sex', 'WorkClass',
]

# Columns standardised with `StandardScaler`.
numerical = ['Age', 'HoursPerWeek']

# Load the training data, discard incomplete rows and unused columns, and
# renumber the remaining rows from zero.
data = load_data('data/train.csv')
data = data.dropna().drop(unused, axis=1).reset_index(drop=True)

for name in categorical:
    data = encode_categorical(data, name)

# Fit the scaler on the numerical columns and replace them by their
# standardised values.
scaler = StandardScaler()
data[numerical] = scaler.fit_transform(data[numerical])

# Split off the response; everything left in `data` is used as a predictor.
y = data.pop('Income')
x = data

In [None]:
# Fit both models on the predictors as they are; no intercept column has
# been added to `x` at this point.
linear = LinearRegression().fit(x, y)
y_hat = linear.predict(x)

logistic = LogisticRegression().fit(x, y)
y_score = logistic.predict_proba(x)

# Prepend a column of ones so the design matrix lines up with the
# [intercept, coefficients...] vectors passed to the tests below.
x = pd.DataFrame({'Intercept': np.ones(len(x))}).join(pd.DataFrame(x))

# From here on `linear` and `logistic` hold the summary tables rather than
# the fitted estimators.
linear = test_linear(
    x, y, y_hat,
    np.concatenate((np.ravel(linear.intercept_), np.ravel(linear.coef_))),
)
logistic = test_logistic(
    x, y, y_score,
    np.concatenate((np.ravel(logistic.intercept_), np.ravel(logistic.coef_))),
)

In [None]:
# Two-level column header: model name on top, summary statistic below.
columns = pd.MultiIndex.from_product(
    [['Linear', 'Logistic'], linear.columns],
    names=['Model', 'Summary'],
)
# Place the two summary tables side by side; as the last expression of the
# cell the resulting frame is what the notebook displays.
pd.DataFrame(
    np.concatenate((np.asarray(linear), np.asarray(logistic)), axis=1),
    columns=columns,
    index=linear.index,
)