In [50]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from itertools import chain, combinations
from classifiers import TransparentLasso, TransparentRidge
import statsmodels.api as sm

In [9]:
def load_cancer(data_file):
    data = pd.read_csv(data_file, header=None)
    data = data[data[1] == 'R'] #Filter recurring cases
    
    data = data[list(range(2,13))] #Only the outputs and the mean of measurments remain
    X_train = scale(data.drop(2, axis=1).values)
    y_train = scale(data[2].values)
    
    return X_train, y_train

In [57]:
def generate_coeffs(alphas, estimator, X_train, y_train):
    params = []
    errors = []
    
    for alpha in alphas:
        estimator.set_params(alpha=alpha)
        estimator.fit(X_train, y_train)
        
        weights = estimator.get_weights()
        test_error = estimator.score(X_train, y_train)
        
        params.append(weights)
        errors.append(test_error)
        
    return params, errors

In [11]:
X_train, y_train = load_cancer('cancer_dataset/wpbc.data')
correlation_matrix = np.corrcoef(X_train, rowvar=0)

with pd.option_context('display.max_rows', 10, 'display.max_columns', 10, 'expand_frame_repr', False):
    print(pd.DataFrame(correlation_matrix))

          0         1         2         3         4         5         6         7         8         9
0  1.000000 -0.043771  0.996782  0.991735 -0.105645  0.274791  0.650315  0.789122 -0.134267 -0.527729
1 -0.043771  1.000000 -0.046840 -0.063914 -0.166725 -0.053853 -0.145947 -0.081004 -0.078880 -0.021314
2  0.996782 -0.046840  1.000000  0.990156 -0.065480  0.344832  0.695794  0.823447 -0.115174 -0.470574
3  0.991735 -0.063914  0.990156  1.000000 -0.083396  0.285169  0.660180  0.794234 -0.146569 -0.496505
4 -0.105645 -0.166725 -0.065480 -0.083396  1.000000  0.532535  0.483343  0.428687  0.163066  0.705545
5  0.274791 -0.053853  0.344832  0.285169  0.532535  1.000000  0.774112  0.684798  0.337194  0.567399
6  0.650315 -0.145947  0.695794  0.660180  0.483343  0.774112  1.000000  0.921568  0.104802  0.161835
7  0.789122 -0.081004  0.823447  0.794234  0.428687  0.684798  0.921568  1.000000  0.021777  0.012370
8 -0.134267 -0.078880 -0.115174 -0.146569  0.163066  0.337194  0.104802  0.021777 



In [18]:
X = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train)
results = model.fit()

coefficients = results.params
errors = results.bse
scores = np.divide(coefficients, errors)
ols_test_err = results.mse_total

table_2 = pd.DataFrame({'Coefficients':coefficients, 'Std. Errors':errors, 'Z Scores':scores})
print(table_2)

   Coefficients  Std. Errors  Z Scores
0     -6.600475     4.626888 -1.426547
1     -0.007396     0.137952 -0.053614
2      5.512612     4.738450  1.163379
3      1.440395     1.100555  1.308789
4      0.573710     0.327399  1.752325
5     -0.512345     0.503417 -1.017735
6     -0.090348     0.401208 -0.225190
7     -0.645182     0.609636 -1.058307
8      0.292170     0.155995  1.872949
9     -0.035087     0.456051 -0.076937


In [98]:
best_error = 100
xs = list(range(10))
powerset = chain.from_iterable(combinations(xs,n) for n in range(1, len(xs)+1))
for subset in powerset:
    model = sm.OLS(y_train, X_train[:, subset])
    results = model.fit()

    coefficients = results.params
    std_errors = results.bse
    test_error = results.mse_total
    
    if test_error < best_error:
        best_subset = subset
        best_error = test_error
        best_coeffs = coefficients
        best_std_err = std_errors
        
print(subset)

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)


In [81]:
alphas = np.logspace(0, 100, num=200)

ridge = TransparentRidge()
lasso = TransparentLasso()

ridge_coeffs, ridge_errors = generate_coeffs(alphas, ridge, X_train, y_train)
lasso_coeffs, lasso_errors = generate_coeffs(alphas, lasso, X_train, y_train)

In [83]:
lasso_coeffs

[array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0., -0.,  0.,  0.]),
 array([-0., -0., -0., -0.,  0., -0., -0