In [1]:
import numpy as np, os
from pathlib import Path

# Q7

(a) My strategy is to fit a linear model and then sort the features by the absolute value of their weights. Features with small weights most likely correspond to noise, while features with large weights most likely contribute to y. If the features are strongly correlated with each other, I can also use ridge regression with a sufficiently large regularization strength (lambda) to further shrink the weights of the noisy inputs. With this strategy, sorting by the absolute value of the weights should yield 10 features with relatively large coefficients, while the remaining features should have very small weights. There should be a clearly observable 'gap' in the absolute value of the weights between the 10th-largest and the 11th-largest. If the gap is not large, the regularization strength is poorly chosen: if it is too small (e.g. no regularization at all), the noise weights are not suppressed, and if it is too large, all weights are shrunk toward zero together.

(b) The 10 features selected by the ridge fits (the same set for alpha = 5, 10 and 15):
- Using 0-indexing: `[1, 2, 4, 6, 10, 12, 16, 18, 22, 26]`
- Using 1-indexing: `[2, 3, 5, 7, 11, 13, 17, 19, 23, 27]`

In [23]:
# Parse the comma-separated data file into a 2-D float array, ignoring
# whitespace-only lines, then split it into inputs and target.
with open("data/HW4/mystery.dat", "r") as f:
  rows = []
  for raw in f:
    if raw.isspace():
      continue
    rows.append([float(tok) for tok in raw.strip().split(",")])
lines = np.array(rows)
print(lines.shape)

# Every column except the last is an input; the last column is the target.
# Indexing with [-1] (a list) keeps y two-dimensional.
X = lines[:, :-1]
y = lines[:, [-1]]

print(X.shape, y.shape)

(101, 101)
(101, 100) (101, 1)


In [81]:
from sklearn import linear_model
import operator


# Sweep the regularization strength and, for each value, rank the features by
# the magnitude of their fitted weight. The ten largest-magnitude weights are
# reported together with the next five and the gap between the 10th and 11th
# magnitudes, so the separation between strong and weak features can be judged.
for alpha in range(0, 40, 5):
  if alpha == 0:
    # scikit-learn advises against Ridge(alpha=0) for numerical reasons and
    # recommends LinearRegression for the unregularized least-squares fit.
    reg = linear_model.LinearRegression().fit(X, y)
  else:
    reg = linear_model.Ridge(alpha=alpha).fit(X, y)

  w, b = reg.coef_, reg.intercept_

  # coef_ may be 2-D (e.g. when y is passed as a column); flatten it once so
  # there is exactly one weight per feature.
  flat_w = w.flatten()

  # (weight, feature index) pairs ordered by decreasing |weight|.
  out = sorted(zip(list(flat_w), range(len(flat_w))), key=lambda pair: -abs(pair[0]))
  # Indices of the ten strongest features, in ascending index order.
  res = sorted(idx for _, idx in out[:10])

  print(f"alpha={alpha}")
  print("\tOutput indexes: ", res)
  print("\tOutput indexes (1-indexed): ", [v + 1 for v in res])

  print("\tStrongest 10: ", np.round(np.array([v[0] for v in out[:10]]), 3))
  print("\tNext Strongest 5: ", np.round(np.array([v[0] for v in out[10:15]]), 3))
  # Difference in magnitude between the 10th and 11th strongest weights.
  print(f"\tGap: [{abs(out[9][0]) - abs(out[10][0])}]")


alpha=0
	Output indexes:  [1, 10, 20, 23, 26, 33, 43, 57, 67, 70]
	Output indexes (1-indexed):  [2, 11, 21, 24, 27, 34, 44, 58, 68, 71]
	Strongest 10:  [ 3.347  3.204  2.866 -2.501  2.248  2.22  -2.145  2.087 -2.073 -2.024]
	Next Strongest 5:  [ 1.922  1.889  1.859 -1.83  -1.8  ]
	Gap: [0.10115185856595121]
alpha=5
	Output indexes:  [1, 2, 4, 6, 10, 12, 16, 18, 22, 26]
	Output indexes (1-indexed):  [2, 3, 5, 7, 11, 13, 17, 19, 23, 27]
	Strongest 10:  [1.094 1.073 0.936 0.928 0.839 0.824 0.815 0.777 0.737 0.644]
	Next Strongest 5:  [0.374 0.308 0.304 0.277 0.271]
	Gap: [0.270319002169907]
alpha=10
	Output indexes:  [1, 2, 4, 6, 10, 12, 16, 18, 22, 26]
	Output indexes (1-indexed):  [2, 3, 5, 7, 11, 13, 17, 19, 23, 27]
	Strongest 10:  [0.992 0.961 0.865 0.791 0.759 0.753 0.722 0.708 0.705 0.566]
	Next Strongest 5:  [ 0.355  0.264  0.241  0.237 -0.221]
	Gap: [0.211190224943121]
alpha=15
	Output indexes:  [1, 2, 4, 6, 10, 12, 16, 18, 22, 26]
	Output indexes (1-indexed):  [2, 3, 5, 7, 11, 13

# Q8

In [75]:
import pandas as pd

In [77]:
# Read the heart CSV into a DataFrame and preview its first rows.
heart_csv = "data/HW4/heart.csv"
df = pd.read_csv(heart_csv)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [87]:
# Split the table into a feature matrix (every column but the last) and a
# label array (the last column). Selecting with [-1] (a list) keeps the labels
# two-dimensional, i.e. one column rather than a flat vector.
features = df.iloc[:, :-1]
labels = df.iloc[:, [-1]]
X = np.array(features)
y = np.array(labels)

print(X.shape, y.shape)

(303, 13) (303, 1)


In [103]:
# Reproducibly shuffle the row indices, use the first 200 shuffled rows for
# training and the remainder for testing, then standardize both splits with
# statistics computed on the training split only.
np.random.seed(123)
# Take the row count from the data instead of hard-coding it, so the split
# stays valid if the dataset changes size.
n_samples = X.shape[0]
idx = np.random.choice(n_samples, n_samples, replace=False)
tr_idx, te_idx = idx[:200], idx[200:]

X_tr, y_tr = X[tr_idx], y[tr_idx]
X_te, y_te = X[te_idx], y[te_idx]

# Per-feature mean and standard deviation of the training rows.
X_tr_mu = np.mean(X_tr, axis=0, keepdims=True)
X_tr_sig = np.std(X_tr, axis=0, keepdims=True)
# A feature that is constant across the training rows has zero std; use a
# scale of 1 for it so the division below cannot produce NaN/inf.
X_tr_sig = np.where(X_tr_sig == 0, 1.0, X_tr_sig)

# The test rows are scaled with the training statistics, not their own.
X_tr = (X_tr - X_tr_mu) / X_tr_sig
X_te = (X_te - X_tr_mu) / X_tr_sig

print(X_tr.shape, y_tr.shape, X_te.shape, y_te.shape)

(200, 13) (200, 1) (103, 13) (103, 1)


In [108]:
# Fit a logistic-regression classifier (default settings) on the training
# split. The labels are stored as a column, so they are flattened to 1-D first.
reg = linear_model.LogisticRegression()
reg.fit(X_tr, y_tr.ravel())

# Show the learned weights and intercept.
w = reg.coef_
b = reg.intercept_
print(w, b)

[[-0.03662717 -0.97827814  0.80466309 -0.4959262  -0.34684898  0.15028609
   0.17252423  0.69095302 -0.42864215 -0.75648048  0.51113222 -0.83144789
  -0.49069434]] [-0.02337655]
