In [252]:
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy.special import expit

# Problem 1
## Dataset Generation

Write a function to **generate a training set** of size $m$
- randomly generate a weight vector $w \in \mathbb{R}^{10}$, normalize length
- generate a training set $\{(x_i , y_i)\}$ of size m
  - $x_i$: random vector in $\mathbb{R}^{10}$ from $\textbf{N}(0, I)$
  - $y_i$: $\{0, +1\}$ with $P[y = +1] = \sigma(w \cdot x_i)$ and $P[y = 0] = 1 - \sigma(w \cdot x_i)$

In [253]:
def sigmoid(x):
    return 1/(1+np.exp(-x))
     
def generate_vector(normalize=False):
    v = np.random.randn(10)
    if normalize:
        sum = 0
        for i in v:
            sum += i * i 
        norm = math.sqrt(sum)
        v = [i*1/norm for i in v]
    return v

def generate_label(x_i,w):
    s = expit(np.dot(w,x_i))
    X = np.random.uniform(low=0,high=1)
    y_i = 0
    if X <= s:
        y_i = 1
    return y_i

def generate_data(m):
    x = []
    for i in range(m):
        x.append(generate_vector(normalize=False))
    x = np.array(x)
    return x

def new_data(m):
    w = generate_vector(normalize=True) 
    x = generate_data(m)
    y =[generate_label(i,w) for i in x ]
    return w, x, y 

# w_ = generate_vector(normalize=True) 
# x_m = generate_data(10)
# y_m = np.array([generate_label(i,w_) for i in x_m ])


## Algorithm 1: logistic regression

The goal is to learn $w$.  Algorithm 1 is logistic
  regression (you may use the built-in method LogisticRegression for this. Use max_iter=1000).

In [254]:
from sklearn.linear_model import LogisticRegression

def algorithm1(X,Y):
    clf = LogisticRegression(random_state=0, max_iter=1000).fit(X, Y)
    w_prime = np.array(clf.coef_[0])
    return w_prime

## Algorithm 2: gradient descent with square loss

Define square loss as
$$L_i(w^{(t)}) = \frac{1}{2} \left( \sigma(w^{(t)} \cdot x) - y_i \right)^2$$

  Algorithm 2 is
  gradient descent with respect to square loss (code this
  up yourself -- run for 1000 iterations, use step size eta = 0.01).

In [264]:
w_t = np.zeros(10)
w_star, x , y = new_data(10)

for j in range(10):
    for i in range(10): 
        f = expit(np.dot(w_t, x[i]))
        loss = 0.5*(f - y[i])**2
        w_t -= 0.01*loss



0.5
0.5024054926971206
-0.5040960531356005
0.4997268957252963
0.5013071712056889
0.49909601917050767
0.4948656958934071
-0.5115727408172213
0.48250116491129064
0.5087782417307009
0.48875049740641047
0.5263764131541385
-0.5246091448460825
0.4988101205270909
0.5046048770115346
0.49726878820790554
0.48621255736563185
-0.528274581520767
0.4604124228516663
0.5186523703730174
0.47736258760156375
0.5505457792841946
-0.5453217826587012
0.49788037061654655
0.5079488749861033
0.4954157387453892
0.4774452324292859
-0.5451496147691148
0.438163056537869
0.5286585351882065
0.4658284515734924
0.5748370901265436
-0.5661943148820727
0.49693604062082203
0.5113446394801678
0.49353371227441145
0.4685544063729437
-0.5621851530347122
0.4158110030687997
0.5388055381548867
0.4541413702229719
0.5991665663426531
-0.5871814666146378
0.49597553383983356
0.5147975824063877
0.4916195718561377
0.4595314250605883
-0.5793649167739143
0.39342057762507265
0.5491011543580353
0.442296119203083
0.6234431824949219
-0.608231

## Algorithm 3: stochastic gradient descent with square loss
Similar to gradient descent, except we use the gradient at a single random training point every iteration.

## Evaluation

Measure error $\|w - \hat{w}\|_2$ for each method at different sample size. For any
  fixed value of $m$, choose many different $w$'s and average the
  values $\|w - 
  \hat{w}\|_2$ for Algorithms 1, 2 and 3.  Plot the results
  for for each algorithm as you make $m$ large (use $m=50, 100, 150, 200, 250$).
  Also record, for each algorithm, the time taken to run the overall experiment.

In [171]:
a1 = []

for i in range(w_norms_a1.size):
    M = [50,100,150,200,250]
    w_norms_a1 = np.zeros(10)
    for m in M:
        w_star, x , y = new_data(m)
        w_new = algorithm1(x,y)
        w_sub = np.subtract(w_star,w_new)
        w_norm_diff = np.linalg.norm(w_sub)
        w_norms_a1[i] = w_norm_diff
    print(np.average(w_norms_a1))
    a1.append(np.average(w_norms_a1))

    




0.06439645630379462
0.0531074375162686
0.0521224498583167
0.030675101640195924
0.03492726280937618
0.046498243791706376
0.04100560678755705
0.03073333286844171
0.046051028181084294
0.05682052170392674


# Problem 2

In [126]:
from sklearn import datasets

In [127]:
cancer = datasets.load_breast_cancer()

For each depth in $1, \dots, 5$, instantiate an AdaBoost classifier with the base learner set to be a decision tree of that depth (set `n_estimators=10` and `learning_rate=1`), and then record the 10-fold cross-validated error on the entire breast cancer data set. Plot the resulting curve of accuracy against base classifier depth. Use $101$ as your random state for both the base learner as well as the AdaBoost classifier every time.