# Part 1 - Graded Assignment

In [12]:
from sklearn.datasets import load_digits
import numpy as np
import matplotlib.pyplot as plt 

digits = load_digits(n_class=10)

#Create two rows with numbers
firstrow = np.hstack(digits.images[:5,:,:])
secondrow = np.hstack(digits.images[5:10,:,:])

plt.imshow(np.vstack((firstrow,secondrow)))
print ("The numbers shown are: \n",np.vstack((digits.target[:5], 
                                             digits.target[5:10])))

The numbers shown are: 
 [[0 1 2 3 4]
 [5 6 7 8 9]]


In [3]:
#Make a prediction function h
def prediction_function(x,theta):
    x_predict = 1/(1+np.e**-(x @ theta))
    return x_predict

#Use the output of that function to compute the cost function J:
def cost_function(x_predict,y,theta):
    cost = ((-y).T @ np.log(x_predict)-(1-y).T @ np.log(1-x_predict))/len(y)
    return cost

#Create a function that returns the gradient values, given h (x_predict), y and x:
def compute_gradient(x_predict, y, x):
    dtheta = (x_predict.T-y) @ x / len(y)
    return dtheta

In [4]:
iterations = 25

x = np.reshape(digits.images[:1500],(1500,64))
x_test = np.reshape(digits.images[1500:],(297,64))

target = digits.target[:1500]
target_test = digits.target[1500:]


# calculate theta-vectors for each one-vs-all case; save in one 2D-array theta
def minimisation(alpha,theta):
    for i in range(iterations):
        for j in range(theta.shape[1]):
        #calculate a prediction for one-vs-all case j
            x_predict = prediction_function(x,theta[:,j])
        
        #set all values in target vecot not equal to j to 0
            y = np.zeros(x.shape[0])
            y[target == j] = 1
         
        #calculate theta for one-vs-all case j
            theta[:,j] = theta[:,j] - alpha * compute_gradient(x_predict, y, x)
    return theta
    
# calculate the score of the final prediction function

def test_prediction(x_test, target_test, theta):
    
    # calculate array with all predicted values for target
    predictions = prediction_function(x_test,theta).argmax(axis=1)

    #calculate prediction score as fraction of correct predictions and 
    #total number of instances in target_test
    prediction_score = sum(target_test == predictions) / len(target_test)
    return prediction_score

In [8]:
# calculates scores for range a of alphas
def test_alphas(alpha_vector):
    score_vector = np.array([])
    for i in range(len(alpha_vector)):
        theta = minimisation(alpha_vector[i], np.zeros((64,10)))
        score_vector = np.append(score_vector,test_prediction(x_test,target_test,theta))
        theta = np.zeros((64,10))
    return score_vector

#finds the best alpha based on calculated scores
def findBestAlpha(score_vector, alpha_vector):
    index = score_vector.argmax()
    return alpha_vector[index]

alpha_vector = np.linspace(0.001,0.1,100)

score_vector = test_alphas(alpha_vector)

# stores best alpha value for further use
bestAlpha = findBestAlpha(score_vector, alpha_vector)

theta_opt = minimisation(0.01, np.zeros((64,10)))

print("The score of my prediction is:", test_prediction(x_test,target_test,theta_opt))

The score of my prediction is: 0.855218855219


In [6]:
''' 
Summary:
The first three functions are very much alike the three functions from the
linear-regression assignment (vectorised solution). The minimisation takes 
two nested loops, one for the iteratinos, one to apply gradient descent to each of the
n (number of classes) prediction-functions. The score of the function is a little 
inaccurate. In fact, the value of a two-class [0,1] prediction function has a statistical 
relevance (P(Y=1; x theta)). In our case, this information gets lost, since we only use
the index of the max of the prediction_function output to predict a class. This loss of
information is mirrored in the calculated score, that does not take into account
with what confidence the respective class is predicted. 

It would be possible to determin a stop condition for the iterations. However, 
since we calculate for n different prediction_functions, this stop condition 
would have to apply for every hypothesis-function individually. Such a function
causes unnecessary convolution. It is thus more reasonable to choose a higher value
for iterations. This holds particularly, since we choose an optimal value for 
alpha. 

Given a prediction score of 85% as a target value, my own prediction score of
0.8181, calculated from the testing_sample, seems to suffice to assume prevention
of overfitting

    Finally, the calculation of an optimised alpha is very time intensive 
and not efficient. It only serves the purpose 
to show that a perfect alpha can be gained by comparison to other alphas. 
'''

' \nSummary:\nThe first three functions are very much alike the three functions from the\nlinear-regression assignment (vectorised solution). The minimisation takes \ntwo nested loops, one for the iteratinos, one to apply gradient descent to each of the\nn (number of classes) prediction-functions. The score of the function is a little \ninaccurate. In fact, the value of a two-class [0,1] prediction function has a statistical \nrelevance (P(Y=1; x theta)). In our case, this information gets lost, since we only use\nthe index of the max of the prediction_function output to predict a class. This loss of\ninformation is mirrored in the calculated score, that does not take into account\nwith what confidence the respective class is predicted. \n\nIt would be possible to determin a stop condition for the iterations. However, \nsince we calculate for n different prediction_functions, this stop condition \nwould have to apply for every hypothesis-function individually. Such a function\ncauses u

# Part 2 - Graded Assignment

![exercise 1](Assignment1_Part2_1.jpg)
![exercise 3](Assignment1_Part2_2.jpg)

