In [136]:
import pandas as pd
import numpy as np
import random
import math
import warnings
import scipy.sparse as scs
import time

**Exercise 1**

In [137]:
def load_Xy(path):

    #Use built-in pandas function to read csv file into a dataframe
    df = pd.read_csv(path)

    #Convert the dataframe to a numpy array
    data_array = df.to_numpy()

    #Separate the features X from the labels y
    X,y = data_array[:,:-1], data_array[:,-1]

    #Replace the 0 label with -1
    y[y==0] = -1 

    #return X and y
    return X,y


In [138]:
#Call the function and load the data
X,y = load_Xy('data.csv')

In [139]:
#Count how many malicious apps there are
# Filter the y array only where y == -1, then get its dimensions, and then its length
y[ y==-1 ].shape[0]

14632

In [140]:
#Let's count how many non-zero entries are in X
non_zero_entries = X[X!=0].shape[0]

#Total elements in X
tot_entries = X.shape[0]*X.shape[1]

print("Non-zero entries: {}".format(non_zero_entries))
print("Total entries: {}".format(tot_entries))
print("Ratio: {}".format(non_zero_entries/tot_entries))

Non-zero entries: 277180
Total entries: 2522552
Ratio: 0.10988078739308446


Only the $10\%$ of the elements of the matrix $X$ are non-zero, thus $X$ has a sparsity of $90\%$.

In [141]:
np.unique(X)

array([0, 1], dtype=int64)

**Exercise 2**

In [142]:
def train_test_random(X,y,r):    
    if not (r > 0 and r < 1):
        print("r must be between 0 and 1")
        return

    train_size = int(y.size*r)
    index = range(y.size)
    index_train = random.sample(index, train_size)
    index_train.sort()
    y_train = []
    y_test = []
    X_train = []
    X_test = []
    for i in range(y.size):
        if i in index_train:
            y_train.append(y[i])
            X_train.append(X[i,:])
        else:
            y_test.append(y[i])
            X_test.append(X[i,:])
    X_train = np.vstack(X_train)
    X_test = np.vstack(X_test)
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_random(X,y, 0.8)

In [143]:
def train_test_split(X,y,r):

    #Check that r is between 0 and 1
    if not ( r > 0 and r < 1):
        print("r must be between 0 and 1")
        return
        
    #get number of rows
    rows = X.shape[0]

    #create a indices array for the rows
    indices = [i for i in range(rows)]

    #shuffle the array randomly
    np.random.shuffle(indices)

    #Now that the indices are randomized, we can split in train and test
    train_indices = indices[:int(rows*r)]
    test_indices = indices[int(rows*r):]

    X_train = X[train_indices,:]
    X_test = X[test_indices,:]
    y_train = y[train_indices]
    y_test = y[test_indices]

    return X_train, y_train, X_test, y_test


In [144]:
X_train, y_train, X_test, y_test = train_test_split(X,y,0.8)
X_train1, y_train1, X_test1, y_test1 = train_test_split(X,y,0.8)

**Exercise 3**

Let $X \in \R^{m \times n}$ be the data matrix with data columns $\textbf{x}_i,\ldots, \textbf{x}_n, i = 1,\ldots, n$, vector $\textbf{y} \in \{-1,1\}^n$, and a weight vector $\textbf{w} \in \R^m$. If the product $y_i \textbf{x}_i^\intercal \textbf{w} > 0$, the point is correctly classified, and if $y_i \textbf{x}_i^\intercal \textbf{w} <0$, the point is misclassified. We count the points $\textbf{x}_i^\intercal \textbf{w} = 0$ as misclassified, as there is no way of confirming its (mis)classification.

We define a function $f$ that assigns value $0$ to misclassified points, and $1$ for correctly classified points. Mathematically, this function would be defined as $f: \R \rightarrow \{0,1\}$:

$f(s) \coloneqq \frac{1}{2}(\text{sign}(s) + 1)$,

with $f(0) = 0$.

The function for the number of correctly classified points $F: \R^m \rightarrow \Z_{\geq 0}$ is then defined:

$F(\textbf{w}) = \sum_{i = 1} ^ n f(y_i\textbf{x}_i^\intercal \textbf{w})$.

Since Python does not have a sign function, we write a function that fulfills this task. 

We generate a random weight vector $w \in \R^m$ with values $w_j \in [-10,10), \forall j = 1,\ldots,m$, such that the chances of $w_j$ being positive or negative are equal and the chance of $w_j = 0$ is approximately zero, but we include a safety measure to stop the function in the case $\textbf{w}$ contains at least one $0$ element.

The function for the sum of correctly classified points is defined as:

In [145]:
def correct_classification(X,y,w):
    if np.all(w) == False:
        print("w contains at least one element 0")
        return
    I = y*np.dot(X, w) #compute the product y_ix_iw elementwise, I is the indicator vector
    C = 0 #starting value of the number of correct points
    for i in range(I.size):
        if I[i] <= 0: #misclassified
            continue
        else:   #correctly classified
            C += 1
    return I,C
dim = len(X[0])
w = np.random.uniform(-10, 10, (dim)) #random uniform weight vector with equal chances of positive and negative elements
I,C = correct_classification(X, y, w) #test with X and y

**Exercise 4**

The Hinge-Loss function for logistic regression is:

$g(s) = \log{(1+e^{-s})}$.

As such, the cost function for logistic regression is defined as:

$J(\textbf{w}) = \sum_{i=1}^{n} \log{(1+e^{-y_i \textbf{x}_i^T \textbf{w}})} + \frac{\lambda}{2} \lVert \textbf{w} \rVert^2$,

with gradient:

$\nabla J (\textbf{w}) = \sum_{i=1}^{n} -\frac {e^{-y_i \textbf{x}_i^T \textbf{w}}}{1+e^{-y_i \textbf{x}_i^T \textbf{w}}}y_i\boldsymbol{x}_i + \lambda \textbf{w}$.

This gradient is determined as follows: consider the functions $g(s) = \log {(1+e^{-s})}$ and $h_i (\textbf{w}) = y_i \textbf{x}i^T \textbf{w}$. The cost function then becomes:

$J(\textbf{w}) = \sum_{i=1}^{n} g (h_i (\textbf{w})) + \frac{\lambda}{2}\lVert\textbf{w} \rVert^2$.

Note that the derivatives of $g$ and $h_i$ are defined as:

$g^\prime (s) = - \frac {e^{-s}}  {1+e^{-s}} = - \frac {1} {1+e^{s}}\quad$ , $\quad\nabla h_i (\textbf{w}) = y_i \textbf{x}_i$,

and trivially: $\frac{d}{d\textbf{w}}\left(\frac{\lambda}{2}\lVert\textbf{w} \rVert^2\right) = \lambda \textbf{w}$.

Then, due to the chain rule of the Jacobian, the gradient of the cost function becomes:

$\nabla J (\textbf{w}) = \sum_{i=1}^{n} g^\prime (y_i \textbf{x}_i^T \textbf{w}) \cdot \nabla h_i (\textbf{w}) + \lambda \textbf{w}$, 

obtaining the formula above.

**Exercise 5**

In [146]:
warnings.filterwarnings('ignore') #to ignore the RunTimeWarning
def logistic_regression(X,y,a,l,K):
    dim = len(X[0])
    w = np.zeros(dim)
    loss = np.zeros (len(y))
    for i in range(K):
        I = y*np.dot(X,w)
        loss[i] = np.sum( np.log (1+ np.exp (-I) ) ) + l/2*np.dot (w.T,w)
        I = (1 + np.exp(I))
        I = - 1 / I
        I = I * y
        J = np.dot(X.T,I) + l*w
        w = w - a*J
    print (loss)
    return w

t0 = time.time()
w = logistic_regression(X,y,0.8,0.5,1000)
t1 = time.time()
print(w)
print('Running time: ', str(t1-t0), ' seconds')

[20331.39310018            inf            inf ...     0.
     0.             0.        ]
[-4.14524370e+02 -1.24657353e+03  7.89022059e+02 -1.98955882e+02
 -1.66905163e+02 -3.69365732e+03  7.17507353e+02  5.53382353e+01
  1.12454121e+03 -1.49483088e+03  4.55882353e-01  2.55183569e+03
 -3.45857683e+03 -1.00146324e+03 -8.37573529e+02 -6.42573529e+02
 -2.48823529e+01  2.85844148e+03 -1.32500000e+02  1.16938918e+02
  6.45234543e+02  4.99502746e+03 -1.50000000e+02 -2.96766964e+02
  3.10070624e+02 -8.43170048e+03  5.28067718e+02 -1.18257353e+03
 -1.34904412e+02 -1.08365441e+03 -1.53966296e+03 -3.48308824e+02
 -6.27573529e+02 -6.97000000e+02  2.46786765e+02 -1.26904412e+02
 -1.45058824e+02 -1.19757353e+03 -1.68000000e+02 -7.37199265e+03
  1.92198529e+02 -8.32573529e+02 -1.24532353e+03 -3.62352941e+02
  7.98843330e+02 -8.30073529e+02 -2.85000000e+02 -8.01617647e+01
  1.47451664e+02 -6.33823529e+02  7.68195624e+02 -5.08823529e+02
 -1.23455882e+02  1.79909364e+03  5.87867647e+02 -1.50301471e+02
 

In [147]:
#now using sparse linear algebra
def log_reg_sparse(X,y,a,l,K):
    dim = len(X[0])
    X_sparse = scs.csr_matrix(X) #changes X from dense to CSR format
    w = np.zeros(dim)
    loss = np.zeros(len(y))
    for i in range(K):
        I = y*(X_sparse@w)
        loss[i] = np.sum( np.log (1+ np.exp (-I) ) ) + l/2*np.dot (w.T,w)
        J = -1/(np.exp(I)+1)*y
        J = J@X_sparse + l*w
        w = w - a*J
    print(loss)
    return w

t0_s = time.time()
w = log_reg_sparse(X,y,0.8,0.5,1000)
t1_s = time.time()
print(w)
print('Running time: ', str(t1_s-t0_s), ' seconds')

[20331.39310018            inf            inf ...     0.
     0.             0.        ]
[-4.14524370e+02 -1.24657353e+03  7.89022059e+02 -1.98955882e+02
 -1.66905163e+02 -3.69365732e+03  7.17507353e+02  5.53382353e+01
  1.12454121e+03 -1.49483088e+03  4.55882353e-01  2.55183569e+03
 -3.45857683e+03 -1.00146324e+03 -8.37573529e+02 -6.42573529e+02
 -2.48823529e+01  2.85844148e+03 -1.32500000e+02  1.16938918e+02
  6.45234543e+02  4.99502746e+03 -1.50000000e+02 -2.96766964e+02
  3.10070624e+02 -8.43170048e+03  5.28067718e+02 -1.18257353e+03
 -1.34904412e+02 -1.08365441e+03 -1.53966296e+03 -3.48308824e+02
 -6.27573529e+02 -6.97000000e+02  2.46786765e+02 -1.26904412e+02
 -1.45058824e+02 -1.19757353e+03 -1.68000000e+02 -7.37199265e+03
  1.92198529e+02 -8.32573529e+02 -1.24532353e+03 -3.62352941e+02
  7.98843330e+02 -8.30073529e+02 -2.85000000e+02 -8.01617647e+01
  1.47451664e+02 -6.33823529e+02  7.68195624e+02 -5.08823529e+02
 -1.23455882e+02  1.79909364e+03  5.87867647e+02 -1.50301471e+02
 

**Exercise 6**