In [1]:
import pandas as pd
import numpy as np
import random

**Exercise 1**

In [2]:
def load_Xy(path):
    """Read a CSV file and split it into a feature matrix and a label vector.

    The file at ``path`` is parsed with ``pandas.read_csv`` and converted to a
    NumPy array. The last column is taken as the labels, every other column
    as a feature. Labels equal to 0 are rewritten to -1.

    Returns:
        (X, y): ``X`` holds all columns except the last, ``y`` the last column.
    """
    table = pd.read_csv(path).to_numpy()

    # Last column = labels, everything in front of it = features
    features = table[:, :-1]
    labels = table[:, -1]

    # Recode the 0 class as -1 (done in place on the label column)
    labels[labels == 0] = -1

    return features, labels


In [3]:
# Load the dataset from 'data.csv' (path relative to the working directory):
# X receives the feature columns, y the label column with 0 recoded to -1 by load_Xy
X,y = load_Xy('data.csv')

In [4]:
# Count how many malicious apps there are:
# build the boolean mask y == -1 and count its True entries
np.count_nonzero(y == -1)

14632

In [5]:
# Number of entries of X that differ from zero
non_zero_entries = np.count_nonzero(X)

# Total number of entries of X (rows times columns)
tot_entries = X.size

print("Non-zero entries: {}".format(non_zero_entries))
print("Total entries: {}".format(tot_entries))
print("Ratio: {}".format(non_zero_entries/tot_entries))

Non-zero entries: 277180
Total entries: 2522552
Ratio: 0.10988078739308446


Only about $11\%$ of the elements of the matrix $X$ are non-zero, so $X$ has a sparsity of about $89\%$.

In [6]:
# List the distinct values that occur in X
np.unique(X)

array([0, 1], dtype=int64)

**Exercise 2**

In [7]:
def train_test_random(X,y,r):
    """Randomly split (X, y) into a training and a test part.

    ``int(y.size*r)`` row indices are drawn without replacement with
    ``random.sample``; those rows form the training set and all remaining
    rows the test set. Within each part the rows keep their original order.

    Parameters:
        X: 2-D NumPy array with one row per sample.
        y: 1-D NumPy array with one label per row of X.
        r: fraction of rows that go into the training set, 0 < r < 1.

    Returns:
        (X_train, X_test, y_train, y_test) -- note the order. The X parts
        are NumPy arrays, the y parts are Python lists. If r is outside
        (0, 1) a message is printed and None is returned.
    """
    if not (r > 0 and r < 1):
        print("r must be between 0 and 1")
        return

    n_samples = y.size
    train_size = int(n_samples*r)

    # Mark the sampled rows in a boolean mask. This replaces a per-row
    # "i in list" test (quadratic in the number of rows) and also works
    # when one of the two parts ends up empty.
    chosen = np.array(random.sample(range(n_samples), train_size), dtype=int)
    in_train = np.zeros(n_samples, dtype=bool)
    in_train[chosen] = True

    X_train = X[in_train]
    X_test = X[~in_train]
    # Labels are returned as lists, as before
    y_train = list(y[in_train])
    y_test = list(y[~in_train])
    return X_train, X_test, y_train, y_test
# Try the split with 80% of the rows for training; train_test_random
# returns its results in the order (X_train, X_test, y_train, y_test)
X_train, X_test, y_train, y_test = train_test_random(X,y, 0.8)

In [8]:
def train_test_split(X,y,r):
    """Shuffle the rows of (X, y) and split them into train and test parts.

    The row indices 0..n-1 are shuffled with ``np.random.shuffle``; the first
    ``int(n*r)`` shuffled indices select the training rows and the rest the
    test rows.

    Returns:
        (X_train, y_train, X_test, y_test). If r is not strictly between
        0 and 1 a message is printed and None is returned.
    """
    # Reject any r outside the open interval (0, 1)
    if not 0 < r < 1:
        print("r must be between 0 and 1")
        return

    n_rows = X.shape[0]

    # Random ordering of all row indices
    order = list(range(n_rows))
    np.random.shuffle(order)

    # Everything before the cut is training data, everything after is test data
    cut = int(n_rows*r)
    train_idx, test_idx = order[:cut], order[cut:]

    return X[train_idx,:], y[train_idx], X[test_idx,:], y[test_idx]


In [9]:
# Two independent 80/20 splits of the same data; every call to
# train_test_split reshuffles the row indices, so the two splits generally differ
X_train, y_train, X_test, y_test = train_test_split(X,y,0.8)
X_train1, y_train1, X_test1, y_test1 = train_test_split(X,y,0.8)

**Exercise 3**

Let $X \in \mathbb{R}^{n \times m}$ be the data matrix with rows $x_1,\ldots, x_n$, let $y \in \{-1,1\}^n$ be the label vector, and let $w \in \mathbb{R}^m$ be a weight vector. If the product $y_i x_i w > 0$, the point $x_i$ is correctly classified, and if $y_i x_i w < 0$, it is misclassified. We also count points with $x_i w = 0$ as misclassified, since there is no way of confirming their (mis)classification.

We define a function $f$ that assigns the value $0$ to misclassified points and $1$ to correctly classified points. Mathematically, $f: \mathbb{R} \rightarrow \{0,1\}$ is defined as:

$f(s) := \frac{1}{2}(\text{sign}(s) + 1)$ for $s \neq 0$,

with $f(0) := 0$.

The function counting the correctly classified points, $F: \mathbb{R}^m \rightarrow \mathbb{Z}_{\geq 0}$, is then defined as:

$F(w) = \sum_{i = 1} ^ n f(y_ix_iw)$.

Since Python does not have a sign function, we write a function that fulfills this task. 

We generate a random weight vector $w \in \mathbb{R}^m$ with entries $w_j$ drawn uniformly from $[-10,10)$ for all $j = 1,\ldots,m$, so that each $w_j$ is equally likely to be positive or negative and the probability of $w_j = 0$ is approximately zero. As a safety measure, the function nevertheless stops if $w$ contains at least one $0$ element.

The function for the sum of correctly classified points is defined as:

In [12]:
def correct_classification(X,y,w):
    """Count the points that the weight vector w classifies correctly.

    A point i counts as correctly classified when its margin
    ``y[i] * (X[i] @ w)`` is strictly positive; a margin that is zero,
    negative or NaN counts as misclassified.

    Parameters:
        X: 2-D array with one sample per row.
        y: 1-D array of labels, one per row of X.
        w: 1-D weight vector with one entry per column of X.

    Returns:
        (I, C): ``I`` is the vector of margins ``y * (X @ w)`` and ``C`` the
        number of strictly positive margins as a Python int. If w contains
        a zero entry a message is printed and None is returned.
    """
    if not np.all(w):
        print("w contains at least one element 0")
        return
    I = y*np.dot(X, w) #margins y_i x_i w, computed for all points at once
    # Vectorised count of the strictly positive margins
    C = int(np.count_nonzero(I > 0))
    return I,C
# One weight per feature column of X
dim = X.shape[1]
# Random weights drawn uniformly from [-10, 10), so positive and negative values are equally likely
w = np.random.uniform(-10, 10, size=dim)
# Evaluate the random weights on the full data set (X, y)
I,C = correct_classification(X, y, w)

**Exercise 4**

The loss function for logistic regression (the logistic loss) is:

$f(s) = \log{(1+e^{-s})}$

This gives the following cost function for regularized logistic regression:

$J(\textbf{w}) = \sum_{i=1}^{n} \log{(1+e^{-y_i \textbf{x}_i^T \textbf{w}})} + \frac{\lambda}{2} \lVert \textbf{w} \rVert^2$

Therefore, the gradient of the cost function $J$ is:

$\nabla J (\textbf{w}) = \sum_{i=1}^{n} -\frac {e^{-y_i \textbf{x}_i^T \textbf{w}}} {1+e^{-y_i \textbf{x}_i^T \textbf{w}}} \, y_i \textbf{x}_i + \lambda \textbf{w}$

To calculate it, we first define $g(x) = \log {(1+e^{-x})}$ and $h_i (\textbf{w}) = y_i \textbf{x}_i^T \textbf{w}$, so that $J(\textbf{w}) = \sum_{i=1}^{n} g (h_i (\textbf{w})) + \frac{\lambda}{2} \lVert \textbf{w} \rVert^2$, and compute the derivative of each function:

$g \prime (x) = - \frac {e^{-x}}  {1+e^{-x}} = - \frac {1} {1+e^{x}}\quad$ and $\quad\nabla h_i (\textbf{w}) = y_i \textbf{x}_i$

Then, by the chain rule, the gradient of the cost function is:
$\nabla J (\textbf{w}) = \sum_{i=1}^{n} g \prime (y_i \textbf{x}_i^T \textbf{w}) \cdot \nabla h_i (\textbf{w}) + \lambda \textbf{w}$, which yields the formula above.

**Exercise 5**

In [None]:
def logistic_regression(X,y,a,l,K):
    """Fit L2-regularised logistic regression by batch gradient descent.

    Minimises
        J(w) = sum_i log(1 + exp(-y_i x_i^T w)) + (l/2) * ||w||^2
    starting from a random w with entries drawn uniformly from [-10, 10).
    Each iteration performs the update ``w <- w - a * grad J(w)`` with
        grad J(w) = -sum_i y_i x_i / (1 + exp(y_i x_i^T w)) + l * w.

    Parameters:
        X: 2-D array with one sample per row.
        y: 1-D array of labels in {-1, 1}, one per row of X.
        a: step size of the gradient update.
        l: regularisation weight (lambda in the cost function).
        K: number of gradient iterations (non-negative integer).

    NOTE(review): the previous body was unfinished (it iterated over K
    directly and never updated w). Reading ``a`` as the step size and ``K``
    as the iteration count is an assumption -- confirm against the exercise.

    Returns:
        The weight vector after K iterations.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    dim = X.shape[1]
    w = np.random.uniform(-10, 10, dim)
    for _ in range(K):
        margins = y*np.dot(X, w)
        # 1/(1+exp(m)) written as 0.5*(1 - tanh(m/2)) so that large |m|
        # (likely with the wide random start) cannot overflow in exp
        factors = 0.5*(1.0 - np.tanh(0.5*margins))
        gradient = -np.dot(X.T, y*factors) + l*w
        w = w - a*gradient
    return w

**Exercise 6**