In [70]:
# basic imports
import csv
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

# scikit learn
from sklearn.preprocessing import scale
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.cross_validation import KFold, cross_val_score, train_test_split

import scipy as sp
from scipy.optimize import minimize

from time import time

import warnings
warnings.filterwarnings('ignore')

In [71]:
from data_processing import *
X, Y, feature_names, receptor_names, protein_names = read_data()
X, feature_names = remove_constant_features(X, feature_names, eps=1e-2)

Protein names equality check: True


In [72]:
piv = np.load('features_local_cond.npy')
piv_ind = np.array([f in piv for f in feature_names])

In [73]:
X_k, Y_k = select_tasks(X, Y, receptor_ind=[0, 1, 2])

In [74]:
X_k = scale(X_k)

In [75]:
X_train, X_test, Y_train, Y_test = train_test_split(X_k, Y_k, test_size=.5, random_state=100)

#### Direct matrix evaluation

Let's consider

$$\min_{W} \sum_{i=1}^n \| W^T x_i - y_i \|_2 + \gamma \sum_{j=1}^d \| w^i \|_2 , $$

where $w^i$ is the $i$-th row of $W$.
Note that here the residual $\| W^T x_i - y_i \|_2$ is not squared for further convenience.

$$\min_W \| X W - Y \|_{2, 1} + \gamma \| W \|_{2, 1}$$

We have obtained an **unconstrained optimization** problem.

Let's reformulate it as a **constrained optimization**:

$$\begin{aligned}
& \min_{W, E} & & \| E \|_{2,1} + \| W \|_{2,1}\\
& \text{S.t.} & & X W + \gamma E = Y
\end{aligned}$$

Or, taking $A = (X, \gamma I)$ and $U = (W, E)^T$, we can write

$$\begin{aligned}
& \min_{U} & & \| U \|_{2,1}\\
& \text{S.t.} & & A U = Y
\end{aligned}$$

Lagrangian 

$$L(U) = \| U \|_{2,1} - Tr(\Lambda^T (AU - Y))$$

$$\dfrac{\partial L(U)}{\partial U} = 2DU - A^T\Lambda = 0,$$

where $D = diag(d_i)$, $d_i = \dfrac{1}{2 \| u^i \|_2}$. Now multiplying by $A D^{-1}$ and using $A U = Y$:

$$2 A U - A D^{-1} A^T \Lambda = 0 \Longrightarrow 2 Y - A D^{-1} A^T \Lambda = 0 \Longrightarrow
\Lambda = 2 (A D^{-1} A^T)^{-1} Y$$

$$U = D^{-1} A^T (A D^{-1} A^T)^{-1} Y$$

In [8]:
ind = np.all(Y != 999, axis=1)
X = X[ind]
Y = Y[ind]

X = scale(X, axis = 0)

In [76]:
n = X_train.shape[0]
d = X_train.shape[1]
m = n + d
gamma = 0.5
A = np.concatenate((X_train, gamma*np.identity(n)), axis=1)

In [77]:
start = time()
t = 0
maxiter = 20
epsilon = 1e-2

diaginv = np.ones(m, dtype='double')
olddiag = np.inf*diaginv

while t < maxiter and np.linalg.norm(diaginv - olddiag) > epsilon:
    Dinv = np.diag(diaginv)
    U = Dinv.dot(A.T).dot(
            np.linalg.inv(A.dot(Dinv).dot(A.T))).dot(Y_train)
    olddiag = diaginv
    diaginv = 2*np.linalg.norm(U, axis=1)
    t += 1
    
time_direct = time() - start
    
W = U[:d, :]

In [60]:
Y_prd = X_test.dot(W)

In [78]:
print '%i iterations passed. Run took %.2f seconds' %(t, time_direct)

14 iterations passed. Run took 68.30 seconds


In [63]:
for i in range(Y_test.shape[1]):
    print 'AUC on test set for task %i is %.2f' %(i, roc_auc_score(Y_test[:, i], Y_prd[:, i]))

AUC on test set for task 0 is 0.74
AUC on test set for task 1 is 0.50
AUC on test set for task 2 is 0.94
AUC on test set for task 3 is 0.73
AUC on test set for task 4 is 0.96


#### Gradient method for $l_{2,1}$ regularization

$$F(W) =  \| X W - Y \|_{2, 1} + \gamma \| W \|_{2, 1}$$

$$\min_W F(W)$$

$$\dfrac{\partial F}{\partial W} = X^T D_1 (XW - Y) + \gamma D_2 W,$$

where $D_1 = diag(d_1)$, $d_1^i = \dfrac{1}{\| X^i W - Y^i \|_2}$, and $D_2 = diag(d_2)$, $d_2^i = \dfrac{1}{\| W^i \|_2}$

In [52]:
class MTLGradientMethods():
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y     
        
    def l21(self, A):
        return np.linalg.norm(np.linalg.norm(A, axis=1, ord=2), ord=1)

    def fun(self, W):
        W_mat = W.reshape(self.d, self.T)
        return self.l21(self.X.dot(W_mat) - self.Y) + self.gamma*self.l21(W_mat)

    def gradient(self, W):
        W_mat = W.reshape(self.d, self.T)
        A = self.X.dot(W_mat) - self.Y
        D1 = np.diag( 1.0 / np.linalg.norm(A, axis=1, ord=2) )
        D2 = np.diag( 1.0 / np.linalg.norm(W_mat, axis=1, ord=2) )
        return (self.X.T.dot(D1.dot(A)) + self.gamma*D2.dot(W_mat)).reshape(self.d*self.T)
    
    def solve(self, gamma = 2):
        self.gamma = gamma
        
        # remove missing values
        ind = np.all(self.Y != 999, axis=1)
        self.X = self.X[ind]
        self.Y = self.Y[ind]
        
        # remove features with small variance
        eps = 1e-2
        ind = np.var(self.X, axis = 0) > eps
        self.X = self.X[:, ind]
        
        self.n = self.X.shape[0]
        self.d = self.X.shape[1]
        self.T = self.Y.shape[1]
        
        # scale matrix X
        self.X = scale(self.X, axis = 0)
        
        # initial guess
        W0 = np.ones(self.d*self.T)/self.n
        
        sol = minimize(self.fun, W0, jac=self.gradient, options={'maxiter': 10})
        self.W = sol.x.reshape(self.d, self.T)
        
        return self.W
    
    def predict(self, X_new):
        return X_new.dot(self.W)
        

In [53]:
myGrad = MTLGradientMethods(X_train, Y_train)
W = myGrad.solve()

In [54]:
myGrad.W = np.ones((myGrad.d, myGrad.T))/(myGrad.n)
#myGrad.W = np.zeros((myGrad.d, myGrad.T))
#print myGrad.fun(np.ones((myGrad.d, myGrad.T))/(myGrad.n))
#print myGrad.fun(myGrad.W)

Y_pred = myGrad.predict(X_test)

for i in range(3):
    print 'AUC for task %i is %.3f' %(i, roc_auc_score(Y_test[:, i], Y_prd[:, i]))

AUC for task 0 is 0.723
AUC for task 1 is 0.824
AUC for task 2 is 0.826
