# Multinomial Logistic Regression

## Arguments

1) train input: path to the training input .tsv file 

2) validation input: path to the validation input .tsv file 

3) test input: path to the test input .tsv file 

4) train out: path to the output .labels file to which the predictions on the training data should be written 

5) test out: path to the output .labels file to which the predictions on the test data should be written

6) metrics out: path of the output .txt file to which metrics such as train and test error should be written

7) num epoch: integer specifying the number of times SGD loops through all of the training data
(e.g., if num epoch equals 5, then each training example will be used in SGD 5 times).
    
8) feature flag: integer taking value 1 or 2 that specifies which feature set to construct: if feature_flag==1, use the Model 1 features; if feature_flag==2, use the Model 2 features. Model 1 one-hot encodes only the current word (plus a bias term); Model 2 one-hot encodes the previous, current, and next words (plus a bias term).

In [None]:
import sys
import numpy as np

In [None]:
# Command-line arguments, read once at import time and used as module-level
# globals by main().  Fewer than eight arguments raises IndexError here.

# Input .tsv paths; the files themselves are loaded later, inside main().
f_train= sys.argv[1]


f_validation = sys.argv[2]
f_test = sys.argv[3]

# Output paths: predicted labels for the train/test sets, and the metrics report.
trainlabels = sys.argv[4]
testlabels = sys.argv[5]
metrics = sys.argv[6]

# Number of passes main() makes over the training data.
Epoch = int(sys.argv[7])

# Feature-set selector.  Kept as a string: the rest of the file compares it
# against '1' and '2'.
t_M = sys.argv[8]

In [None]:
class data:
    """Token/label stream read from a tab-separated file.

    Every non-blank line is split on tabs; its first field is taken as the
    token and its second as the label.  A line that is exactly ``'\\n'``
    separates sentences.  Each sentence is wrapped in a ``('B', 'O')`` begin
    marker and an ``('E', 'O')`` end marker.

    Attributes set by the constructor:
        lines: the raw lines of the file, newline characters included.
        x: tokens, including the 'B' / 'E' markers.
        y: labels, aligned with ``x`` ('O' for the markers).
    """

    def __init__(self, file_name):
        # Keep the untouched lines; main() later uses them to reproduce the
        # blank-line layout in the prediction files.
        with open(file_name, 'r') as handle:
            self.lines = list(handle)

        rows = [['B', 'O']]
        for raw in self.lines:
            if raw == '\n':
                # Sentence boundary: close the current sentence, open the next.
                rows.extend((['E', 'O'], ['B', 'O']))
            else:
                rows.append(raw.replace('\n', '').split('\t'))
        rows.append(['E', 'O'])

        self.x = [fields[0] for fields in rows]
        self.y = [fields[1] for fields in rows]
            

In [None]:
class matrix:
    """One-hot design matrix and integer label vector for a token stream.

    Args:
        data: object with parallel sequences ``x`` (tokens) and ``y``
            (labels).  Entries of ``x`` equal to 'B' or 'E' are treated as
            sentence markers: they produce no row of their own but serve as
            the neighbours of the first / last real token in Model 2.
        t_M: feature-set selector, '1' or '2' (an int 1 / 2 is accepted too).
            '1' encodes only the current token; '2' encodes the previous,
            current and next tokens in three consecutive one-hot groups.
        attrib: vocabulary; the position of a token in it is its column.
        labels: label set; the position of a label in it is its class index.

    Attributes:
        x: int array of shape (n_tokens, len(attrib) * n_groups + 1).  The
            last column is a constant-1 bias term.
        y: int array of shape (n_tokens,) with class indices.

    Raises:
        ValueError: if ``t_M`` is not 1 or 2, or a label is not in ``labels``.

    A token that is not in ``attrib`` leaves its one-hot group all zero, so
    every real token still yields exactly one row and ``x`` stays aligned
    with ``y``.
    """

    def __init__(self, data, t_M, attrib, labels):
        mode = str(t_M)
        if mode == '1':
            offsets = (0,)
        elif mode == '2':
            offsets = (-1, 0, 1)
        else:
            raise ValueError(
                "feature flag must be 1 or 2, got {!r}".format(t_M))

        # Dict lookups instead of scanning the whole vocabulary per token.
        column_of = {token: j for j, token in enumerate(attrib)}
        class_of = {label: j for j, label in enumerate(labels)}
        width = len(attrib)

        data_x = []
        data_y = []
        for i, token in enumerate(data.x):
            if token == 'B' or token == 'E':
                continue
            # Each context position is looked up independently, so repeated
            # neighbouring tokens cannot shadow one another.  Unknown tokens
            # map to None.
            data_x.append([column_of.get(data.x[i + off]) for off in offsets])

            label = data.y[i]
            if label not in class_of:
                raise ValueError(
                    "label {!r} at position {} is not in the label set".format(
                        label, i))
            data_y.append(class_of[label])

        self.x = np.zeros((len(data_x), width * len(offsets) + 1), dtype=int)
        for r, columns in enumerate(data_x):
            for group, col in enumerate(columns):
                if col is not None:
                    self.x[r, group * width + col] = 1
            self.x[r, -1] = 1  # bias term
        self.y = np.array(data_y, dtype=int)

In [None]:
def Ji(x, y, theta):
    """Return the average negative log-likelihood of a softmax model.

    Args:
        x: array of shape (n, m), one feature row per example.
        y: sequence of at least n integer class indices; only the first n
            are used.
        theta: array of shape (k, m), one weight row per class.

    Returns:
        ``-(1/n) * sum_i log softmax(x_i . theta^T)[y_i]`` as a float.

    Raises:
        ZeroDivisionError: if ``x`` has no rows.
    """
    n = x.shape[0]
    scale = 1. / n

    logits = np.dot(x, theta.T)
    # Log-sum-exp with the row maximum factored out, so exp() never
    # overflows and the result cannot degrade to inf - inf = nan.
    shift = logits.max(axis=1, keepdims=True)
    log_norm = shift[:, 0] + np.log(np.exp(logits - shift).sum(axis=1))

    # Pick the logit of the true class of every row directly rather than
    # multiplying by an explicit one-hot matrix.
    targets = np.asarray(y[:n], dtype=int)
    true_logit = logits[np.arange(n), targets]

    return -scale * (true_logit - log_norm).sum()

In [None]:
def GD(x, y, step, theta):
    """Run one pass of stochastic gradient descent over the examples.

    Examples are visited in order, and the weights are updated after each
    one using the gradient of that example's negative log-likelihood.

    Args:
        x: array of shape (n, m), one feature row per example.
        y: sequence of n integer class indices.
        step: learning rate.
        theta: array of shape (k, m); it is not modified.

    Returns:
        A new float array of shape (k, m) with the updated weights.
    """
    # Work on a private float copy so the caller's array is left untouched
    # and the update can be applied in place.
    theta = np.array(theta, dtype=float)

    for i in range(x.shape[0]):
        logits = np.dot(x[i], theta.T)
        # Softmax with the maximum subtracted first: same probabilities,
        # but exp() cannot overflow.
        probs = np.exp(logits - logits.max())
        probs /= probs.sum()

        # Gradient of the per-example loss is outer(probs - onehot(y_i), x_i).
        probs[y[i]] -= 1.0
        theta -= step * np.outer(probs, x[i])

    return theta


In [None]:
def pred(x, theta):
    """Return, for each row of ``x``, the index of the highest-scoring class.

    The score of class ``c`` for a row is the dot product of that row with
    ``theta[c]``.
    """
    scores = np.dot(x, theta.T)
    return scores.argmax(axis=1)

In [None]:
def _error_rate(predicted, actual, n_rows):
    """Return the fraction of the n_rows examples that were predicted wrongly."""
    return 1 - float(np.sum(predicted == actual)) / n_rows


def _render_labels(lines, predictions, labels):
    """Return one predicted label per non-blank input line, keeping blank lines."""
    rendered = []
    counter = 0
    for line in lines:
        if line != '\n':
            rendered.append(labels[predictions[counter]] + '\n')
            counter += 1
        else:
            rendered.append('\n')
    return ''.join(rendered)


def main():
    """Train the softmax tagger and write the predictions and metrics.

    Reads the module-level settings parsed from the command line, runs
    ``Epoch`` SGD passes over the training set with a fixed step of 0.5,
    and writes three files: the metrics report, the training-set
    predictions and the test-set predictions.

    Raises:
        ValueError: if the feature flag is neither '1' nor '2'.
    """
    # Reject an unsupported flag before touching any file.
    if t_M not in ('1', '2'):
        raise ValueError("feature flag must be 1 or 2, got {!r}".format(t_M))

    train_d = data(f_train)
    validation_d = data(f_validation)
    test_d = data(f_test)

    # Vocabulary and label set come from the training data only.
    attrib = np.unique(train_d.x)
    labels = np.sort(np.unique(train_d.y))

    v_tr = matrix(train_d, t_M, attrib, labels)
    v_v = matrix(validation_d, t_M, attrib, labels)
    v_te = matrix(test_d, t_M, attrib, labels)

    step = 0.5
    # Take the feature width from the design matrix itself so it cannot
    # drift from what matrix actually builds.
    theta = np.zeros((len(labels), v_tr.x.shape[1]))

    report = []
    for epoch in range(1, Epoch + 1):
        theta = GD(v_tr.x, v_tr.y, step, theta)

        # The values reported under "likelihood" are whatever Ji returns.
        J_train = Ji(v_tr.x, v_tr.y, theta)
        J_validation = Ji(v_v.x, v_v.y, theta)
        report.append(
            'epoch={} likelihood(train): {:.6f}\n'.format(epoch, J_train))
        report.append(
            'epoch={} likelihood(validation): {:.6f}\n'.format(epoch, J_validation))

    p_train = pred(v_tr.x, theta)
    report.append('error(train): {:.6f}\n'.format(
        _error_rate(p_train, v_tr.y, len(v_tr.x))))

    p_test = pred(v_te.x, theta)
    report.append('error(test): {:.6f}\n'.format(
        _error_rate(p_test, v_te.y, len(v_te.x))))

    with open(metrics, 'w') as f:
        f.write(''.join(report))
    with open(trainlabels, 'w') as f:
        f.write(_render_labels(train_d.lines, p_train, labels))
    with open(testlabels, 'w') as f:
        f.write(_render_labels(test_d.lines, p_test, labels))

In [None]:
# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()