In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score


In [2]:
# Labelled corpus used for both fitting and hold-out evaluation.
# NOTE(review): read_csv treats the first line of the file as the header here,
# while testing.csv is read with header=None further down; later cells drop
# columns named '1' and '14', which look like values of a first data row rather
# than real column names -- confirm whether training.csv really has a header.
TRAINING_CSV = "training.csv"
original_dataset = pd.read_csv(TRAINING_CSV)

In [3]:
# Split a private copy so the frame loaded from disk is never modified.
dup_dataset = original_dataset.copy()

# 80/20 split with a fixed seed, so the same rows land in each part on every run.
training_data, testing_data = train_test_split(
    dup_dataset,
    test_size=.2,
    random_state=42,
)

# Distinct values found in the last column of the training part
# (the cells that follow use that column as the class label).
possible_classes = training_data.iloc[:, -1].unique()
# Column count minus two, i.e. what is left once two non-feature columns
# are removed from the frame.
no_words = training_data.shape[1] - 2


In [4]:
no_classes = len(possible_classes)  # number of distinct labels in the training split

d_number = training_data.shape[0]  # number of training rows

# Class label of every training row (last column of the frame).
# NOTE: this must run before the label column is dropped below; re-running this
# cell without re-running the split cell fails because the columns are gone.
required_list = list(training_data[training_data.columns[-1]])

# One-hot target matrix of shape (no_classes, d_number):
# value_d[label - 1, row] = 1, because the labels are used as 1-based indices.
# A label larger than no_classes raises IndexError here.
value_d = np.zeros((no_classes, d_number))
value_d[np.asarray(required_list) - 1, np.arange(d_number)] = 1
value_d = scipy.sparse.csr_matrix(value_d)  # sparse matrix

# Drop the two non-feature columns. `columns=` is used because passing the axis
# positionally (drop('1', 1)) is rejected by pandas >= 2.0.
training_data = training_data.drop(columns=['1', '14'])

# Sparse copy of the remaining feature columns.
actual_req_train = scipy.sparse.csr_matrix(training_data.to_numpy())

# Scale every feature column by its total over all training rows; columns that
# sum to zero keep a divisor of 1 so they stay zero instead of producing NaN.
# NOTE(review): the hold-out and test features are scored without this scaling
# in the prediction cells -- confirm that is intended.
add_c = np.array(actual_req_train.sum(axis=0))[0, :]
add_c[add_c == 0] = 1
actual_req_train = actual_req_train / add_c

nor_actual_req_train = actual_req_train

# All-zero starting weights, one row per class and one column per feature.
# Built from the shape directly so no dense zero array is allocated first.
weights = scipy.sparse.csr_matrix((no_classes, no_words), dtype=np.float64)

In [5]:
# Candidate grids for the hyper-parameter search:
# learning rates (list_lr) and penalty terms (list_p).
list_lr = [1e-4, 1e-3, 2.5e-3, 5e-3, 7.5e-3, 1e-2, 1e-1]
list_p = [1e-4, 1e-3, 2.5e-3, 5e-3, 7.5e-3, 1e-2, 1e-1]


# Settings for the single fixed-parameter run.
updates = 100  # number of weight updates performed by the training loop
lr = 1e-2      # learning rate (eta)
pr = 5e-3      # penalty term that multiplies the weights in the gradient


In [6]:
# Full-batch gradient updates: every pass re-scores all training rows with the
# current weights and then moves the weights along the penalised gradient.
features_transposed = actual_req_train.transpose()  # does not change between passes
for step in range(updates):
    # Per-class score of every training row, shape (classes, rows).
    # NOTE(review): this applies expm1 (exp(x) - 1), not exp, so the values are
    # not a true softmax numerator and can be negative -- confirm this is intended.
    Z_term = np.expm1(weights.dot(features_transposed))
    # Divide each column by its own sum; columns summing to zero keep divisor 1.
    add_c = np.array(Z_term.sum(axis=0))[0, :]
    add_c[add_c == 0] = 1
    Z_term = Z_term / add_c
    # Gradient w.r.t. the weights: (targets - scores) * X, minus the penalty term.
    residual = value_d - Z_term
    gradient = (residual * actual_req_train) - (pr * weights)
    # Learning rule: step of size lr along the gradient.
    weights = weights + (lr * gradient)

In [7]:
train_weights = weights
# Evaluate the fitted weights on the 20% hold-out split produced by train_test_split.

# True labels of the hold-out rows (last column); the copy keeps them available
# after the non-feature columns are dropped from testing_data below.
test_classes = list(testing_data[testing_data.columns[-1]])
testing_data_copy = testing_data.copy()

# `columns=` instead of a positional axis: drop('1', 1) is rejected by pandas >= 2.0.
testing_data = testing_data.drop(columns=['1', '14'])

# Per-class score of every hold-out row, shape (classes, rows).
values_guessed = np.expm1(
    train_weights.dot(testing_data.to_numpy(dtype=np.float64).T)
)
# Depending on the container type of the weights, the product can come back as a
# sparse matrix, an np.matrix or a plain ndarray; normalise to an ndarray so the
# argmax below behaves the same in every case.
if scipy.sparse.issparse(values_guessed):
    values_guessed = values_guessed.toarray()
class_scores = np.asarray(values_guessed)

# Highest-scoring row index per example, shifted by one because row k of the
# weights corresponds to label k + 1.
predictions_array = class_scores.argmax(axis=0) + 1
values_pred = predictions_array.tolist()
testing_data_results = np.array(testing_data_copy.iloc[:, -1])

# Results are stored under their own names: rebinding `accuracy_score` /
# `confusion_matrix` would shadow the imported sklearn functions and make this
# cell fail when it is run a second time.
validation_accuracy = accuracy_score(testing_data_results, predictions_array)
print(validation_accuracy)  # Accuracy on the hold-out split
# Confusion matrix for the fixed lr, pr values.
validation_confusion = confusion_matrix(testing_data_results, predictions_array)
print(validation_confusion)


0.525
[[ 7  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0]
 [ 1  1  0  0  0  3  0  0  0  0  0  1  0  0  0  2  0  2  0  0]
 [ 1  0  5  1  0  2  0  0  0  0  0  0  0  1  0  3  0  0  0  0]
 [ 0  0  0  9  0  1  0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 1  0  0  2  2  0  0  0  0  0  0  0  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  1  4  0  0  0  0  0  0  0  0  0  2  0  0  0  0]
 [ 1  1  0  0  0  0  1  0  0  1  0  0  2  0  0  4  1  0  1  0]
 [ 0  0  0  0  0  0  0 10  0  0  0  0  0  0  0  2  1  0  0  0]
 [ 0  0  0  0  0  0  0  1  6  0  0  0  0  0  0  3  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  8  0  0  1  0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  8  0  0  0  0  0  0  2  1  0]
 [ 1  0  0  0  0  0  0  0  0  0  0  7  0  0  0  3  0  0  1  0]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  5  0  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  4  0  6  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  1  0  1  4  0]
 [ 2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  9

In [9]:
# Read the test file; header=None means its first line is treated as data and
# the columns are labelled 0, 1, 2, ...
TESTING_CSV = "testing.csv"
actual_testing_data = pd.read_csv(TESTING_CSV, header=None)

In [11]:
testing_copy = actual_testing_data.copy()
# Remove the column labelled 0 (presumably a row id -- confirm) so only feature
# columns remain. `columns=` is required: drop(0, 1) is rejected by pandas >= 2.0.
testing_copy = testing_copy.drop(columns=[0])

# Per-class score of every test row, shape (classes, rows).
predicted = np.expm1(
    train_weights.dot(testing_copy.to_numpy(dtype=np.float64).T)
)
# The product may be sparse, an np.matrix or an ndarray depending on the type of
# the weights; convert to a plain ndarray so argmax yields a flat vector.
if scipy.sparse.issparse(predicted):
    predicted = predicted.toarray()

# Best-scoring row per example, +1 because row k of the weights is label k + 1.
final_predictions = (np.asarray(predicted).argmax(axis=0) + 1).tolist()

# Saving the predictions to the csv file
result_frame = pd.DataFrame(final_predictions)
result_frame.to_csv("predicted_result_logistic_mani.csv")


In [None]:
def _one_hot_labels(labels, n_classes):
    """Return an (n_classes, len(labels)) 0/1 array with a 1 at [label - 1, row]."""
    labels = np.asarray(labels)
    indicator = np.zeros((n_classes, labels.shape[0]))
    # Labels are 1-indexed, rows of the indicator are 0-indexed.
    indicator[labels - 1, np.arange(labels.shape[0])] = 1
    return indicator


def _train_weights(features, indicator, learning_rate, penalty, n_updates):
    """Run `n_updates` full-batch updates from all-zero weights.

    features  -- sparse (rows, n_features) matrix of scaled training features
    indicator -- dense (n_classes, rows) one-hot target array
    Returns a dense (n_classes, n_features) weight array.
    """
    class_weights = np.zeros((indicator.shape[0], features.shape[1]), dtype=np.float64)
    features_t = features.transpose().tocsr()
    for _step in range(n_updates):
        # Per-class scores, shape (n_classes, rows); same expm1 form as the
        # fixed-parameter training cell.
        z_term = np.expm1(features.dot(class_weights.T).T)
        # Divide each column by its sum; all-zero columns keep divisor 1.
        totals = z_term.sum(axis=0)
        totals[totals == 0] = 1
        z_term /= totals
        # (targets - scores) * X, minus the penalty on the weights.
        gradient = features_t.dot((indicator - z_term).T).T - penalty * class_weights
        class_weights = class_weights + learning_rate * gradient
    return class_weights


def _predict_classes(class_weights, features):
    """Return the 1-based label with the highest score for every row of `features`."""
    # expm1 is strictly increasing, so the argmax of the raw scores equals the
    # argmax of expm1(scores) without the risk of overflowing to inf.
    scores = np.asarray(features.dot(class_weights.T))  # (rows, n_classes)
    return scores.argmax(axis=1) + 1


tuning_values = []

# Everything below is identical for every (lr, pr) pair, so it is computed once
# instead of once per grid point. Separate names are used so the grid search
# does not overwrite lr, pr, weights or train_weights from the earlier cells.
tuning_train, tuning_test = train_test_split(
    original_dataset.copy(), test_size=.2, random_state=42
)

label_column = tuning_train.columns[-1]
test_classes = list(tuning_test[label_column])
n_tuning_classes = len(tuning_train[label_column].unique())
tuning_indicator = _one_hot_labels(list(tuning_train[label_column]), n_tuning_classes)

# `columns=` instead of a positional axis: drop('1', 1) is rejected by pandas >= 2.0.
tuning_train_features = scipy.sparse.csr_matrix(
    tuning_train.drop(columns=['1', '14']).to_numpy(dtype=np.float64)
)
# Scale each feature column by its total over the training rows (zero totals -> 1).
column_totals = np.array(tuning_train_features.sum(axis=0)).ravel()
column_totals[column_totals == 0] = 1
tuning_train_features = scipy.sparse.csr_matrix(
    tuning_train_features.multiply(1.0 / column_totals)
)

# Hold-out features are scored unscaled, as in the fixed-parameter evaluation cell.
tuning_test_features = scipy.sparse.csr_matrix(
    tuning_test.drop(columns=['1', '14']).to_numpy(dtype=np.float64)
)

for candidate_lr in list_lr:
    for candidate_pr in list_p:
        print((n_tuning_classes, tuning_train_features.shape[1]), tuning_train_features.shape)

        candidate_weights = _train_weights(
            tuning_train_features, tuning_indicator, candidate_lr, candidate_pr, updates
        )

        # Score this pair's own weights on the hold-out rows. The earlier draft
        # read `predicted`, which belongs to the external test file.
        values_pred = _predict_classes(candidate_weights, tuning_test_features)

        # Fraction of hold-out rows whose predicted class equals the true class.
        matches = int(np.count_nonzero(np.asarray(test_classes) == values_pred))
        holdout_accuracy = float(matches) / len(test_classes)

        # Kept as a one-element list so each tuning_values entry has the shape
        # (lr, pr, [accuracy]).
        optimal_acc = [holdout_accuracy]
        print(candidate_lr, candidate_pr, max(optimal_acc))
        tuning_values.append((candidate_lr, candidate_pr, optimal_acc))