Before we start, we need to import some necessary packages.

In [155]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Q1 Multinomial Classification

## (a)

The following code serves data import and one hot encoder for our label variable.

In [106]:
# Import data
data1 = pd.read_csv('datasethw2/data1.txt', sep = '\t', header= None)
data2 = pd.read_csv('datasethw2/data2.txt', sep = '\t', header = None)
data3 = pd.read_csv('datasethw2/data3.txt', sep = '\t', header = None)
data = pd.concat([data1, data2, data3])

# Shuffle our data
np.random.seed(0)
data = data.sample(frac = 1).reset_index(drop = True)

# One hot encoder
onehot_encoder = OneHotEncoder(sparse=False)
all_x = data.iloc[:,:-1]
all_y = onehot_encoder.fit_transform(data[5].values.reshape(-1,1))

# Dimension 
n_x = all_x.shape[1] # Features Number
n_y = all_y.shape[1] # Class Number

Then, we set up our TensorFlow Multinomial Classification model. Here, we use the cross-entropy as our cost function.

In [109]:
Y = tf.placeholder(tf.float32, shape = [None, n_y])
X = tf.placeholder(tf.float32, shape = [None, n_x])

W = tf.Variable(tf.random_uniform([n_x,n_y]))
b = tf.Variable(tf.random_uniform([n_y]))

# Soft-Max Prediction Outcome
prediction = tf.nn.softmax(tf.matmul(X, W) + b)

# Cross-entropy Cost Function
cost = tf.reduce_mean(\
                      -tf.reduce_sum(Y * tf.log(prediction), axis = 1))

# Accuracy
accuracy = \
tf.reduce_mean(\
               tf.cast(\
                       tf.equal(tf.argmax(Y,1),tf.argmax(prediction,1)),\
                       tf.float32))

optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

In [128]:
accuracy_record = np.zeros(10)
W_record = np.zeros([n_x,n_y,10])
b_record = np.zeros([n_y,10])

In [129]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for step in range(10):
    train_x, test_x, train_y, test_y = train_test_split(all_x, all_y, 
                                                        test_size = 0.2)
    for epoch in range(10000):
        sess.run([optimizer], feed_dict= {X: train_x, 
                                          Y: train_y})
    W_hat, b_hat = sess.run([W,b])
    W_record[:,:,step] = W_hat
    b_record[:,step] = b_hat
    accuracy_record[step] = sess.run(accuracy, 
                                     feed_dict = {X:test_x, Y: test_y}) 
    

In [145]:
print('Accuracy','\n',accuracy_record)

Accuracy 
 [ 0.75        0.78333336  0.81666666  0.85000002  0.80000001  0.85000002
  0.78333336  0.83333331  0.85000002  0.73333335]


## (b)

In [144]:
print('Best W:','\n',
      W_record[:,:,np.argmax(accuracy_record)],'\n',
     'Best b', '\n',
      b_record[:,np.argmax(accuracy_record)])


Best W: 
 [[ 0.258131    0.68533254  0.16778211]
 [ 0.15310754  0.45301342  0.36581433]
 [-0.83655965  1.44814765 -0.18005736]
 [-0.75564283  1.22818983  0.68093216]
 [-0.01445479  0.51505649  0.46779269]] 
 Best b 
 [ 5.62745571 -5.59471416  2.40941286]


# Q2 Multinomial Classification & Feature Reduction

We will use the 5-fold cross validation to find the best 4 features. First, we use our training data, and randomly divide them into 5 folds. At fold i, we use other 4 folds data as our training data, and test our model in fold i data. The feature selection metric is the average accuracy of 5-fold cross validation.

## (a)

In [311]:
np.random.seed(0)
train_x, test_x, train_y, test_y = \
train_test_split(all_x, all_y, test_size = 0.2)

In [313]:
Y = tf.placeholder(tf.float32, shape = [None, n_y])
X = tf.placeholder(tf.float32, shape = [None, n_x-1])

W = tf.Variable(tf.random_uniform([n_x-1,n_y]))
b = tf.Variable(tf.random_uniform([n_y]))

# Soft-Max Prediction Outcome
prediction = tf.nn.softmax(tf.matmul(X, W) + b)

# Cross-entropy Cost Function
cost = tf.reduce_mean(
    -tf.reduce_sum(Y * tf.log(prediction), axis = 1))

# Accuracy
accuracy = \
tf.reduce_mean(tf.cast(tf.equal(tf.argmax(Y,1),\
                     tf.argmax(prediction,1)),\
                       tf.float32))

optimizer = \
tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

In [315]:
k = 0
accuracy_record = np.zeros([5,5])
kf = KFold(n_splits = 5, shuffle = False,
               random_state = 1)
for train_index, test_index in kf.split(train_x):
    cv_train_x, cv_test_x = \
    train_x.iloc[train_index,:],train_x.iloc[test_index,:]
    cv_train_y, cv_test_y = \
    train_y[train_index], train_y[test_index]
    for i in range(5):
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        for epoch in range(10000):
            sess.run([optimizer], 
                     feed_dict= \
                     {X: cv_train_x.drop(cv_train_x.columns[[i]], axis=1, inplace=False),\
                      Y: cv_train_y})
        accuracy_record[k,i] = \
        sess.run(accuracy,feed_dict ={X:cv_test_x.drop(cv_test_x.columns[[i]],\
                                                       axis=1, inplace=False),\
                                      Y: cv_test_y})
        # accuracy_record[k,i] means accuracy of model which excludes i-th column at kth fold data
    k = k+1
    
        
        

In [324]:
accuracy_record
np.mean(accuracy_record, axis = 0)


array([ 0.75833333,  0.77916669,  0.75      ,  0.73333334,  0.77500001])

The best four features model includes variable 1, varialbe 3, variable 4, variable 5 since this model has the highest accuracy in 5-fold cross validation.

In [307]:
accuracy_record = np.zeros(10)
W_record = np.zeros([n_x-1,n_y,10])
b_record = np.zeros([n_y,10])

sess = tf.Session()
sess.run(tf.global_variables_initializer())
for step in range(10):
    train_x, test_x, train_y, test_y = train_test_split(all_x, all_y, test_size = 0.2)
    for epoch in range(10000):
        sess.run([optimizer],
                 feed_dict= \
                 {X: train_x.drop(train_x.columns[[1]], \
                                  axis = 1,inplace = False),\
                  Y: train_y})
    W_hat, b_hat = sess.run([W,b])
    W_record[:,:,step] = W_hat
    b_record[:,step] = b_hat
    accuracy_record[step] = sess.run(accuracy,\
                                     feed_dict = \
                                     {X:test_x.drop(test_x.columns[[4]],\
                                                    axis = 1,inplace = False),\
                                      Y: test_y}) 

In [308]:
print('Accuracy','\n',accuracy_record)

Accuracy 
 [ 0.43333334  0.5         0.48333332  0.55000001  0.44999999  0.56666666
  0.53333336  0.38333333  0.5         0.43333334]


## (b)

In [206]:
print('Best W:','\n',W_record[:,:,np.argmax(accuracy_record)],'\n',
     'Best b', '\n',b_record[:,np.argmax(accuracy_record)])

Best W: 
 [[ 0.53749949  0.97714525  0.51486504]
 [-0.53402317  1.85327792  0.37368262]
 [-0.13448872  1.42349064  1.08562398]
 [ 0.37720126  0.75136161  0.70024669]] 
 Best b 
 [ 4.51101494 -4.61911535  1.93547416]


## (C)

We checked the average accuracy of Q2 is 0.8083 while the average accuracy of Q1 is 0.8050. The best four features model includes variable 1, varialbe 3, variable 4, variable 5 since this model has the highest accuracy in 5-fold cross validation.

# Q3 SVM

Here we use GridSeach Cross Validation to do tuning our SVM models. The hyperparameters we want to tune is following: Kernel type (Linear and Radial), Cost C, and Gamma for Radial Kernel Basis. 

In [325]:
from __future__ import print_function
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [326]:
train_x, test_x, train_y, test_y = \
train_test_split(all_x,data[5], test_size = 0.5)


In [331]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1,1e-2,1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1e-1,1, 10, 100, 1000]}]

scores = ['accuracy']


In [332]:
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring= score)
    clf.fit(train_x, train_y)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = test_y, clf.predict(test_x)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}

Grid scores on development set:

0.720 (+/-0.179) for {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
0.793 (+/-0.152) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.473 (+/-0.093) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.373 (+/-0.039) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.713 (+/-0.125) for {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
0.813 (+/-0.066) for {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.780 (+/-0.169) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.473 (+/-0.093) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.713 (+/-0.125) for {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
0.760 (+/-0.122) for {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.807 (+/-0.171) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.760 (+/-0.180) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.713 (+/-0.125) for {'C': 1000, 'gamma': 1, 'kernel': 'rbf'}
0.740 (+/-0.092) for {'C': 1000,