# Assignment 2

(For each question, run sequentially)

For windows system, you may need to change the dataset path to "hwk2_datasets\\\\\[file_name\]"

## Question 1

Generate train, validation and test dataset based on mean vectors and covariance matrix given.

The dataset is shuffled and partitioned with 60% train, 20% validation and 20% test.

Also the data of classNegative is labeled to negative and classPositive is labeled to positive. (Adding -1s and 1s to the last column)

In [1]:
import numpy as np
import csv
import random
from sklearn import preprocessing

In [2]:
# Helper function to read CSV file and returning a np array containing the data
def csvReader(filepath):
    with open(filepath) as csvFile:
        dataSet = csv.reader(csvFile, delimiter=',')
        data = []
        for row in dataSet:
            inputTemp = []
            for i in range(len(row)):
                if(row[i] != ''):   # somehow the initial csv data has empty string, need this step to filter it
                    inputTemp.append(float(row[i]))
            data.append(inputTemp)
    data = np.array(data, dtype='float')
    return data


# Question 1 - Generate dataset
def generateDataset(covPath, m0Path, m1Path):
    cov = csvReader(covPath)
    mean0 = csvReader(m0Path)
    mean1 = csvReader(m1Path)
    c0 = np.random.multivariate_normal(mean0[0], cov, 2000)
    classNegative = -1 * np.ones((2000, 21), dtype='float')
    classNegative[:,:-1] = c0
    c1 = np.random.multivariate_normal(mean1[0], cov, 2000)
    classPositive = np.ones((2000, 21), dtype='float')
    classPositive[:,:-1] = c1

    random.shuffle(classNegative)
    random.shuffle(classPositive)
    testBound = int(len(classNegative) * 0.2)
    validBound = int(len(classNegative) * 0.2)

    testSet = []
    validSet = []
    trainSet = []
    for i in range(2000):
        if (i < testBound):
            testSet.append(classNegative[i])
            testSet.append(classPositive[i])
        elif (i >= testBound and i < testBound + validBound):
            validSet.append(classNegative[i])
            validSet.append(classPositive[i])
        else:
            trainSet.append(classNegative[i])
            trainSet.append(classPositive[i])

    random.shuffle(testSet)
    random.shuffle(trainSet)
    random.shuffle(validSet)

    with open('hwk2_datasets/DS1-test.csv', 'w') as csvfile:
        csvWriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for i in range(len(testSet)):
            csvWriter.writerow(testSet[i])

    with open('hwk2_datasets/DS1-valid.csv', 'w') as csvfile:
        csvWriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for i in range(len(validSet)):
            csvWriter.writerow(validSet[i])

    with open('hwk2_datasets/DS1-train.csv', 'w') as csvfile:
        csvWriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for i in range(len(trainSet)):
            csvWriter.writerow(trainSet[i])

The dataset generated are 'DS1-test.csv', 'DS1-valid.csv' and 'DS1-train.csv'.

## Question 2

First, calculate the parameters w0 and w1 based on equations on slides.

(Before calculating  w0 and w1, we need to calculate P1, P2, mean1, mean2 and covariance matrix.)

In [3]:
def gda(filePath):
    trainData = csvReader(filePath)
    sum1 = np.zeros(trainData[0].shape, dtype='float')
    sum2 = np.zeros(trainData[0].shape, dtype='float')
    count1 = 0;
    count2 = 0;
    for i in range(len(trainData)):
        if (trainData[i][len(trainData[0])-1] == -1):
            sum1 = np.add(sum1,trainData[i])
            count1 += 1
        elif (trainData[i][len(trainData[0])-1] == 1):
            sum2 = np.add(sum2, trainData[i])
            count2 += 1
    m1 = np.divide(sum1, count1)
    m2 = np.divide(sum2, count2)
    m1 = m1[:-1].reshape(1,-1) # remove labeled 1s and -1s at the last column
    m2 = m2[:-1].reshape(1, -1)
    P1 = count1/len(trainData)
    P2 = count2/len(trainData)
    sum = np.zeros((len(m1[0]), len(m2[0])))
    for i in range(len(trainData)):
        part1 = np.subtract(trainData[i][:-1], m1)
        part2 = np.subtract(trainData[i][:-1], m2)
        S1 = np.dot(np.transpose(part1), part1)
        S2 = np.dot(np.transpose(part2), part2)
        sum = np.add(np.add(S1, S2), sum)

    cov = np.divide(sum, count1+count2)
    cov_inv = np.linalg.inv(cov)
    term1 = 1/2 * np.dot(np.dot(m1, cov_inv), np.transpose(m1))
    term2 = 1/2 * np.dot(np.dot(m2, cov_inv), np.transpose(m2))
    w0 = np.subtract(term2, term1) + np.log(P1) - np.log(P2)
    w1 = np.dot(cov_inv, np.transpose(np.subtract(m1, m2)))
    w = [w0, w1]
    print('w0 = ' + str(w0[0][0]))
    print('w1 = ' + str(w1))
    return w

Then we validate the performance of our model by applying test set that we generated before.

In [4]:
def validateGDA(filepath, w):
    data = csvReader(filepath)
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    w0 = w[0]
    w1 = w[1]
    for i in range(len(data)):
        a = w0[0][0] + np.dot(data[i][:-1], w1)
        if (a > 0):
            if(data[i][len(data[0]) - 1] == -1):
                TN += 1
            else:
                FN += 1
        elif (a < 0):
            if (data[i][len(data[0]) - 1] == 1):
                TP += 1
            else:
                FP += 1
    accuracy = (TP + TN) / (TP + TN + FN + FP)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1measure = (2*precision*recall) / (precision+recall)
    print('Accuracy = ' + str(accuracy))
    print('Precision = ' + str(precision))
    print('Recall = ' + str(recall))
    print('F1 measure = ' + str(F1measure))


w = gda('hwk2_datasets/DS1-train.csv')
validateGDA('hwk2_datasets/DS1-test.csv', w)


w0 = 1.9463125787853652
w1 = [[ 1.02193507]
 [-0.61584269]
 [-0.44156509]
 [-0.27127862]
 [-0.74051604]
 [-0.29672473]
 [ 1.30321243]
 [-1.71359568]
 [-2.13595237]
 [ 0.62525576]
 [-0.91837962]
 [-0.91067288]
 [ 1.1556343 ]
 [ 0.99645741]
 [-0.38852566]
 [ 0.91743229]
 [ 2.18270508]
 [-0.48336646]
 [-0.1325924 ]
 [-0.33832297]]
Accuracy = 0.95875
Precision = 0.9508599508599509
Recall = 0.9675
F1 measure = 0.9591078066914498


## Question 3

By applying Euclidean Distance equation, we can obtain an array of distance and find the k nearest neighbors. And then assign the input x to relative class (the most common class among neighbors). 

In [5]:
def KNN(X_train, X_test, k):
    d = []
    for i in range(len(X_train)):
        sum = 0
        for j in range(len(X_test) - 1):
            sum += np.square(X_train[i][j]-X_test[j])
        d.append([np.sqrt(sum), i, X_train[i][len(X_train[0]) - 1]])

    d = sorted(d)
    nn = []
    for i in range (k):
        nn.append(d[i][2])
    prediction = np.sum(nn)
    if (prediction > 0):
        return 1
    else:
        return -1

Now we can use the KNN classifier to classify each data sample in the test set and compare with the actual class.

The function will also find the best K in range 1 to 20 (based on F1 measure), however, since the computation takes really long time, I'll set the range to 5 here.

(Note: we need to normalize the dataset in KNN classifier, I used sklearn library here)

In [6]:
def validateKNN(trainDataset, testDataset):
    trainData = csvReader(trainDataset)
    trainData[:, :-1] = preprocessing.normalize(trainData[:, :-1], axis=0)  # need to normalize the data
    testData = csvReader(testDataset)
    testData[:, :-1] = preprocessing.normalize(testData[:, :-1], axis=0)

    bestk = 0
    bestF1 = 0

    for i in range (1, 6):
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        for j in range(len(testData)):
            prediction = KNN(trainData, testData[j], i)
            if (prediction > 0):
                if (trainData[j][len(trainData[0]) - 1] == -1):
                    TN += 1
                else:
                    FN += 1
            else:
                if (trainData[j][len(trainData[0]) - 1] == 1):
                    TP += 1
                else:
                    FP += 1
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        F1measure = (2 * precision * recall) / (precision + recall)
        print('k = ' + str(i))
        print(' F1 Measure = ' + str(F1measure))
        if (F1measure > bestF1):
            bestF1 = F1measure
            bestk = i

    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(len(testData)):
        prediction = KNN(trainData, testData[i], bestk)
        if (prediction > 0):
            if (trainData[i][len(trainData[0]) - 1] == -1):
                TN += 1
            else:
                FN += 1
        else:
            if (trainData[i][len(trainData[0]) - 1] == 1):
                TP += 1
            else:
                FP += 1
    accuracy = (TP + TN) / (TP + TN + FN + FP)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1measure = (2 * precision * recall) / (precision + recall)
    print('Best K = ' + str(bestk))
    print('Accuracy = ' + str(accuracy))
    print('Precision = ' + str(precision))
    print('Recall = ' + str(recall))
    print('F1 measure = ' + str(F1measure))

In [7]:
validateKNN('hwk2_datasets/DS1-train.csv', 'hwk2_datasets/DS1-test.csv')

k = 1
 F1 Measure = 0.4673913043478261
k = 2
 F1 Measure = 0.5360824742268041
k = 3
 F1 Measure = 0.4775725593667546
k = 4
 F1 Measure = 0.5166666666666667
k = 5
 F1 Measure = 0.478494623655914
Best K = 2
Accuracy = 0.49375
Precision = 0.4746450304259635
Recall = 0.6157894736842106
F1 measure = 0.5360824742268041


When K = 2, it gives the best fit with F1 measure = 0.5360824742268041

## Question 4

Similar to Question 1, generate dataset 2 by a mixture of 3 Gaussians with mixture probability (0.1, 0.42, 0.48)

In [8]:
def generateData(m11, m12, m13, m21, m22, m23, c1, c2, c3):
    mean11 = csvReader(m11)
    mean12 = csvReader(m12)
    mean13 = csvReader(m13)
    mean21 = csvReader(m21)
    mean22 = csvReader(m22)
    mean23 = csvReader(m23)
    cov1 = csvReader(c1)
    cov2 = csvReader(c2)
    cov3 = csvReader(c3)
    class11 = np.random.multivariate_normal(mean11[0], cov1, 2000)
    class12 = np.random.multivariate_normal(mean12[0], cov2, 2000)
    class13 = np.random.multivariate_normal(mean13[0], cov3, 2000)
    class21 = np.random.multivariate_normal(mean21[0], cov1, 2000)
    class22 = np.random.multivariate_normal(mean22[0], cov2, 2000)
    class23 = np.random.multivariate_normal(mean23[0], cov3, 2000)
    class1_temp = []
    class2_temp = []

    for i in range(2000):
        choice = np.random.choice([1,2,3,], 1, p=[0.1,0.42,0.48])
        if (choice == 1):
            class1_temp.append(class11[i])
            class2_temp.append(class21[i])
        elif (choice == 2):
            class1_temp.append(class12[i])
            class2_temp.append(class22[i])
        else:
            class1_temp.append(class13[i])
            class2_temp.append(class23[i])

    classNegative = -1 * np.ones((2000, 21), dtype='float')
    classNegative[:,:-1] = class1_temp
    classPositive = np.ones((2000, 21), dtype='float')
    classPositive[:,:-1] = class2_temp

    random.shuffle(classNegative)
    random.shuffle(classPositive)

    testSet = []
    validSet = []
    trainSet = []
    testBound = int(len(classNegative) * 0.2)
    validBound = int(len(classNegative) * 0.2)

    for i in range(2000):
        if (i < testBound):
            testSet.append(classNegative[i])
            testSet.append(classPositive[i])
        elif (i >= testBound and i < testBound + validBound):
            validSet.append(classNegative[i])
            validSet.append(classPositive[i])
        else:
            trainSet.append(classNegative[i])
            trainSet.append(classPositive[i])

    random.shuffle(testSet)
    random.shuffle(trainSet)
    random.shuffle(validSet)

    with open('hwk2_datasets/DS2-test.csv', 'w') as csvfile:
        csvWriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for i in range(len(testSet)):
            csvWriter.writerow(testSet[i])

    with open('hwk2_datasets/DS2-valid.csv', 'w') as csvfile:
        csvWriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for i in range(len(validSet)):
            csvWriter.writerow(validSet[i])

    with open('hwk2_datasets/DS2-train.csv', 'w') as csvfile:
        csvWriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for i in range(len(trainSet)):
            csvWriter.writerow(trainSet[i])

# generateData('hwk2_datasets/DS2_c1_m1.txt', 'hwk2_datasets/DS2_c1_m2.txt', 'hwk2_datasets/DS2_c1_m3.txt',
#              'hwk2_datasets/DS2_c2_m1.txt', 'hwk2_datasets/DS2_c2_m2.txt', 'hwk2_datasets/DS2_c2_m3.txt',
#              'hwk2_datasets/DS2_Cov1.txt', 'hwk2_datasets/DS2_Cov2.txt', 'hwk2_datasets/DS2_Cov3.txt')

## Question 5

Repeat GDA on dataset 2

In [9]:
w = gda('hwk2_datasets/DS2-train.csv')
validateGDA('hwk2_datasets/DS2-test.csv', w)

w0 = 0.00944858874154364
w1 = [[-0.01587187]
 [-0.03097841]
 [ 0.02603874]
 [-0.00485993]
 [ 0.00947184]
 [ 0.00059935]
 [ 0.01926517]
 [ 0.08241396]
 [-0.02991102]
 [ 0.01269773]
 [-0.01779393]
 [ 0.00205633]
 [ 0.00282658]
 [-0.02220273]
 [-0.0197494 ]
 [ 0.02114536]
 [-0.02914988]
 [-0.03820653]
 [ 0.01388067]
 [ 0.01200783]]
Accuracy = 0.535
Precision = 0.5333333333333333
Recall = 0.56
F1 measure = 0.5463414634146342


Everything drops significantly on dataset 2.

Repeat KNN on dataset 2

In [10]:
validateKNN('hwk2_datasets/DS2-train.csv', 'hwk2_datasets/DS2-test.csv')

k = 1
 F1 Measure = 0.5341317365269461
k = 2
 F1 Measure = 0.5896656534954406
k = 3
 F1 Measure = 0.5193236714975845
k = 4
 F1 Measure = 0.6021052631578947
k = 5
 F1 Measure = 0.5364705882352943
Best K = 4
Accuracy = 0.5275
Precision = 0.5267034990791897
Recall = 0.7027027027027027
F1 measure = 0.6021052631578947


When K = 4, it gives the best fit with F1 measure = 0.6021052631578947 

## Question 6

See report