In [54]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('car_data.csv', delimiter=',', header=None, index_col=0).values
classes = list(set(dataset[:, -1]))
classMap = dict(zip(classes, range(len(classes))))
print(classMap)

{'bus': 0, 'saab': 1, 'opel': 2, 'van': 3}


In [55]:
def getTwoClassMap(classMap, names):
    if (len(names) != 2):
        print("must be two classes")
    return dict([(names[0], -1), (names[1], 1)])

def convert(x, y, classMap):
    res_x = list()
    res_y = list()
    for i in range (len(y)):
        classLabel = classMap.get(y[i])
        if (classLabel != None):
            res_y.append(classLabel)
            #inserting 1 as bias
            res_x.append(np.insert(x[i], 0, 1, axis = 0))
    return normalize(np.array(res_x)), np.array(res_y)

def normalize(x):
    y = np.transpose(x.astype(float))
    for i in range(y.shape[0]):
        y[i] /= np.max(y[i])
    return np.transpose(y)

def calculateClassAm(classes, classArr):
    ctr = np.zeros(len(classes))
    for className in classArr:
        for i in range(len(classes)):
            if (classes[i] == className):
                ctr[i] = ctr[i] + 1
    for i in range(len(classes)):
        print("\tclass ", classes[i], " : ", ctr[i])

In [56]:
calculateClassAm(classes, dataset[:, -1])

	class  bus  :  218.0
	class  saab  :  217.0
	class  opel  :  212.0
	class  van  :  199.0


In [57]:
print(dataset[:,-1].shape)

(846,)


In [58]:
#will working with bus and van
workingPair = classes[:2]
twoClassMap = getTwoClassMap(classMap, workingPair)
print("working with ", twoClassMap)

data, labels = convert(dataset[:, :-1].astype(int), dataset[:, -1], twoClassMap)
print("data shape ", data.shape)
print("labels shape ", labels.shape)
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)
print("x_train shape ", x_train.shape)
print("x_test shape ", x_test.shape)

working with  {'bus': -1, 'saab': 1}
data shape  (435, 19)
labels shape  (435,)
x_train shape  (348, 19)
x_test shape  (87, 19)


In [59]:
def sigmoid(x):
    return 1. / (1. + math.exp(-x))

def dsigmoid(x):
    return math.exp(-x) / (1. + math.exp(-x))**2

def dsigmoid_opt(sigmoid_value):
    return sigmoid_value* (1 - sigmoid_value)

def margin(x,y,w):
    return y*x.dot(w)
    
def logloss(x,y,w):
    res = 0
    for i in range(len(y)):
        res = res + math.log(sigmoid(margin(x[i], y[i], w)))
    return res

def dlogloss(x,y,w):
    res = np.zeros(w.shape)
    for i in range(len(w)):
        for j in range(len(y)):
            sigma = sigmoid(margin(x[j], y[j], w))
            res[i] = res[i] + dsigmoid_opt(sigma) / sigma * y[j] * x[j][i]
    return res
            
def logReg(x, y, eps, step, w0, changeStep):
    w = np.zeros(x.shape[1])
    w[0] = w0
    prev_w = np.zeros(x.shape[1])
    diff_norm = eps
    i = 0
    while (diff_norm >= eps):
        if (changeStep):
            step = step / (i + 1)**0.3
        prev_w = w
        w = w + step * dlogloss(x,y,w)
        log_loss = logloss(x,y,w)
        diff_norm = np.linalg.norm(w - prev_w)
        if (i % 100 == 0):
            print("iter ", i, " log loss = ", log_loss, " diff_norm = ", diff_norm)
        i = i + 1
    return w

In [60]:
w_res = logReg(x_train,y_train, 0.004, 0.002, 0.5, False)

iter  0  log loss =  -244.9898283309022  diff_norm =  0.2581604237245165
iter  100  log loss =  -202.89857181501597  diff_norm =  0.019663793650147407
iter  200  log loss =  -189.46064745084067  diff_norm =  0.014003790238188325
iter  300  log loss =  -181.42685105856208  diff_norm =  0.011624004623929976
iter  400  log loss =  -175.47059282120762  diff_norm =  0.010318853120535562
iter  500  log loss =  -170.60731260555573  diff_norm =  0.009462208017126102
iter  600  log loss =  -166.44085078213038  diff_norm =  0.008827447989698037
iter  700  log loss =  -162.77481519412848  diff_norm =  0.008319661229328356
iter  800  log loss =  -159.49526824546382  diff_norm =  0.007893730581161265
iter  900  log loss =  -156.52809320028413  diff_norm =  0.007525486000667204
iter  1000  log loss =  -153.82107876183105  diff_norm =  0.007200537048060123
iter  1100  log loss =  -151.33536406030376  diff_norm =  0.006909558286856618
iter  1200  log loss =  -149.04086994294696  diff_norm =  0.0066461

In [61]:
def replaceOnLabels(y):
    res = np.zeros(len(y), dtype = int)
    for i in range(len(y)):
        if (y[i] < 0):
            res[i] = -1
        else:
            res[i] = 1
    return res

In [62]:
result = replaceOnLabels(x_test.dot(w_res))
print("result = ", w_res)


result =  [ 0.57370236  3.46819139 -2.74040866 11.77392062  0.78731224 -8.84756256
  6.62511999  0.03649157  4.20556388  1.90305374 -0.96290196 -4.30596082
 -1.54153713 -3.07380966 -9.05194546  2.67050111  1.10482283 -2.9584885
  2.52470713]


In [63]:
def accuracy(res, real):
    correct = 0.
    for i in range (len(res)):
        if (res[i] == real[i]):
            correct = correct + 1
    return correct / len(res)

def getClassIndex(value, classes):
    for classIndex in range(len(classes)):
        if (classes[classIndex] == value):
            return classIndex
    print("class ", value, " not found")
    return -1
    
def confusionMatrix(res, real, classes):
    classAm = len(classes)
    matrix = np.zeros(classAm * classAm).reshape(classAm, classAm)
    for i in range(len(res)):
        classIndexRes = getClassIndex(res[i], classes)
        if (res[i] == real[i]):
            matrix[classIndexRes][classIndexRes] = matrix[classIndexRes][classIndexRes] + 1
        else:
            classIndexReal = getClassIndex(real[i], classes)
            matrix[classIndexRes][classIndexReal] = matrix[classIndexRes][classIndexReal] + 1
    return matrix


def precision(matrix, classIndex):
    return matrix[classIndex][classIndex] / np.sum(matrix, axis = 1)[classIndex]

def recall(matrix, classIndex):
    return matrix[classIndex][classIndex] / np.sum(matrix, axis = 0)[classIndex]

In [64]:
print("accuracy: ", accuracy(result, y_test))
print("classes are: ", twoClassMap)
classValues = np.fromiter(twoClassMap.values(), dtype=int)
confusion_matrix = confusionMatrix(result,y_test, classValues)
print("confusion Matrix:\n",  confusion_matrix)
print("precision: ", precision(confusion_matrix, classValues[0]))
print("recall: ", recall(confusion_matrix, classValues[0]))

accuracy:  0.9310344827586207
classes are:  {'bus': -1, 'saab': 1}
confusion Matrix:
 [[39.  4.]
 [ 2. 42.]]
precision:  0.9545454545454546
recall:  0.9130434782608695


## Multiclass classification: 
### part 1. One vs Rest 

In [65]:
def getOneVsRestMap(classMap, targetClass):
    result = dict()
    for className in classMap.keys():
        if (className == targetClass):
            result[className] = 1
        else:
            result[className] = -1
    return result
            
def convertClassNamesToLabels(classArr, classMap):
    res = np.zeros(classArr.shape)
    for i in range(len(classArr)):
        res[i] = classMap.get(classArr[i])
    return res

In [66]:
allData = convert(dataset[:, :-1].astype(int), dataset[:, -1], classMap)[0]
x_train_total, x_test_total, y_train_total, y_test_total = train_test_split(allData, dataset[:,-1], test_size=0.2)

y_test_arr = np.zeros(len(classes), dtype = "object")
y_train_arr = np.zeros(len(classes), dtype = "object")
w_arr = np.zeros(len(classes), dtype = "object")

ctr = 0
for className in classes:
    y_train_arr[ctr] = convertClassNamesToLabels(y_train_total, getOneVsRestMap(classMap, className))
    y_test_arr[ctr] = convertClassNamesToLabels(y_test_total, getOneVsRestMap(classMap, className))
    print("======================== \n")
    print("working with class ", className)
    print("x_train shape :", x_train_total.shape, " y_train shape: ", y_train_arr[ctr].shape)
    w_arr[ctr] = logReg(x_train_total, y_train_arr[ctr], 0.004, 0.002, 0.5, False)
    ctr = ctr + 1


working with class  bus
x_train shape : (676, 19)  y_train shape:  (676,)
iter  0  log loss =  -639.639104117179  diff_norm =  1.4534349753357647
iter  100  log loss =  -335.6636069842865  diff_norm =  0.25630884697082035
iter  200  log loss =  -306.60175022093273  diff_norm =  0.02194139623777289
iter  300  log loss =  -290.97444653398446  diff_norm =  0.016653537060933477
iter  400  log loss =  -278.41297403824166  diff_norm =  0.015128806570280338
iter  500  log loss =  -267.8972998875237  diff_norm =  0.013925808115731474
iter  600  log loss =  -258.9097113966373  diff_norm =  0.012925549088774758
iter  700  log loss =  -251.11638848961178  diff_norm =  0.01207315208030654
iter  800  log loss =  -244.28103250669017  diff_norm =  0.011335362749995453
iter  900  log loss =  -238.22863803379238  diff_norm =  0.01068925966660939
iter  1000  log loss =  -232.82579642342336  diff_norm =  0.010118109540980396
iter  1100  log loss =  -227.96853595003256  diff_norm =  0.009609291718358948


iter  1200  log loss =  -181.4601247568792  diff_norm =  0.00946628814614206
iter  1300  log loss =  -177.1668382093123  diff_norm =  0.009076197461520793
iter  1400  log loss =  -173.21379068114317  diff_norm =  0.00871594048368558
iter  1500  log loss =  -169.562827073502  diff_norm =  0.008382473205775759
iter  1600  log loss =  -166.18106610518075  diff_norm =  0.008073144070744453
iter  1700  log loss =  -163.04004354886445  diff_norm =  0.0077856267263293295
iter  1800  log loss =  -160.11501789353386  diff_norm =  0.00751786924890962
iter  1900  log loss =  -157.3844017736331  diff_norm =  0.007268054288408996
iter  2000  log loss =  -154.82929262088936  diff_norm =  0.007034566859524169
iter  2100  log loss =  -152.43308269243244  diff_norm =  0.006815967744399386
iter  2200  log loss =  -150.1811332744723  diff_norm =  0.006610971177922744
iter  2300  log loss =  -148.06050120547692  diff_norm =  0.006418425906017261
iter  2400  log loss =  -146.05970834243993  diff_norm =  0.

In [67]:
for i in range(len(classes)):
    print("\nfor class ", classes[i])
    print("w = ", w_arr[i])
    res = replaceOnLabels(x_test_total.dot(w_arr[i]))
    print("accuracy = ", accuracy(res, y_test_arr[i]))
    confusion_matrix_one_vs_rest = confusionMatrix(res, y_test_arr[i], np.array([1, -1]))
    print("confusion matrix:\n", confusion_matrix_one_vs_rest)
    print("precision: ", precision(confusion_matrix_one_vs_rest, 1))
    print("recall: ", recall(confusion_matrix_one_vs_rest, 1))


for class  bus
w =  [  3.27644107  -5.96254188   0.85025349 -18.22905206  -1.40870678
  13.51309915 -12.38802954   2.1687746  -12.61859978  -1.71276931
  -2.73593528   8.0717905    1.13570163   5.14068189  12.2160373
  -2.29402227  -1.02648771   7.82267027  -2.20697906]
accuracy =  0.888235294117647
confusion matrix:
 [[ 35.   6.]
 [ 13. 116.]]
precision:  0.8992248062015504
recall:  0.9508196721311475

for class  saab
w =  [ 0.82673173  3.58904173 -4.36550587  3.60599632  3.52968832 -3.50316712
  0.48307152  0.79987199 -0.4584361   1.40124498 -5.52732913 -0.71438018
 -0.73733954  1.7236649  -4.97993285  1.20772771  0.88627629 -1.32002469
  1.6346872 ]
accuracy =  0.7941176470588235
confusion matrix:
 [[  9.   5.]
 [ 30. 126.]]
precision:  0.8076923076923077
recall:  0.9618320610687023

for class  opel
w =  [ 2.07738839 -7.77234584  1.9493152   2.76149793  2.32708378 -3.94179582
  0.11374353  2.49446144  0.61739985  1.70361887 -0.53367401 -1.29449219
  2.0137379  -3.72388851 -3.765663

In [68]:
def getResultClass(classes, w_arr, x_test):
    result = np.zeros(x_test.shape[0], dtype = "object")
    classProbability = x_test.dot(w_arr[0])
    for i in range(1, len(classes)):
        classProbability = np.vstack((classProbability, x_test.dot(w_arr[i])))
    classProbability = classProbability.T
    print(classProbability.shape)
    for i in range(result.shape[0]):
        bestClass = 0
        for j in range(len(classes)):
            if (j == 0):
                maxProb = sigmoid(classProbability[i][j])
            if (sigmoid(classProbability[i][j]) > maxProb):
                maxProb = sigmoid(classProbability[i][j])
                bestClass = j
        result[i] = classes[bestClass]
    return result

In [69]:
oneVsRestResult = getResultClass(classes, w_arr, x_test_total)
print("accuracy = ", accuracy(oneVsRestResult, y_test_total))
print("distribution of classes in the sample ")
calculateClassAm(classes, y_test_total)
final_oneVsRest_confusionMatrix = confusionMatrix(oneVsRestResult, y_test_total, classes)
print("confusion matrix for classes ", classes, " is:\n", final_oneVsRest_confusionMatrix)
for i in range(len(classes)):
    print("for class ", classes[i])
    print("precision : ", precision(final_oneVsRest_confusionMatrix, i))
    print("recall : ", recall(final_oneVsRest_confusionMatrix, i))

(170, 4)
accuracy =  0.7058823529411765
distribution of classes in the sample 
	class  bus  :  48.0
	class  saab  :  39.0
	class  opel  :  47.0
	class  van  :  36.0
confusion matrix for classes  ['bus', 'saab', 'opel', 'van']  is:
 [[43.  7.  3.  2.]
 [ 2. 26. 20.  0.]
 [ 1.  3. 17.  0.]
 [ 2.  3.  7. 34.]]
for class  bus
precision :  0.7818181818181819
recall :  0.8958333333333334
for class  saab
precision :  0.5416666666666666
recall :  0.6666666666666666
for class  opel
precision :  0.8095238095238095
recall :  0.3617021276595745
for class  van
precision :  0.7391304347826086
recall :  0.9444444444444444


### part 2. One vs One

In [17]:
oneVsOneMaps = [
    getTwoClassMap(classMap, classes[:2]),
    getTwoClassMap(classMap, classes[1:3]),
    getTwoClassMap(classMap, classes[2:4]),
    getTwoClassMap(classMap, [classes[0],classes[2]]),
    getTwoClassMap(classMap, [classes[0],classes[3]]),
    getTwoClassMap(classMap, [classes[1],classes[3]])
               ]
oneVsOneMaps

[{'bus': -1, 'saab': 1},
 {'saab': -1, 'opel': 1},
 {'opel': -1, 'van': 1},
 {'bus': -1, 'opel': 1},
 {'bus': -1, 'van': 1},
 {'saab': -1, 'van': 1}]

In [18]:
def convertForOneToOne(x, y, classMap):
    res_x = list()
    res_y = list()
    for i in range(len(y)):
        if (classMap.get(y[i]) != None):
            res_x.append(x[i])
            res_y.append(classMap.get(y[i]))
        
    return normalize(np.array(res_x)), np.array(res_y)

In [19]:
arrSize = len(oneVsOneMaps)
data_arr = np.zeros(arrSize, dtype = "object")
label_arr = np.zeros(arrSize, dtype = "object")
x_train_arr = np.zeros(arrSize, dtype = "object")
y_train_arr = np.zeros(arrSize, dtype = "object")
x_test_arr = np.zeros(arrSize, dtype = "object")
y_test_arr = np.zeros(arrSize, dtype = "object")
w_res_arr = np.zeros(arrSize, dtype = "object")
x_train_total, x_test_total, y_train_total, y_test_total = train_test_split(allData, dataset[:,-1], test_size=0.2)


for i in range(arrSize):
    print("\n==================")
    print("classes are: ", oneVsOneMaps[i])
    data_arr[i], label_arr[i] = convertForOneToOne(x_train_total, y_train_total, oneVsOneMaps[i])
    x_train_arr[i], x_test_arr[i], y_train_arr[i], y_test_arr[i] = train_test_split(data_arr[i], label_arr[i], test_size=0.1)
    w_res_arr[i] = logReg(x_train_arr[i],y_train_arr[i], 0.003, 0.002, 0.5, False)
    result = replaceOnLabels(x_test_arr[i].dot(w_res_arr[i]))
    print("accuracy: ", accuracy(result, y_test_arr[i]))
    classValues = np.fromiter(oneVsOneMaps[i].values(), dtype=int)
    confusion_matrix = confusionMatrix(result,y_test_arr[i], classValues)
    print("confusion Matrix:\n",  confusion_matrix)
    print("precision: ", precision(confusion_matrix, classValues[0]))
    print("recall: ", recall(confusion_matrix, classValues[0]))


classes are:  {'bus': -1, 'saab': 1}
iter  0  log loss =  -220.34021433763638  diff_norm =  0.2338683472301085
iter  100  log loss =  -181.54334993761555  diff_norm =  0.019604807798785904
iter  200  log loss =  -168.3858522632286  diff_norm =  0.013656166277529313
iter  300  log loss =  -160.97735919441791  diff_norm =  0.010980451622333597
iter  400  log loss =  -155.79351512982765  diff_norm =  0.00951665582853132
iter  500  log loss =  -151.72074686175114  diff_norm =  0.008602588098606164
iter  600  log loss =  -148.30419549081876  diff_norm =  0.007969244917344867
iter  700  log loss =  -145.32504856058074  diff_norm =  0.007493369424876624
iter  800  log loss =  -142.66416520093492  diff_norm =  0.007113424488019891
iter  900  log loss =  -140.249873104858  diff_norm =  0.006796320591575251
iter  1000  log loss =  -138.03541849735248  diff_norm =  0.006523023476697676
iter  1100  log loss =  -135.9882448776391  diff_norm =  0.006281896186884742
iter  1200  log loss =  -134.0844

iter  3500  log loss =  -71.08552243794233  diff_norm =  0.0035780600353828573
iter  3600  log loss =  -70.45621293634018  diff_norm =  0.0035185157744346655
iter  3700  log loss =  -69.84747368781518  diff_norm =  0.0034610873156820706
iter  3800  log loss =  -69.25825537071678  diff_norm =  0.0034056688818820914
iter  3900  log loss =  -68.68757807102605  diff_norm =  0.003352161687391417
iter  4000  log loss =  -68.13452564533554  diff_norm =  0.003300473317487548
iter  4100  log loss =  -67.59824063924034  diff_norm =  0.0032505171806276514
iter  4200  log loss =  -67.07791969501585  diff_norm =  0.003202212023109034
iter  4300  log loss =  -66.5728093918167  diff_norm =  0.003155481497351569
iter  4400  log loss =  -66.08220246945604  diff_norm =  0.003110253776395492
iter  4500  log loss =  -65.60543439341109  diff_norm =  0.003066461208417863
iter  4600  log loss =  -65.14188022425782  diff_norm =  0.003024040005995416
accuracy:  0.9393939393939394
confusion Matrix:
 [[17.  0.]


iter  4200  log loss =  -62.84098012988635  diff_norm =  0.0034077240979970497
iter  4300  log loss =  -62.270094153922216  diff_norm =  0.00335141868928331
iter  4400  log loss =  -61.71772517989027  diff_norm =  0.0032971750104281
iter  4500  log loss =  -61.182912290421235  diff_norm =  0.0032448828820092223
iter  4600  log loss =  -60.66476082964479  diff_norm =  0.003194439640375195
iter  4700  log loss =  -60.16243681236919  diff_norm =  0.0031457495316400633
iter  4800  log loss =  -59.67516188228142  diff_norm =  0.0030987231606250955
iter  4900  log loss =  -59.20220875848467  diff_norm =  0.003053276989348623
iter  5000  log loss =  -58.74289711706461  diff_norm =  0.003009332880206195
accuracy:  1.0
confusion Matrix:
 [[15.  0.]
 [ 0. 18.]]
precision:  1.0
recall:  1.0

classes are:  {'saab': -1, 'van': 1}
iter  0  log loss =  -202.24850896815508  diff_norm =  0.30523699101744817
iter  100  log loss =  -143.18693430170367  diff_norm =  0.020808136878603212
iter  200  log los

In [52]:
import operator

def getInvMap(inputMap):
    return {v: k for k, v in inputMap.items()}

def getPredictedClass(prediction, twoMap):
    invMap = getInvMap(twoMap)
    if (prediction < 0):
        return invMap.get(-1)
    else:
        return invMap.get(1)
    
def predictOneToOne(w_arr, maps, x_test, classes):
    classProbability = x_test.dot(w_arr[0])
    result = np.zeros(x_test.shape[0], dtype = "object")
    for i in range(1, len(maps)):
        classProbability = np.vstack((classProbability, x_test.dot(w_arr[i])))
    classProbability = classProbability.T
    print(classProbability.shape)
    for i in range(len(result)):
        predictionDist = {k : 0. for k in classes}
        for j in range(len(maps)):
            probability = sigmoid(abs(classProbability[i][j]))                
            predictedClass = getPredictedClass(classProbability[i][j], maps[j])
            predictionDist[predictedClass] = predictionDist[predictedClass] + probability
#         print("predictionDist: ", predictionDist)
        finalPrediction = max(predictionDist.items(), key=operator.itemgetter(1))[0]
#         print("finalPrediction: ", finalPrediction)
        result[i] = finalPrediction
    return result


In [53]:
oneVsOneResult = predictOneToOne(w_res_arr, oneVsOneMaps, x_test_total,classes)
print("accuracy = ", accuracy(oneVsOneResult, y_test_total))
print("distribution of classes in the sample ")
calculateClassAm(classes, y_test_total)
final_oneVsOne_confusionMatrix = confusionMatrix(oneVsOneResult, y_test_total, classes)
print("confusion matrix for classes ", classes, " is:\n", final_oneVsOne_confusionMatrix)
for i in range(len(classes)):
    print("for class ", classes[i])
    print("precision : ", precision(final_oneVsOne_confusionMatrix, i))
    print("recall : ", recall(final_oneVsOne_confusionMatrix, i))

(170, 6)
accuracy =  0.6294117647058823
distribution of classes in the sample 
	class  bus  :  43.0
	class  saab  :  41.0
	class  opel  :  42.0
	class  van  :  44.0
confusion matrix for classes  ['bus', 'saab', 'opel', 'van']  is:
 [[27.  1.  1.  1.]
 [ 1.  2.  0.  0.]
 [ 6. 27. 35.  0.]
 [ 9. 11.  6. 43.]]
for class  bus
precision :  0.9
recall :  0.627906976744186
for class  saab
precision :  0.6666666666666666
recall :  0.04878048780487805
for class  opel
precision :  0.5147058823529411
recall :  0.8333333333333334
for class  van
precision :  0.6231884057971014
recall :  0.9772727272727273
