In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from collections import Counter

import functools

import math

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

In [2]:
inputFile = "water-treatmennt-original-marked.csv"

dataFrame = pd.read_csv(inputFile, header = 0, sep = ';')
print(dataFrame.shape)
data = dataFrame.values

print("First few data rows:")
print(data[0:10, 0:5])

numSamples = dataFrame.shape[0]
print("Num samples == " + str(numSamples))
numFeatures = dataFrame.shape[1] - 2  #first feature is sampling date and last feature used as class marker
print("Num features == " + str(numFeatures))

regularizationStrength = 1
learningRate = 0.000001

(527, 40)
First few data rows:
[['D-1/3/90' 44101.0 1.5 7.8 nan]
 ['D-2/3/90' 39024.0 3.0 7.7 nan]
 ['D-4/3/90' 32229.0 5.0 7.6 nan]
 ['D-5/3/90' 35023.0 3.5 7.9 205.0]
 ['D-6/3/90' 36924.0 1.5 8.0 242.0]
 ['D-7/3/90' 38572.0 3.0 7.8 202.0]
 ['D-8/3/90' 41115.0 6.0 7.8 nan]
 ['D-9/3/90' 36107.0 5.0 7.7 215.0]
 ['D-11/3/90' 29156.0 2.5 7.7 206.0]
 ['D-12/3/90' 39246.0 2.0 7.8 172.0]]
Num samples == 527
Num features == 38


In [3]:
##########################################Removal of useless features#####################

Y = dataFrame.iloc[:, -1]  # Class markers
X = dataFrame.iloc[:, 1:-1]  # Features without date and class markers

numClasses = np.amax(Y)
print("Num classes: {0}".format(numClasses))

print(X.iloc[0:10, 0:2])

##########################################Normalization#####################

normalizedX = MinMaxScaler().fit_transform(X)
X = pd.DataFrame(normalizedX)

print(X.iloc[0:10, 0:2])

##########################################Missing data handling#####################

for sampleId in range(X.shape[0]):
    for featureId in range(numFeatures):
        if (math.isnan(X.iloc[sampleId, featureId])):
            X.iloc[sampleId, featureId] = 0.5

Num classes: 13.0
   Q-E      (input flow to plant)   ZN-E     (input Zinc to plant)
0                         44101.0                              1.5
1                         39024.0                              3.0
2                         32229.0                              5.0
3                         35023.0                              3.5
4                         36924.0                              1.5
5                         38572.0                              3.0
6                         41115.0                              6.0
7                         36107.0                              5.0
8                         29156.0                              2.5
9                         39246.0                              2.0
          0         1
0  0.680598  0.041916
1  0.579121  0.086826
2  0.443305  0.146707
3  0.499151  0.101796
4  0.537147  0.041916
5  0.570087  0.086826
6  0.620915  0.176647
7  0.520817  0.146707
8  0.381883  0.071856
9  0.583558  0.056886


In [4]:
def addFeatureForBias(data):
    extendedData = np.zeros((data.shape[0], data.shape[1] + 1))
    extendedData[:, 0:-1] = data
    extendedData[:, -1] = int(1)
    
    return extendedData

# test = pd.DataFrame([[0, 0],
#                      [0, 0]])
# print(addFeatureForBias(test))

X = addFeatureForBias(X)

In [5]:
def computeSoftMarginCost(weights, X, Y):
    # Hinge loss
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, weights))
    
    numClassificationErrors = len(distances[distances < 0])
    
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    
    dangerousDistances = distances[distances < 1]
    
    numClassificationWarnings = len(dangerousDistances[dangerousDistances > 0])
    
    hinge_loss = regularizationStrength * (np.sum(distances) / N)
    
    # Soft margin cost function
    cost = 1 / 2 * np.dot(weights, weights) + hinge_loss
    return cost, numClassificationWarnings, numClassificationErrors

def computeCostFunctionGradient(weights, xBatch, yBatch):
#     print(type(xBatch))
#     print(type(yBatch))
    
    # In case of SGD
    if (type(yBatch) != np.ndarray):
        yBatch = np.array([yBatch])
        xBatch = np.array([xBatch])
        
    distances = 1 - (yBatch * np.dot(xBatch, weights))
    
    weightsDelta = np.zeros(len(weights))
    
    for index, distance in enumerate(distances):
        if (max(0, distance) == 0):
            deltaI = weights
        else:
            deltaI = weights - (regularizationStrength * yBatch[index] * xBatch[index])
            
        weightsDelta += deltaI
        
    weightsDelta = weightsDelta / len(yBatch)
    
    return weightsDelta

def stochasticGradientDescent(features, outputs):
    maxEpochs = 1024
    weights = np.zeros(features.shape[1])

    prevCost = float("inf")
    convergenceThreshold = 0.01 #percents
    
    classificationWarnings = 0
    classificationErrors = 0
    
    for epoch in range(maxEpochs): 
        # To prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        
        for ind, x in enumerate(X):
            gradientAscent = computeCostFunctionGradient(weights, x, Y[ind])
            weights = weights - (learningRate * gradientAscent)
            
        if ((epoch % 100 == 0) or (epoch >= maxEpochs - 1)):
            cost, classificationWarnings, classificationErrors = computeSoftMarginCost(weights, 
                                                                                       features, 
                                                                                       outputs)
            
            print("Epoch #{0}: cost == {1}".format(epoch, cost))
            print("Epoch #{0}: Num classification errors == {1}".format(epoch, classificationErrors))
            print("Epoch #{0}: Num samples inside borders == {1}".format(epoch, classificationWarnings))
            
            if (abs(prevCost - cost) < convergenceThreshold * prevCost):
                return weights, classificationWarnings, classificationErrors
            
            prevCost = cost
            
    return weights, classificationWarnings, classificationErrors

In [6]:
def classifyOneVsAll(oneClassId):
    yOneVsAll = list(map(lambda y: -1.0 if y == oneClassId else 1.0, Y))

    # print(yFirstVsAll)
    # print(type(X.values))
    # print(type(yFirstVsAll))

    X_train, X_test, y_train, y_test = train_test_split(X, yOneVsAll, test_size=0.3, random_state=42)

    print("training started...")
    print("Num samples: {0}".format(X_train.shape[0]))
    
    W, warnings, errors = stochasticGradientDescent(X_train, y_train)
    
    if (errors > 0):
        return False
    
    print("training finished.")
    print("weights are: {}".format(W))

    y_test_predicted = [0] * X_test.shape[0]

    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(W, X_test[i]))
        y_test_predicted[i] = yp

    # print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    
    return True

In [7]:
for classId in range(1, int(numClasses) + 1):
    print(">>>Trying to split Class#{0} from other classes...".format(classId))
    if (False == classifyOneVsAll(classId)):
        print(">>>Classification of Class#{0} vs All failed".format(classId))

>>>Trying to split Class#1 from other classes...
training started...
Num samples: 368
Epoch #0: cost == 0.9999740688324595
Epoch #0: Num classification errors == 0
Epoch #0: Num samples inside borders == 195
Epoch #100: cost == 0.9974751021860854
Epoch #100: Num classification errors == 0
Epoch #100: Num samples inside borders == 195
training finished.
weights are: [-6.01799537e-04  1.09237194e-04 -7.76653774e-04 -1.39961936e-03
 -9.97577076e-04  5.09797337e-04 -3.92331636e-03  6.96262297e-05
 -8.67018924e-04 -4.29643330e-04 -1.13303020e-03  5.99184078e-04
 -3.44773987e-03  4.89132557e-05 -9.70287366e-04 -4.88114100e-04
 -2.27117745e-03 -2.89318617e-03 -3.90996794e-04 -2.91325748e-03
 -5.66545617e-06 -1.12985387e-03 -5.51117948e-06 -7.30931442e-04
 -9.71689789e-04 -4.33764053e-04 -2.15363047e-03 -3.39739822e-06
 -7.90907354e-04  7.21617169e-04 -8.51722783e-05 -2.23911989e-03
 -1.67460187e-03 -1.98355452e-03 -1.38753854e-03 -1.58955059e-03
 -1.35895611e-03 -2.21037476e-03 -2.18122189e-0

Epoch #300: cost == 0.3390781934724125
Epoch #300: Num classification errors == 14
Epoch #300: Num samples inside borders == 311
Epoch #400: cost == 0.28972624815751263
Epoch #400: Num classification errors == 213
Epoch #400: Num samples inside borders == 112
Epoch #500: cost == 0.28887308154468805
Epoch #500: Num classification errors == 239
Epoch #500: Num samples inside borders == 86
>>>Classification of Class#9 vs All failed
>>>Trying to split Class#10 from other classes...
training started...
Num samples: 368
Epoch #0: cost == 0.9959982678717269
Epoch #0: Num classification errors == 0
Epoch #0: Num samples inside borders == 368
Epoch #100: cost == 0.6103387683979867
Epoch #100: Num classification errors == 0
Epoch #100: Num samples inside borders == 368
Epoch #200: cost == 0.25204441415569073
Epoch #200: Num classification errors == 0
Epoch #200: Num samples inside borders == 368
Epoch #300: cost == 0.06727996459743996
Epoch #300: Num classification errors == 248
Epoch #300: Num 