In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from bokeh.plotting import figure, show, Figure
from bokeh.models import ColumnDataSource, Label
from bokeh.io import output_notebook
output_notebook()

In [2]:
# Read the training and test CSV files that sit next to the notebook.
dataFrameTraining = pd.read_csv("./data/DWH_Training.csv")
dataFrameTesting = pd.read_csv("./data/DWH_Test.csv")

# Per-class height/weight buffers used as plot data sources; they are
# filled by getClusterCentroid(). Each map owns its own two lists.
maleHeightWeightMap = dict(height=[], weight=[])
femaleHeightWeightMap = dict(height=[], weight=[])

In [3]:
# Preview the first five training rows to check the column layout.
dataFrameTraining.head(n=5)

Unnamed: 0,index,height(cm),weight(kg),gender
0,1,173.685185,69.15806,-1
1,2,168.295454,55.384885,1
2,3,170.213057,63.701701,1
3,4,168.673799,52.104581,1
4,5,174.53074,70.698139,-1


In [4]:
# Preview the first five test rows to check the column layout.
dataFrameTesting.head(n=5)

Unnamed: 0,index,height(cm),weight(kg),gender,misc
0,1,174.731425,68.346183,-1,4.055077
1,2,167.961304,61.551896,1,-3.245166
2,3,167.291562,69.278931,1,-3.284287
3,4,169.423971,58.349019,1,-2.047819
4,5,175.86838,69.440505,-1,5.277262


In [5]:
def getClusterCentroid(dataFrameTraining):
    """Compute the mean (height, weight) point of each gender class.

    Rows whose ``gender`` equals ``-1`` form the negative class; every other
    row is treated as positive.

    Side effect: the module-level ``femaleHeightWeightMap`` (negative rows)
    and ``maleHeightWeightMap`` (positive rows) are filled with the heights
    and weights of this frame so they can be plotted. Their lists are
    overwritten in place rather than appended to, so calling this function
    again (for example by re-running the cell) does not duplicate points.

    NOTE(review): negative (-1) rows are stored under the "female" map. In
    the previewed rows the -1 samples are the taller/heavier ones, so confirm
    this matches the data set's label encoding.

    Parameters
    ----------
    dataFrameTraining : pandas.DataFrame
        Frame with ``height(cm)``, ``weight(kg)`` and ``gender`` columns.

    Returns
    -------
    tuple of (list, list)
        ``(positiveSamplesCentroid, negativeSamplesCentroid)``, each a
        two-element list ``[mean height, mean weight]``.

    Raises
    ------
    ValueError
        If either class has no rows, because its centroid is undefined.
    """
    featureColumns = ['height(cm)', 'weight(kg)']

    isNegative = dataFrameTraining['gender'] == -1
    negativeSamples = dataFrameTraining.loc[isNegative, featureColumns]
    positiveSamples = dataFrameTraining.loc[~isNegative, featureColumns]

    # Fail before touching the shared maps so an error leaves them unchanged.
    if negativeSamples.empty or positiveSamples.empty:
        raise ValueError(
            "both gender classes need at least one row to compute centroids"
        )

    # Slice assignment replaces the contents while keeping the same list
    # objects, so existing references to the lists stay valid.
    for targetMap, samples in ((femaleHeightWeightMap, negativeSamples),
                               (maleHeightWeightMap, positiveSamples)):
        targetMap['height'][:] = samples['height(cm)'].tolist()
        targetMap['weight'][:] = samples['weight(kg)'].tolist()

    # skipna=False keeps a missing measurement visible as a NaN centroid
    # instead of silently dropping it from the mean.
    positiveSamplesCentroid = positiveSamples.mean(skipna=False).tolist()
    negativeSamplesCentroid = negativeSamples.mean(skipna=False).tolist()

    return positiveSamplesCentroid, negativeSamplesCentroid
    

In [6]:
def classify(w, b, dataFrameTraining):
    """Label every row of a frame with the linear rule ``w . x + b > 0``.

    Parameters
    ----------
    w : sequence of two numbers
        Coefficients applied to the ``height(cm)`` and ``weight(kg)``
        columns, in that order.
    b : number
        Offset added to the weighted sum.
    dataFrameTraining : pandas.DataFrame
        Frame with ``height(cm)`` and ``weight(kg)`` columns. Despite the
        parameter name, any frame with those two columns can be passed.

    Returns
    -------
    list of int
        One label per row, in row order: ``1`` where the score is strictly
        positive, ``-1`` otherwise (including a score of exactly zero).
    """
    # Evaluate the score for all rows at once instead of iterating row by row.
    scores = (w[0] * dataFrameTraining['height(cm)']
              + w[1] * dataFrameTraining['weight(kg)']) + b

    return np.where((scores > 0).to_numpy(), 1, -1).tolist()
        

In [7]:
def getHyperPlaneCoordinates(w, b, dataFrameTraining):
    """Return points on the line ``w[0]*height + w[1]*weight + b = 0``.

    For every height in the frame, the matching weight on the line is
    ``-(w[0] / w[1]) * height - b / w[1]``.

    Parameters
    ----------
    w : sequence of two numbers
        Line coefficients for height and weight. ``w[1]`` is used as a
        divisor, so it must be non-zero for the result to be finite.
    b : number
        Line offset.
    dataFrameTraining : pandas.DataFrame
        Frame whose ``height(cm)`` column supplies the x coordinates.

    Returns
    -------
    tuple of (list, list)
        ``(height, weight)``: the heights in row order and the corresponding
        weights on the line.
    """
    # Slope and intercept do not depend on the row, so compute them once.
    slope = -(w[0] / w[1])
    intercept = b / w[1]

    heights = dataFrameTraining['height(cm)']
    weights = slope * heights - intercept

    return heights.tolist(), weights.tolist()
    

In [8]:
# Nearest-centroid rule written as a linear classifier. A point x is closer
# to the positive centroid than to the negative one when w.x + b > 0, with
#   w = 2 * (c_pos - c_neg)   and   b = |c_neg|^2 - |c_pos|^2
positiveCentroid, negativeCentroid = getClusterCentroid(dataFrameTraining)

w = [2 * (pos - neg) for pos, neg in zip(positiveCentroid, negativeCentroid)]

b = sum(math.pow(coord, 2) for coord in negativeCentroid) - \
    sum(math.pow(coord, 2) for coord in positiveCentroid)

# Boundary points for plotting, and the predicted labels on the training set.
height, weight = getHyperPlaneCoordinates(w, b, dataFrameTraining)
classificationResult = classify(w, b, dataFrameTraining)

In [9]:
# Scatter both classes and overlay the linear decision boundary.
p = figure(
    x_axis_label='height(cm)',
    y_axis_label='weight(kg)',
    width=800,
    height=500,
    title='Relation between Male and Female',
)

# Use `legend_label` for a fixed legend entry; the bare `legend=` glyph
# keyword is deprecated in Bokeh.
source = ColumnDataSource(data=maleHeightWeightMap)
p.circle(x='height', y='weight', source=source, color="blue", legend_label="Male")
source = ColumnDataSource(data=femaleHeightWeightMap)
p.circle(x='height', y='weight', source=source, color="green", legend_label="Female")

# The boundary coordinates are plain lists, so no data source is needed.
p.line(x=height, y=weight, line_width=2, color="black")
show(p)

In [10]:
# Evaluate the fitted rule on the test frame.
testResults = classify(w, b, dataFrameTesting)

# classify() returns one label per row in row order, so pair predictions with
# the true labels by position. Indexing testResults with the DataFrame index
# label is only correct when the index happens to be 0..n-1.
correctClassificationCount = sum(
    int(predicted == actual)
    for predicted, actual in zip(testResults, dataFrameTesting['gender'])
)

print("Test Accuracy(%):", (correctClassificationCount / len(testResults)) * 100)
        

Test Accuracy(%): 75.55555555555556
