In [24]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# Read the weight/height CSV (path is relative to the notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/weight-height.csv')

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [6]:
# Turn the textual Gender labels into integer codes the classifiers can use.
# The encoded values are stored in a new SexCode column; the preview that
# follows shows 'Male' rows encoded as 1 (presumably 'Female' is 0 — confirm
# with label_encoder.classes_).
label_encoder = preprocessing.LabelEncoder()
df['SexCode'] = label_encoder.fit_transform(y=df['Gender'])

In [7]:
# Preview the first five rows again to confirm the SexCode column was added.
df.head(n=5)

Unnamed: 0,Gender,Height,Weight,SexCode
0,Male,73.847017,241.893563,1
1,Male,68.781904,162.310473,1
2,Male,74.110105,212.740856,1
3,Male,71.730978,220.04247,1
4,Male,69.881796,206.349801,1


In [15]:
# Select the feature columns and the target class by name.
# Features: Height and Weight. Target: SexCode, the label-encoded Gender.
# Selecting by name rather than by position keeps the selection correct even
# if the column order changes (for example, if the CSV gains an index column).
x = df[['Height', 'Weight']].values
y = df['SexCode']

In [30]:
# Hold out a quarter of the rows (test_size=0.25) to measure model accuracy.
# The fixed random_state makes the shuffle, and so the split, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [29]:
# Instantiate the four classifiers to compare.
# random_state pins the decision tree: scikit-learn randomly permutes the
# candidate features at each split, so an unseeded tree can report a slightly
# different accuracy on every run even though the train/test split is seeded.
tree_model = DecisionTreeClassifier(random_state=0)
svm_model = SVC(gamma='auto')
kn_model = KNeighborsClassifier()
lg_model = LogisticRegression()

In [31]:
# Train every classifier on the same training split.
tree_model.fit(X=x_train, y=y_train)
svm_model.fit(X=x_train, y=y_train)
kn_model.fit(X=x_train, y=y_train)
lg_model.fit(X=x_train, y=y_train)

In [32]:
# Ask each fitted classifier to predict the target class of the held-out rows.
# The results are unpacked in the same order as the models are listed.
tree_prediction, svm_prediction, kn_prediction, lg_prediction = [
    clf.predict(x_test)
    for clf in (tree_model, svm_model, kn_model, lg_model)
]

In [33]:
# Score each classifier: the fraction of held-out rows whose predicted class
# matches the true class.
tree_result = accuracy_score(y_true=y_test, y_pred=tree_prediction)
svm_result = accuracy_score(y_true=y_test, y_pred=svm_prediction)
kn_result = accuracy_score(y_true=y_test, y_pred=kn_prediction)
lg_result = accuracy_score(y_true=y_test, y_pred=lg_prediction)

In [34]:
# Report the test-set accuracy of each classifier, one per line.
print(f"tree :{tree_result}")
print(f"svm :{svm_result}")
print(f"kn :{kn_result}")
print(f"lg :{lg_result}")

tree :0.8708
svm :0.918
kn :0.9016
lg :0.9184
