In [138]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

In [139]:
# Load dataset file
dataset = pd.read_csv("penguins.csv")

In [140]:
# Check if the dataset instances are properly distributed
print(dataset.groupby('sex').size())

sex
.           1
FEMALE    165
MALE      168
dtype: int64


In [141]:
#Data is not equal. I.e there are 165 Female species and 168 male species. While 1 has no gender
#We convert the to female that has no gender
dataset['sex'] = dataset['sex'].replace(['.'],'FEMALE')

In [142]:
# Now we have just 2 genders. Male and Female. But not equal number of records. We remove 2 male species
print(dataset.groupby('species').size())

species
Adelie       152
Chinstrap     68
Gentoo       124
dtype: int64


In [143]:
#Drop records where sex value is null
dataset=dataset.dropna(subset=["sex"]) #In data, if sex=NA is removed, then no null column is left
# Now again see the number of records
print(dataset.groupby('sex').size())

sex
FEMALE    166
MALE      168
dtype: int64


In [144]:
dataset.head(20)
#No attribute is Null
dataset.isnull().any()

species              False
island               False
culmen_length_mm     False
culmen_depth_mm      False
flipper_length_mm    False
body_mass_g          False
sex                  False
dtype: bool

In [588]:
# Create input (X) and output_labels (Y) vectors
feature_columns = ['culmen_length_mm','culmen_depth_mm','flipper_length_mm','body_mass_g']
#culmen_length_mm	culmen_depth_mm	flipper_length_mm	body_mass_g

X = dataset[feature_columns].values
y = dataset['species'].values


In [589]:
# Apply numerical encoding to convert alphabetical names
le = LabelEncoder()
y = le.fit_transform(y)

In [590]:
# Divide the dataset in testing and training vectors for cross-validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [591]:
# Instantiate learning model (k = 3)
classifier = KNeighborsClassifier(n_neighbors=7)

In [592]:
# Fitting the model
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=7)

In [593]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [594]:
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.svm import SVC
#from sklearn.linear_model import LogisticRegression





classifier = KNeighborsClassifier(n_neighbors=5)
#classifier = LinearDiscriminantAnalysis()
#classifier = DecisionTreeClassifier()
#classifier = SVC(gamma = 'auto')
#classifier = LogisticRegression()



# Fitting the model
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)


In [595]:
# Create and view the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[46  1  3]
 [13  5  0]
 [ 4  0 29]]


In [596]:
# View the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)*100
print('Accuracy of our model is equal ' + str(round(accuracy, 2)) + ' %.')

Accuracy of our model is equal 79.21 %.


In [None]:
#If we classify the penguins with 20% data for testing and 80% for training and keeping n=3 by following attributes,
#then accuracy for each tril is

#culmen_length_mm=76.12%
#culmen_depth_mm=79.1%
#flipper_length_mm=79.1%
#body_mass_g=73.13%
'''
CL+CD=97.01%
CL+FL=97.01%
CL+BM=80.6%
CD+FL=83.58%
CD+BM=70.15%
FL+BM=76.12%
CL+CD+FL=97.01%
CL+CD+BM=82.09%
CL+FL+BM=82.09%
CD+FL+BM=76.12%
CL+CD+FL+BM=83.58%
'''
#Where 
'''
CL=cumen_length_mm
CD=cumen_depth_mm
FL=flipper_length_mm
BM=body_mass_g
'''


In [None]:
#If we classify the penguins with 30% data for testing and 70% for training and keeping n=5 by following attributes,
#then accuracy for each tril is

#culmen_length_mm=73.27%
#culmen_depth_mm=76.24%
#flipper_length_mm=80.2%
#body_mass_g=72.28%
'''
CL+CD=97.03%
CL+FL=97.03%
CL+BM=78.22%
CD+FL=82.18%
CD+BM=70.3%
FL+BM=73.27%
CL+CD+FL=97.03%
CL+CD+BM=78.22%
CL+FL+BM=78.22%
CD+FL+BM=74.26%
CL+CD+FL+BM=79.21%
'''
#Where 
'''
CL=cumen_length_mm
CD=cumen_depth_mm
FL=flipper_length_mm
BM=body_mass_g
'''

In [597]:
#If we classify the penguins with 30% data for testing and 70% for training and keeping n=7 by following attributes,
#then accuracy for each tril is

#culmen_length_mm=73.27%
#culmen_depth_mm=76.24%
#flipper_length_mm=80.2%
#body_mass_g=72.28%
'''
CL+CD=97.03%
CL+FL=97.03%
CL+BM=78.22%
CD+FL=82.18%
CD+BM=70.3%
FL+BM=73.27%
CL+CD+FL=97.03%
CL+CD+BM=78.22%
CL+FL+BM=78.22%
CD+FL+BM=74.26%
CL+CD+FL+BM=79.21%
'''
#Where 
'''
CL=cumen_length_mm
CD=cumen_depth_mm
FL=flipper_length_mm
BM=body_mass_g
'''

'\nCL+CD=97.03%\nCL+FL=97.03%\nCL+BM=78.22%\nCD+FL=82.18%\nCD+BM=70.3%\nFL+BM=73.27%\nCL+CD+FL=97.03%\nCL+CD+BM=78.22%\nCL+FL+BM=78.22%\nCD+FL+BM=74.26%\nCL+CD+FL+BM=\n'