In [1]:
import tkinter as tk
from tkinter import filedialog, messagebox
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('penguins.csv')
df

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,gender,body_mass_g
0,Adelie,39.1,18.7,181,male,3750
1,Adelie,39.5,17.4,186,female,3800
2,Adelie,40.3,18.0,195,female,3250
3,Adelie,39.6,17.7,186,female,3500
4,Adelie,36.7,19.3,193,female,3450
...,...,...,...,...,...,...
145,Chinstrap,50.8,18.5,201,male,4450
146,Chinstrap,50.1,17.9,190,female,3400
147,Chinstrap,49.0,19.6,212,male,4300
148,Chinstrap,51.5,18.7,187,male,3250


In [3]:
# label encoding
df['gender'] = df['gender'].astype('category')
df['gender'] = df['gender'].cat.codes
df['species'] = df['species'].astype('category')
df['species'] = df['species'].cat.codes

# removing nans
df['gender'][df['gender'] == -1] = df['gender'].max()
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gender'][df['gender'] == -1] = df['gender'].max()


Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,gender,body_mass_g
0,0,39.1,18.7,181,1,3750
1,0,39.5,17.4,186,0,3800
2,0,40.3,18.0,195,0,3250
3,0,39.6,17.7,186,0,3500
4,0,36.7,19.3,193,0,3450
...,...,...,...,...,...,...
145,1,50.8,18.5,201,1,4450
146,1,50.1,17.9,190,0,3400
147,1,49.0,19.6,212,1,4300
148,1,51.5,18.7,187,1,3250


In [4]:
df.insert(loc=1, column='bias', value=1)

In [5]:
C1 = df[df['species'] == 1]
C2 = df[df['species'] == 2]

C1_X = C1[['bias', 'bill_length_mm', 'flipper_length_mm']]
C1_D = C1['species']

C2_X = C2[['bias', 'bill_length_mm', 'flipper_length_mm']]
C2_D = C2['species']


In [6]:
C1_X_Train, C1_X_Test, C1_D_Train, C1_D_Test = train_test_split(C1_X, C1_D, test_size=0.4)
C2_X_Train, C2_X_Test, C2_D_Train, C2_D_Test = train_test_split(C2_X, C2_D, test_size=0.4)

In [7]:
print(f'C1_X_Train: {C1_X_Train.shape}')
print(f'C1_X_Test: {C1_X_Test.shape}')
print(f'C1_D_Train: {C1_D_Train.shape}')
print(f'C1_D_Test: {C1_D_Test.shape}')
print(f'C2_X_Train: {C2_X_Train.shape}')
print(f'C2_X_Test: {C2_X_Test.shape}')
print(f'C2_D_Train: {C2_D_Train.shape}')
print(f'C2_D_Test: {C2_D_Test.shape}')

C1_X_Train: (30, 3)
C1_X_Test: (20, 3)
C1_D_Train: (30,)
C1_D_Test: (20,)
C2_X_Train: (30, 3)
C2_X_Test: (20, 3)
C2_D_Train: (30,)
C2_D_Test: (20,)


In [8]:
X_Train = pd.concat([C1_X_Train, C2_X_Train]).to_numpy()
D_Train = pd.concat([C1_D_Train, C2_D_Train]).to_numpy()
X_Test = pd.concat([C1_X_Test, C2_X_Test]).to_numpy()
D_Test = pd.concat([C1_D_Test, C2_D_Test]).to_numpy()

In [9]:
W = np.random.random(size=(3, ))

In [10]:
W

array([0.98640626, 0.93721981, 0.0568443 ])

In [11]:
W.shape

(3,)

In [12]:
nb_epochs = 1000
eta = 0.001

In [13]:
f'old weights: {W}'

'old weights: [0.98640626 0.93721981 0.0568443 ]'

In [14]:
signum  = np.vectorize(lambda x: 1 if x > 0 else 0)

In [15]:
for i in range(nb_epochs):
    net = np.dot(X_Train, W)
    signum  = np.vectorize(lambda x: 1 if x > 0 else 0)
    y = signum(net)
    error = D_Train - y
    for x, e in zip(X_Train, error):
        W = W + eta * e * x

In [16]:
f'new weights: {W}'

'new weights: [  30.98640626 1420.33721981 6450.0568443 ]'

In [17]:
nb_correct = 0
for x, d in zip(X_Test, D_Test):
    net = np.dot(x, W)
    y = signum(net)
    if y == d:
        nb_correct +=1
accuracy = nb_correct / X_Test.shape[0]

In [18]:
(nb_correct, X_Test.shape[0])

(20, 40)

In [19]:
f'accuray: {accuracy}'

'accuray: 0.5'