In [None]:
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, pair_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Use matplotlib's built-in dark theme for every figure drawn after this point.
plt.style.use('dark_background')

# Execute the companion notebook inside this kernel's namespace.
# NOTE(review): the helpers called in the cells below (print_unique_values,
# replace_data_values, check_numerical_column_errors) and the SOM class are
# neither defined nor imported in this notebook, so they presumably come from
# this %run — confirm, and keep this line ahead of any cell that uses them.
%run DataCleaningFunctions.ipynb

In [None]:
# Raw penguins dataset, read straight from the course repository.
PENGUINS_CSV_URL = "https://raw.githubusercontent.com/cdavidshaffer/CPSC4970-AI/master/data/penguins.csv"

data = pd.read_csv(PENGUINS_CSV_URL)

# Column dtypes / non-null counts first, then a peek at the first ten rows.
data.info()
data.head(10)

<h2><u>Data Cleaning</u></h2>

In [None]:
# Text-valued columns of the dataset. Listing their distinct values makes
# placeholder / invalid entries easy to spot.
textual_columns = [
    'species',
    'island',
    'sex',
]
print_unique_values(textual_columns, data)

['Adelie' 'Chinstrap' 'Gentoo']
['Torgersen' 'Biscoe' 'Dream']
['MALE' 'FEMALE' '?' '_']


In [None]:
# Placeholder tokens that show up in 'sex' in place of a real value.
replace_characters = ['?', '_']

# Hand each placeholder to the cleaning helper with None as the replacement.
# NOTE(review): presumably the helper rewrites `data` in place so these cells
# become missing values — confirm against DataCleaningFunctions.
for placeholder in replace_characters:
    replace_data_values(data, 'sex', placeholder, None)

data.info()

In [None]:
# Cast every text column to pandas' dedicated 'string' dtype in one call
# (all other columns keep their current dtype).
data = data.astype({column: 'string' for column in textual_columns})
data.info()

In [None]:
# Measurement columns that must contain numbers only.
numerical_columns = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# NOTE(review): the helper's effect is not visible here; presumably it turns
# non-numerical entries into missing values so the dropna further down removes
# them — confirm against DataCleaningFunctions.
check_numerical_column_errors(data, numerical_columns)
data.info()

In [None]:
# Keep only complete rows: a row is removed as soon as any one of its
# columns is missing.
data = data.dropna(axis=0, how='any')
data.info()

<h3><u>Data Preparation</u></h3>

In [None]:
# Features are every column except the species label; the label is the target.
target_column = "species"
X = data.drop(columns=target_column)
y = data[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Show the shape of each part, in the order: X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    display(split_part.shape)

(266, 6)

(67, 6)

(266,)

(67,)

In [None]:
# Numerical measurements of each split, followed by their summary statistics
# (train first, then test).
X_train_numerical = X_train.loc[:, numerical_columns]
X_test_numerical = X_test.loc[:, numerical_columns]
display(X_train_numerical.describe(), X_test_numerical.describe())

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
count,266.0,266.0,266.0,266.0
mean,44.497744,17.134962,201.808271,4253.289474
std,5.36593,2.020036,13.755167,793.77055
min,34.0,13.1,172.0,2700.0
25%,39.725,15.3,190.25,3650.0
50%,45.35,17.45,198.0,4100.0
75%,49.075,18.7,214.0,4850.0
max,59.6,21.5,230.0,6300.0


Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
count,67.0,67.0,67.0,67.0
mean,41.98806,17.283582,197.626866,4023.507463
std,5.451731,1.762109,14.638923,829.976732
min,32.1,13.7,174.0,2850.0
25%,37.7,16.25,187.5,3375.0
50%,41.3,17.2,193.0,3800.0
75%,45.75,18.5,207.5,4612.5
max,54.3,21.1,231.0,5950.0


In [None]:
# Non-numerical part of each split: every feature column that is not listed
# in numerical_columns.
X_train_nonnumerical = X_train.drop(columns=numerical_columns)
X_test_nonnumerical = X_test.drop(columns=numerical_columns)
display(X_train_nonnumerical.describe())
display(X_test_nonnumerical.describe())

# The numerical part (X_train_numerical / X_test_numerical) is built and
# summarised in the preceding cell. The copy-pasted repeat that used to sit
# here only recomputed and re-displayed the same frames, so it was dropped.

<h3><u>Feature Engineering (SOM)</u></h3>

In [None]:
# Plain arrays of the numerical features, taken once and reused below.
som_train_input = X_train_numerical.values
som_test_input = X_test_numerical.values

# NOTE(review): the features are passed unscaled. body_mass_g (std ~794) is
# on a far larger scale than culmen_depth_mm (std ~2), so if SOM is
# distance-based the grid is likely driven almost entirely by body mass.
# Consider a StandardScaler fitted on the training split — confirm.

# 5x5 grid (25 nodes); the map is fitted on the training rows only and then
# used to label both splits with a node id.
som = SOM(m=5, n=5, dim=som_train_input.shape[1], random_state=42)
som.fit(som_train_input)
som_train_predictions = som.predict(som_train_input)
som_test_predictions = som.predict(som_test_input)

print(som_train_predictions, som_test_predictions, sep="\n")

[14 16  3 17 23  9 19  8 18 23  1  6  4  5 18 15  6  4  8 23  8 13 10  4
  4 24 23 24 10 14 24 12  7 15  9 14  0 13  6  4  4  5  6 21 13 17  6 18
  6 16  2 22 10 23 16  8  0 17  8 21 22 18  0 21 18  0  6 21 21  4  7  8
  6  5  9 10 14 15  8  0  4  9 19  1 13 23 19  9 24 19 16 15 15 23  4  8
  8  5  9 14  7  4 15  5 17  4 16 14  4 13 15 16  0 14 18 14  4 18 20 14
  5  9 23 13 18 23  6 24 13 21 23 16  5 14 24  9 17 14  4 16 17 24 10 18
  3 23 21 23  4  5  9  6  3 14  3  6 10  4  3  1 14  7 21 23 18 12  6 24
  9  8  5  8 16 13 16  3  7  3  0 21 22  8 16 14  9  4 21 21  4  5 10 17
  0  2 17 13  8 19 19 14 19  0 12 14 19  4  4 18  5 24  7  4  6  2 13  6
 17  6 17  5 14  8 19 20  3 13 16 16  2  0  4 10  4  9 21 22  6  5  9  0
 17 20 21  8  3 22 11  4 19  3  4  8  5 15 23 22 19 13 18  3 17 22 23 15
  8 15]
[15  8 24  6 24 14  4  0  1 23 14 19 13  5 19 10 21  5 17 19 12  6  4 24
  0 21  9 14  6  4 21  4 19 22  4 13 17 23  5 21 16  5 10 23 16 22  3 18
 23  8  6  4 14  0 23  3 15 19 23 12  4  4 

In [None]:
SOM_CATEGORY_COLUMN = 'SOM Category'


def append_som_category(categorical_frame, som_labels):
    """Return a new frame holding ``categorical_frame``'s columns plus the SOM label.

    Parameters
    ----------
    categorical_frame : pandas.DataFrame
        Non-numerical feature columns of one split.
    som_labels : numpy.ndarray
        One SOM node id per row of ``categorical_frame`` (1-D).

    Returns
    -------
    pandas.DataFrame
        Same columns as the input followed by 'SOM Category'. The frame is
        rebuilt from a plain array, so it gets a fresh 0..n-1 index rather
        than the input's row index.
    """
    combined = np.append(categorical_frame, som_labels.reshape(-1, 1), axis=1)
    return pd.DataFrame(
        combined,
        columns=categorical_frame.columns.tolist() + [SOM_CATEGORY_COLUMN],
    )


X_train_nonnumerical_som = append_som_category(X_train_nonnumerical, som_train_predictions)
X_test_nonnumerical_som = append_som_category(X_test_nonnumerical, som_test_predictions)

display(X_train_nonnumerical_som.shape)
display(X_test_nonnumerical_som.shape)
display(X_train_nonnumerical_som.columns.tolist())
display(X_test_nonnumerical_som.columns.tolist())
display(X_train_nonnumerical_som.head(5))
display(X_test_nonnumerical_som.head(5))

# One-hot encode every column (island, sex and the SOM node id).
# handle_unknown='ignore' matters for the SOM column: a node id that appears
# only in the test split would otherwise make transform() raise; with
# 'ignore' such a row simply gets all zeros for that feature. When every
# test category was seen during fit the output is unchanged.
preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), X_train_nonnumerical_som.columns.tolist()),
    remainder='passthrough'
)
X_train_processed = preprocessor.fit_transform(X_train_nonnumerical_som)
X_test_processed = preprocessor.transform(X_test_nonnumerical_som)
display(X_train_processed.shape)
display(X_test_processed.shape)


(266, 3)

(67, 3)

['island', 'sex', 'SOM Category']

['island', 'sex', 'SOM Category']

Unnamed: 0,island,sex,SOM Category
0,Biscoe,FEMALE,14
1,Dream,FEMALE,16
2,Biscoe,MALE,3
3,Biscoe,FEMALE,17
4,Biscoe,MALE,23


Unnamed: 0,island,sex,SOM Category
0,Dream,FEMALE,15
1,Biscoe,FEMALE,8
2,Torgersen,MALE,24
3,Dream,FEMALE,6
4,Biscoe,MALE,24


(266, 30)

(67, 30)