In [52]:
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score, pair_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, davies_bouldin_score, calinski_harabasz_score, precision_score, recall_score, PrecisionRecallDisplay, make_scorer
from sklearn.model_selection import train_test_split, validation_curve, GridSearchCV, learning_curve, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn_som.som import SOM
from permetrics import ClusteringMetric

plt.style.use('default')
              

# %run DataCleaningFunctions.ipynb

In [53]:
data = pd.read_csv("https://raw.githubusercontent.com/cdavidshaffer/CPSC4970-AI/master/data/penguins.csv")
data.info()
data.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   species            344 non-null    object
 1   island             344 non-null    object
 2   culmen_length_mm   344 non-null    object
 3   culmen_depth_mm    344 non-null    object
 4   flipper_length_mm  344 non-null    object
 5   body_mass_g        344 non-null    object
 6   sex                344 non-null    object
dtypes: object(7)
memory usage: 18.9+ KB


Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,?,?,?,?,?
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,FEMALE
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,MALE
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,?
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,?


<h2><u>Data Cleaning</u></h2>

In [54]:
# check text column unique values
textual_columns = ['species', 'island', 'sex']
for column in textual_columns:
        column_values = data[column].unique()
        print(f"{column.title()} Values: ", *column_values)

Species Values:  Adelie Chinstrap Gentoo
Island Values:  Torgersen Biscoe Dream
Sex Values:  MALE FEMALE ? _


In [55]:
# Replace invalid text entries
replace_characters = ['?', '_']

for char in replace_characters:
    data['sex'].replace(char, None, inplace=True)
    
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   species            344 non-null    object
 1   island             344 non-null    object
 2   culmen_length_mm   344 non-null    object
 3   culmen_depth_mm    344 non-null    object
 4   flipper_length_mm  344 non-null    object
 5   body_mass_g        344 non-null    object
 6   sex                333 non-null    object
dtypes: object(7)
memory usage: 18.9+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['sex'].replace(char, None, inplace=True)


In [56]:
# convert text columns to string type
for i in textual_columns:
    data[i] = data[i].astype('string')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   species            344 non-null    string
 1   island             344 non-null    string
 2   culmen_length_mm   344 non-null    object
 3   culmen_depth_mm    344 non-null    object
 4   flipper_length_mm  344 non-null    object
 5   body_mass_g        344 non-null    object
 6   sex                333 non-null    string
dtypes: object(4), string(3)
memory usage: 18.9+ KB


In [57]:
# clean numerical columns from non-numerical values
numerical_columns = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
for column in numerical_columns:
        data[column] = pd.to_numeric(data[column], errors='coerce')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    string 
 1   island             344 non-null    string 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    string 
dtypes: float64(4), string(3)
memory usage: 18.9 KB


In [58]:
# Drop all rows with missing values
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    string 
 1   island             333 non-null    string 
 2   culmen_length_mm   333 non-null    float64
 3   culmen_depth_mm    333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    string 
dtypes: float64(4), string(3)
memory usage: 20.8 KB


In [59]:
categories = []
for column in textual_columns:
    categories.append(data[column].unique())

In [60]:
X = data.drop("species", axis=1)
y = data["species"]

In [61]:
total = len(y)
print(f"Total: {total}")
penguins = y.unique()
for penguin in penguins:
    species = y[y == penguin]
    count = species.value_counts().sum()
    species_fraction = count / total
    print(f"Number of {penguin}'s: ", count)
    print(f"Fraction of {penguin}'s: ", "{:.2f}%".format(species_fraction * 100))

Total: 333
Number of Adelie's:  146
Fraction of Adelie's:  43.84%
Number of Chinstrap's:  68
Fraction of Chinstrap's:  20.42%
Number of Gentoo's:  119
Fraction of Gentoo's:  35.74%


<h3><u>Data Preparation</u></h3>

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
display(X_train.shape) 
display(X_test.shape)
display(y_train.shape)
display(y_test.shape)

(266, 6)

(67, 6)

(266,)

(67,)

In [63]:
clf_metrics = {'Accuracy': 'accuracy', 'Precision (Micro)': 'precision_micro', 'Recall (Micro)': 'recall_micro'}
vc_dict = {}
lc_dict = {}
cvs_dict = {}

In [64]:
train_labels = y_train.unique()
test_labels = y_test.unique()

In [65]:
pd.set_option('display.max_columns', None)

In [66]:
numerical_cs = make_column_selector(dtype_include=np.number)
nonnumerical_cs = make_column_selector(dtype_exclude=np.number)
ct = ColumnTransformer(
    [("ss", StandardScaler(), numerical_cs),
     ("ohe", OneHotEncoder(handle_unknown='ignore'), nonnumerical_cs)]
)