In [1]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
import seaborn as sns

In [2]:
# Load the dataset from a CSV file; the path is relative to the working directory.
df = pd.read_csv('animals-dataset.csv')

In [5]:
# Preview the first rows of the frame (at most 15).
df.head(15)

Unnamed: 0,Region,Stage,Individual ID,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,Anvers,"Adult, 1 Egg Stage",N1A1,39.1,18.7,181.0,3750.0,,,Not enough blood for isotopes.
1,Anvers,"Adult, 1 Egg Stage",N1A2,39.5,17.4,186.0,3800.0,8.94956,-24.69454,
2,Anvers,"Adult, 1 Egg Stage",N2A1,40.3,18.0,195.0,3250.0,8.36821,-25.33302,
3,Anvers,"Adult, 1 Egg Stage",N2A2,,,,,,,Adult not sampled.
4,Anvers,"Adult, 1 Egg Stage",N3A1,36.7,19.3,193.0,3450.0,8.76651,-25.32426,
5,Anvers,"Adult, 1 Egg Stage",N3A2,39.3,20.6,190.0,3650.0,8.66496,-25.29805,
6,Anvers,"Adult, 1 Egg Stage",N4A1,38.9,17.8,181.0,3625.0,9.18718,-25.21799,Nest never observed with full clutch.
7,Anvers,"Adult, 1 Egg Stage",N4A2,39.2,19.6,195.0,4675.0,9.4606,-24.89958,Nest never observed with full clutch.
8,Anvers,"Adult, 1 Egg Stage",N5A1,34.1,18.1,193.0,3475.0,,,No blood sample obtained.
9,Anvers,"Adult, 1 Egg Stage",N5A2,42.0,20.2,190.0,4250.0,9.13362,-25.09368,No blood sample obtained for sexing.


In [11]:
# Print the distinct values of each inspected column, one array per column.
for column_name in ('Region', 'Stage', 'Individual ID', 'Culmen Length (mm)'):
    print(df[column_name].unique())

['Anvers']
['Adult, 1 Egg Stage']
['N1A1' 'N1A2' 'N2A1' 'N2A2' 'N3A1' 'N3A2' 'N4A1' 'N4A2' 'N5A1' 'N5A2'
 'N6A1' 'N6A2' 'N7A1' 'N7A2' 'N8A1' 'N8A2' 'N9A1' 'N9A2' 'N10A1' 'N10A2'
 'N11A1' 'N11A2' 'N12A1' 'N12A2' 'N13A1' 'N13A2' 'N17A1' 'N17A2' 'N18A1'
 'N18A2' 'N21A1' 'N21A2' 'N22A1' 'N22A2' 'N23A1' 'N23A2' 'N24A1' 'N24A2'
 'N25A1' 'N25A2' 'N26A1' 'N26A2' 'N27A1' 'N27A2' 'N28A1' 'N28A2' 'N29A1'
 'N29A2' 'N30A1' 'N30A2' 'N32A1' 'N32A2' 'N34A1' 'N34A2' 'N35A1' 'N35A2'
 'N36A1' 'N36A2' 'N37A1' 'N37A2' 'N38A1' 'N38A2' 'N39A1' 'N39A2' 'N40A1'
 'N40A2' 'N41A1' 'N41A2' 'N42A1' 'N42A2' 'N44A1' 'N44A2' 'N45A1' 'N45A2'
 'N46A1' 'N46A2' 'N48A1' 'N48A2' 'N49A1' 'N49A2' 'N50A1' 'N50A2' 'N47A1'
 'N47A2' 'N51A1' 'N51A2' 'N53A1' 'N53A2' 'N55A1' 'N55A2' 'N58A1' 'N58A2'
 'N60A1' 'N60A2' 'N61A1' 'N61A2' 'N63A1' 'N63A2' 'N64A1' 'N64A2' 'N66A1'
 'N66A2' 'N67A1' 'N67A2' 'N69A1' 'N69A2' 'N71A1' 'N71A2' 'N72A1' 'N72A2'
 'N73A1' 'N73A2' 'N76A1' 'N76A2' 'N77A1' 'N77A2' 'N78A1' 'N78A2' 'N79A1'
 'N79A2' 'N80A1' 'N

In [12]:
# Print a summary of the frame: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Region               344 non-null    object 
 1   Stage                344 non-null    object 
 2   Individual ID        344 non-null    object 
 3   Culmen Length (mm)   342 non-null    float64
 4   Culmen Depth (mm)    342 non-null    float64
 5   Flipper Length (mm)  342 non-null    float64
 6   Body Mass (g)        342 non-null    float64
 7   Delta 15 N (o/oo)    330 non-null    float64
 8   Delta 13 C (o/oo)    331 non-null    float64
 9   Comments             26 non-null     object 
dtypes: float64(6), object(4)
memory usage: 27.0+ KB


In [13]:
# (number of rows, number of columns) of the frame.
df.shape

(344, 10)

In [14]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

Region                   0
Stage                    0
Individual ID            0
Culmen Length (mm)       2
Culmen Depth (mm)        2
Flipper Length (mm)      2
Body Mass (g)            2
Delta 15 N (o/oo)       14
Delta 13 C (o/oo)       13
Comments               318
dtype: int64

In [15]:
# Drop the four non-numeric (object-dtype) columns so that only the numeric
# measurement columns remain. 'Individual ID' and 'Comments' are identifier /
# free-text fields rather than categories, so they are removed, not encoded.
# Reassigning (instead of inplace=True) avoids mutating the frame in place.
df = df.drop(columns=['Region', 'Stage', 'Individual ID', 'Comments'])

In [16]:
# Drop every ROW that has a missing value in any column: DataFrame.dropna()
# defaults to axis=0 (rows) and how='any', so no columns are removed here.
# The surviving rows keep their original index labels, so the index is no
# longer contiguous afterwards.
df = df.dropna()

In [18]:
# Preview the first five rows of the frame (head() defaults to n=5).
df.head()

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo)
1,39.5,17.4,186.0,3800.0,8.94956,-24.69454
2,40.3,18.0,195.0,3250.0,8.36821,-25.33302
4,36.7,19.3,193.0,3450.0,8.76651,-25.32426
5,39.3,20.6,190.0,3650.0,8.66496,-25.29805
6,38.9,17.8,181.0,3625.0,9.18718,-25.21799


In [21]:
# Distinct culmen lengths, listed in order of first appearance.
print(pd.unique(df['Culmen Length (mm)']))

[39.5 40.3 36.7 39.3 38.9 39.2 42.  37.8 34.6 38.7 42.5 34.4 46.  37.7
 35.9 38.2 38.8 35.3 40.6 40.5 37.9 37.2 40.9 36.4 42.2 37.6 36.5 36.
 44.1 37.  39.6 42.3 40.1 35.  34.5 41.4 39.  35.7 41.3 41.1 41.6 35.5
 41.8 33.5 39.7 45.8 42.8 36.2 42.1 42.9 35.1 37.3 36.3 36.9 38.3 34.
 40.8 38.1 33.1 43.2 41.  38.6 45.6 42.7 40.2 35.2 41.5 38.5 43.1 36.8
 37.5 35.6 32.1 40.7 36.6 46.5 50.  51.3 45.4 52.7 45.2 46.1 46.6 51.7
 47.  52.  45.9 50.5 50.3 58.  46.4 49.2 42.4 48.5 50.6 46.7 49.5 52.8
 54.2 51.  49.7 47.5 47.6 46.9 53.5 49.  46.2 50.9 45.5 50.8 50.1 51.5
 49.8 48.1 51.4 45.7 50.7 52.2 49.3 50.2 46.8 55.8 43.5 49.6 48.7 43.3
 48.4 45.1 46.3 44.5 47.8 48.2 59.6 49.1 42.6 44.4 44.  45.3 43.6 44.9
 45.  43.8 50.4 54.3 47.7 48.6 51.1 52.5 47.4 43.4 52.1 49.4 55.9 47.2
 47.3 41.7 53.4 55.1 48.8 49.9]
