In [16]:
import pandas as pd
import math
import numpy as np
# Imported packages/modules.

In [17]:
# Load the petal and sepal tables. index_col=0 turns each file's first column
# (the extra index column saved in the files) into the DataFrame index
# instead of keeping it as a data column.
petal_df, sepal_df = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("Petal_Data.csv", "Sepal_Data.csv")
)
# Preview the first rows of the sepal table to review the loaded data.
sepal_df.head()

Unnamed: 0,sample_id,species,sepal_length,sepal_width
0,kzkn5q6TioixrLl,virginica,6.478159,3.035585
1,YkBvRsNxkaMuaqN,versicolor,5.520372,3.006424
2,RCpoVUsSf1lcnsx,virginica,5.609393,3.173603
3,bRJfblUTBih4vXZ,virginica,7.042214,2.808181
4,7eMPKj06G5OvZMj,virginica,6.996548,3.02257


In [18]:
# List the column names of both tables; useful when diagnosing a KeyError
# raised by a column lookup.
for heading, frame in (("Petal columns:", petal_df), ("Sepal columns:", sepal_df)):
    print(heading, frame.columns.tolist())

Petal columns: ['sample_id', 'species', 'petal_length', 'petal_width']
Sepal columns: ['sample_id', 'species', 'sepal_length', 'sepal_width']


In [19]:
# Build one table holding petal and sepal measurements per sample.
# The sepal table's "species" column is dropped before the join so the merged
# frame carries a single "species" column (the one from the petal table).
# Only sample_ids present in both tables survive the inner join.
combined_df = (
    petal_df
    .merge(sepal_df.drop(columns=["species"]), how="inner", on="sample_id")
    # Select the columns explicitly so the layout is fixed for later cells.
    [[
        "sample_id",
        "species",
        "petal_length",
        "petal_width",
        "sepal_length",
        "sepal_width",
    ]]
)

combined_df.head(10)

Unnamed: 0,sample_id,species,petal_length,petal_width,sepal_length,sepal_width
0,g4JZlAtiN4mBGyE,versicolor,4.589243,1.306969,5.844006,3.341825
1,BwWealRi00L4RH1,versicolor,4.836183,1.384244,5.919572,2.624173
2,ZdyiDoTAArjRj3X,virginica,5.51028,1.786967,6.081146,3.040232
3,0NqN8AaLZ8jelu^,setosa,1.434607,0.419694,4.036309,3.52745
4,8nqRlCsXKJjHafD,versicolor,3.95031,1.224275,6.885232,3.631972
5,otVBvvcynjp7UzN,setosa,1.506729,0.325962,5.401327,3.720574
6,fWIrmBqHQtz8xsF,virginica,5.604368,2.157243,6.456809,2.805747
7,ARwpVhHYWHt9wfY,setosa,1.614061,0.199143,4.903286,3.145444
8,F5F0rlPyu#7r6qZ,setosa,1.702897,0.095686,5.192556,3.526238
9,yi19vtGTuYKj4eZ,versicolor,4.268392,1.274064,6.822197,3.14183


In [20]:
# Used to see the 3 unique species names that are used.
combined_df['species'].unique()

array(['versicolor', 'virginica', 'setosa'], dtype=object)

In [38]:
# Pairwise correlation matrix of the four measurement columns, computed
# separately within each species group.
numeric_cols = [
    "petal_length",
    "petal_width",
    "sepal_length",
    "sepal_width",
]
corr_by_species = (
    combined_df
    .groupby(by="species")[numeric_cols]
    .corr(method="pearson")  # "pearson" is the pandas default, spelled out here
)
corr_by_species

Unnamed: 0_level_0,Unnamed: 1_level_0,petal_length,petal_width,sepal_length,sepal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
setosa,petal_length,1.0,0.219993,-0.003592,-0.290102
setosa,petal_width,0.219993,1.0,0.091356,0.036456
setosa,sepal_length,-0.003592,0.091356,1.0,0.117963
setosa,sepal_width,-0.290102,0.036456,0.117963,1.0
versicolor,petal_length,1.0,-0.241938,0.007745,-0.063534
versicolor,petal_width,-0.241938,1.0,-0.021792,0.06938
versicolor,sepal_length,0.007745,-0.021792,1.0,0.155452
versicolor,sepal_width,-0.063534,0.06938,0.155452,1.0
virginica,petal_length,1.0,0.291776,-0.024689,0.027446
virginica,petal_width,0.291776,1.0,-0.116856,0.042958


In [27]:
# Mean of every measurement column, one row per species.
avg_cols = [
    "petal_length",
    "petal_width",
    "sepal_length",
    "sepal_width",
]
avg_by_species = (
    combined_df
    .groupby(by="species")[avg_cols]
    .agg("mean")
)
avg_by_species

Unnamed: 0_level_0,petal_length,petal_width,sepal_length,sepal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,1.486527,0.250725,4.999635,3.475225
versicolor,4.29069,1.345846,5.942616,2.76161
virginica,5.475172,1.988239,6.354953,3.001511


In [28]:
# Median of every measurement column, one row per species.
med_cols = [
    "petal_length",
    "petal_width",
    "sepal_length",
    "sepal_width",
]
med_by_species = (
    combined_df
    .groupby(by="species")[med_cols]
    .agg("median")
)
med_by_species

Unnamed: 0_level_0,petal_length,petal_width,sepal_length,sepal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,1.464998,0.251564,4.977019,3.460304
versicolor,4.32191,1.372359,5.917317,2.747044
virginica,5.512922,1.954197,6.277034,3.019348


In [29]:
# Sample standard deviation of every measurement column, one row per species.
std_cols = [
    "petal_length",
    "petal_width",
    "sepal_length",
    "sepal_width",
]
std_by_species = (
    combined_df
    .groupby(by="species")[std_cols]
    .std(ddof=1)  # ddof=1 is the pandas default (sample std), spelled out here
)
std_by_species

Unnamed: 0_level_0,petal_length,petal_width,sepal_length,sepal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,0.207064,0.116201,0.32313,0.365322
versicolor,0.465222,0.185877,0.475391,0.385754
virginica,0.458861,0.242253,0.573233,0.325909
