In [13]:
### Perform a t-test to compare the sepal length iris species. 
## Calculate a 95% confidence interval for the petal length of the setosa species.
## Compare the main sepal width between versicolor and virginica species using a hypothesis
## Compare the correlation matrix between all pairs of freatures and visualize it. 

In [14]:
## First things first - import the libraries 

import pandas as pd # Tables (DataFrame) 
import numpy as np # basic math 
from scipy import stats # t-test & t critical value
from sklearn.datasets import load_iris # Iris Dataset
import matplotlib.pyplot as plt # data visualization

In [15]:
### Load the data 

iris = load_iris()
df = pd.DataFrame(iris.data, columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"])
df["species"] = pd.Categorical.from_codes(iris.target,iris.target_names) # add species names 

In [16]:
### T-test - compare sepal length between 2 species (Pick any two species setosa vs Versicolor) 

a = df.loc[df["species"]=="setosa", "sepal_length"] # Group A values 
b = df.loc[df["species"]=="versicolor", "sepal_length"] # basic B values 
t_stat, p_val = stats.ttest_ind(a, b, equal_var=True) # basic t-test with equal variance
print("Sepal length - setosa vs versicolor t-test")
print(" t=", round(t_stat,3), " p=", p_val) 

Sepal length - setosa vs versicolor t-test
 t= -10.521  p= 8.985235037487079e-18


In [17]:
### 95% CI for petal length of setosa 

x = df.loc[df["species"]=="setosa", "petal_length"] # data for CI 
n = x.size # Sample size 
m = x.mean() # mean 
sd = x.std(ddof=1) # standard deviation 
se = sd / np.sqrt(n) # Standard error
t_stat = stats.t.ppf(0.975, df=n-1) # 95% two sided critical value 
ci_low, ci_high = m - t_stat*se, m + t_stat*se 
print("Setosa petal length - 95% CI: ", (round(ci_low, 3), round(ci_high), 3))

Setosa petal length - 95% CI:  (1.413, 2, 3)


In [21]:
### Compare the mean sepal width between versicolor and virginica species using a hypothesis test. 
# Another t-test

v1 = df.loc[df["species"]=="versicolor", "sepal_width"] # Group 1 
v2 = df.loc[df["species"]=="virginica", "sepal_width"] # Group 2
t_sw, p_sw = stats.ttest_ind(v1, v2, equal_var=True)
print("Sepal width - versicolor vs virginica t-test")
print(" t=", round(t_sw, 3), " p", p_sw)

Sepal width - versicolor vs virginica t-test
 t= -3.206  p 0.0018191004238894803


In [22]:
### Compute the correlation matrix between all pairs of features and visualize it.

cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
corr = df[cols].corr(numeric_only=True) # Pearson correlation 
print("\nCorrelation matrix:\n", corr.round(3))


Correlation matrix:
               sepal_length  sepal_width  petal_length  petal_width
sepal_length         1.000       -0.118         0.872        0.818
sepal_width         -0.118        1.000        -0.428       -0.366
petal_length         0.872       -0.428         1.000        0.963
petal_width          0.818       -0.366         0.963        1.000


In [None]:
### Visualization (heat map) 

plt.figure(figsize=(5,4))
plt.imshow(corr, interpolation="nearest") # draw matrix 
plt.colorbar() # adds color 
plt.title("Iris Feature Correlation (Pearson)")
plt.xticks(range(len(cols)), cols, rotation=45, ha="right") 
plt.yticks(range(len(cols)), cols)
plt