In [9]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

ROOT = Path(".").resolve()
DATA = ROOT / "data"
IMAGES = ROOT / "images"
IMAGES.mkdir(exist_ok=True)
print("Paths OK:", DATA.exists(), IMAGES.exists())

Paths OK: True True


In [10]:
# Read the CSV in this repo and set column names
df = pd.read_csv(DATA / "iris.csv", header=None)
df.columns = ["sepal_length","sepal_width","petal_length","petal_width","class"]

# quick sanity check
print(df.shape)
df.head()

(151, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,sepal_length,sepal_width,petal_length,petal_width,class
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa


In [11]:
print("Shape:", df.shape)
print("\nClass counts:\n", df["class"].value_counts())
df.describe()

Shape: (151, 5)

Class counts:
 class
setosa        50
versicolor    50
virginica     50
class          1
Name: count, dtype: int64


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
count,151.0,151.0,151.0,151.0,151
unique,36.0,24.0,44.0,23.0,4
top,5.0,3.0,1.5,0.2,setosa
freq,10.0,26.0,13.0,29.0,50


In [13]:
# Cell 4 - Histograms
import matplotlib.pyplot as plt

numeric = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

for col in numeric:
    plt.figure()
    plt.hist(df[col], bins=20, color="skyblue", edgecolor="black")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.title(f"Histogram of {col}")
    plt.tight_layout()
    plt.savefig(IMAGES / f"hist_{col}.png")
    plt.close()

print("Histograms saved to:", IMAGES)

Histograms saved to: /Users/sophiegotch/Documents/pands-project/images


In [14]:
pairs = [("sepal_length","sepal_width"),
         ("sepal_length","petal_length"),
         ("sepal_length","petal_width"),
         ("sepal_width","petal_length"),
         ("sepal_width","petal_width"),
         ("petal_length","petal_width")]

for x, y in pairs:
    plt.figure()
    for cls, sub in df.groupby("class"):
        plt.scatter(sub[x], sub[y], s=20, alpha=0.8, label=cls)
    plt.xlabel(x); plt.ylabel(y)
    plt.title(f"{x} vs {y}")
    plt.legend()
    plt.tight_layout()
    plt.savefig(IMAGES / f"scatter_{x}_vs_{y}.png")
    plt.close()

print("Saved scatter plots to:", IMAGES)

Saved scatter plots to: /Users/sophiegotch/Documents/pands-project/images


In [15]:
out = ROOT / "summary.txt"
with open(out, "w") as f:
    f.write("Iris Dataset Summary\n")
    f.write("="*30 + "\n\n")
    f.write(str(df.describe()) + "\n\n")
    f.write("Class counts:\n")
    f.write(str(df["class"].value_counts()) + "\n")
print("Wrote", out)

Wrote /Users/sophiegotch/Documents/pands-project/summary.txt
