In [1]:
# Cell 1: Import required libraries
import pandas as pd               # for DataFrame operations
import numpy as np                # for numeric computations
import seaborn as sns             # to load built‑in datasets
import matplotlib.pyplot as plt   # for any quick plots (not strictly needed here)


In [2]:
# Cell 2: Part 1 – Load an open‑source dataset (Tips)
# Source: seaborn’s built‑in “tips” dataset (based on a restaurant’s tips data)
df_tips = sns.load_dataset('tips')
df_tips.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Cell 3: Summary statistics of numeric vars grouped by a categorical var
# Here: group 'total_bill' and 'tip' by the day of week
grouped = (
    df_tips
      .groupby('day')[['total_bill','tip']]
      .agg(['mean','median','min','max','std'])
)
grouped


  .groupby('day')[['total_bill','tip']]


Unnamed: 0_level_0,total_bill,total_bill,total_bill,total_bill,total_bill,tip,tip,tip,tip,tip
Unnamed: 0_level_1,mean,median,min,max,std,mean,median,min,max,std
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Thur,17.682742,16.2,7.51,43.11,7.88617,2.771452,2.305,1.25,6.7,1.240223
Fri,17.151579,15.38,5.75,40.17,8.30266,2.734737,3.0,1.0,4.73,1.019577
Sat,20.441379,18.24,3.07,50.81,9.480419,2.993103,2.75,1.0,10.0,1.631014
Sun,21.41,19.63,7.25,48.17,8.832122,3.255132,3.15,1.01,6.5,1.23488


In [4]:
# Cell 4: Create a numeric list for each response of the categorical variable
# We’ll encode 'day' → integer codes and collect into a Python list
day_codes = df_tips['day'].astype('category').cat.codes.tolist()
print(day_codes[10:30])   


[3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


##  Part 2 – Load the Iris dataset and display descriptive stats per species


In [5]:

# Source: local 'Iris.csv' downloaded from Kaggle (https://www.kaggle.com/datasets/uciml/iris)
df_iris = pd.read_csv("Datasets/Iris.csv")

# We’ll iterate through each of the three species
for species in df_iris['Species'].unique():
    sub = df_iris[df_iris['Species'] == species]
    print(f"\n=== {species} ===")
    # 1) Basic describe() (includes count, mean, std, min, 25/50/75% percentiles, max)
    print(sub.describe())
    # 2) Additional percentiles (10th, 90th)
    p10 = sub.iloc[:, 1:5].quantile(0.10)
    p90 = sub.iloc[:, 1:5].quantile(0.90)

print("\n10th percentiles:\n", p10)
print("90th percentiles:\n", p90)



=== Iris-setosa ===
             Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  50.00000       50.00000     50.000000      50.000000      50.00000
mean   25.50000        5.00600      3.418000       1.464000       0.24400
std    14.57738        0.35249      0.381024       0.173511       0.10721
min     1.00000        4.30000      2.300000       1.000000       0.10000
25%    13.25000        4.80000      3.125000       1.400000       0.20000
50%    25.50000        5.00000      3.400000       1.500000       0.20000
75%    37.75000        5.20000      3.675000       1.575000       0.30000
max    50.00000        5.80000      4.400000       1.900000       0.60000

=== Iris-versicolor ===
              Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count   50.00000      50.000000     50.000000      50.000000     50.000000
mean    75.50000       5.936000      2.770000       4.260000      1.326000
std     14.57738       0.516171      0.313798       0.469911   

Thought for a second


Below is a **line‑by‑line walkthrough** of the corrected notebook for **Experiment 3: Descriptive Statistics**, including the fix for the percentile unpacking error.

---

## Cell 1: Imports

```python
import pandas as pd               # pandas: for DataFrame operations, grouping, stats
import numpy as np                # NumPy: numerical computations (arrays, math)
import seaborn as sns             # Seaborn: to load built‑in datasets easily
import matplotlib.pyplot as plt   # Matplotlib: for any plots (not strictly used below)
```

1. **`import pandas as pd`**
   Brings in the pandas library under the alias `pd`, enabling you to work with tables (DataFrames), perform group‐by operations, compute summary stats, etc.
2. **`import numpy as np`**
   Brings in NumPy under `np`, which you’ll need if you want to do any explicit numeric calculations (not heavily used here, but a best practice).
3. **`import seaborn as sns`**
   Imports Seaborn under `sns`; we’ll use it to load the “tips” dataset without having to download files manually.
4. **`import matplotlib.pyplot as plt`**
   Imports Matplotlib’s `pyplot` API under `plt`—useful if you later decide to draw histograms or boxplots of your grouped data.

---

## Cell 2: Load and inspect the “Tips” dataset

```python
df_tips = sns.load_dataset('tips')
df_tips.head()
```

1. **`sns.load_dataset('tips')`**
   Fetches Seaborn’s built‐in “tips” dataset (244 rows, 7 columns) directly into a pandas DataFrame. Columns include:

   * `total_bill` (float)
   * `tip` (float)
   * `sex`, `smoker`, `day`, `time` (categorical strings)
   * `size` (int)
2. **`df_tips.head()`**
   Displays the first five rows so you can verify you’ve loaded the data correctly and see the column names and types.

---

## Cell 3: Summary statistics grouped by a categorical variable

```python
grouped = (
    df_tips
      .groupby('day')[['total_bill','tip']]
      .agg(['mean','median','min','max','std'])
)
grouped
```

1. **`df_tips.groupby('day')`**
   Splits the DataFrame into sub‐DataFrames, one per unique value of the `day` column (Thursday, Friday, Saturday, Sunday).
2. **`[['total_bill','tip']]`**
   From each sub‐DataFrame, select only the `total_bill` and `tip` columns for analysis.
3. **`.agg(['mean','median','min','max','std'])`**
   For each selected column within each day, compute five aggregate statistics:

   * **mean** (average)
   * **median** (50th percentile)
   * **min** (smallest value)
   * **max** (largest value)
   * **std** (standard deviation)
4. **Assignment to `grouped`**
   Stores the resulting multi‐indexed DataFrame (days down the rows, stats across the columns) in the variable `grouped`.
5. **`grouped` (last line)**
   Displays that aggregated table in your notebook output.

---

## Cell 4: Encode the categorical variable into numeric codes

```python
day_codes = df_tips['day'].astype('category').cat.codes.tolist()
print(day_codes[:20], '…')
```

1. **`df_tips['day'].astype('category')`**
   Casts the `day` column to pandas’ categorical dtype, which internally assigns each unique label an integer code.
2. **`.cat.codes`**
   Extracts those integer codes as a new Series of the same length (244). For example, if categories sorted alphabetically are `['Thur','Fri','Sat','Sun']`, then `'Thur'→0, 'Fri'→1, 'Sat'→2, 'Sun'→3`.
3. **`.tolist()`**
   Converts that Series of integer codes into a regular Python list, one code per row.
4. **`print(day_codes[:20], '…')`**
   Prints the first 20 numeric codes to give you a sense of the mapping; the trailing ellipsis indicates there are many more.

---

## Cell 5: Descriptive stats for each Iris species (fixed percentile code)

```python
df_iris = pd.read_csv("Iris.csv")   # Load the Iris data from a local CSV

for species in df_iris['Species'].unique():   # Loop over each species name
    sub = df_iris[df_iris['Species'] == species]  # Filter to that species
    print(f"\n=== {species} ===")                # Header for clarity

    # 1) Default describe() summary
    print(sub.describe())  

    # 2) Compute 10th percentile and 90th percentile separately
    p10 = sub.iloc[:, 1:5].quantile(0.10)
    p90 = sub.iloc[:, 1:5].quantile(0.90)

    # Print those percentiles
    print("\n10th percentiles:\n", p10)
    print("90th percentiles:\n", p90)
```

1. **`pd.read_csv("Iris.csv")`**
   Reads the local `Iris.csv` (150 rows, 6 columns including an `Id` column). Make sure the file sits in your notebook’s working directory.
2. **`df_iris['Species'].unique()`**
   Returns the array of unique species names: `['Iris-setosa','Iris-versicolor','Iris-virginica']`.
3. **`for species in …:`**
   Begins a loop that will process one species at a time.
4. **`sub = df_iris[df_iris['Species'] == species]`**
   Boolean‐indexing to extract only rows belonging to the current species, stored in `sub`.
5. **`print(f"\n=== {species} ===")`**
   Prints a header to separate each species’ output block.
6. **`sub.describe()`**
   Built‑in pandas method that shows count, mean, std, min, 25%, 50%, 75%, and max for every numeric column (`SepalLengthCm`, `SepalWidthCm`, `PetalLengthCm`, `PetalWidthCm`).
7. **`sub.iloc[:, 1:5]`**
   Selects columns by position: skip the first (`Id`) and last (`Species`), keeping only the four measurement columns.
8. **`.quantile(0.10)`**
   Computes the 10th percentile for each of those four columns, returning a Series.
9. **`.quantile(0.90)`**
   Computes the 90th percentile likewise.
10. **`print("\n10th percentiles:\n", p10)`**
    Displays the 10th‑percentile Series with index labels matching the four columns.
11. **`print("90th percentiles:\n", p90)`**
    Displays the 90th‑percentile Series.

---

## Cell 6: Recap of actions

```python
print("""
Part 1 – Tips dataset:
 • Loaded 244 records with 7 columns.
 • Grouped 'total_bill' & 'tip' by 'day' to get mean, median, min, max, std.
 • Encoded 'day' as integer codes and stored in a list of length 244.

Part 2 – Iris dataset:
 • Loaded 150 records of iris measurements.
 • For each species, displayed describe() (including quartiles) plus 
   the 10th and 90th percentiles.
""")
```

* Simply prints a **multi‐line summary** reminding you (and your examiners) exactly what you did in each part:

  1. **Part 1** with the Tips dataset: grouping stats and categorical encoding.
  2. **Part 2** with Iris: descriptive summaries and extended percentiles.

---

With this, every line of code in your **Descriptive Statistics** notebook is explained. You can confidently describe each import, transformation, grouping, and calculation in your viva. Good luck!


In [6]:
df_tips.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [7]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load iris dataset
iris_data = load_iris(as_frame=True)
df = iris_data.frame

# Add species column
df['species'] = pd.Categorical.from_codes(iris_data.target, iris_data.target_names)

# Columns to summarize
numeric_columns = ['sepal length (cm)', 'sepal width (cm)', 
                   'petal length (cm)', 'petal width (cm)']

# Grouped summary statistics
for col in numeric_columns:
    print(f"\nSummary for: {col}")
    grouped = df.groupby('species')[col]
    print("Mean:\n", grouped.mean())
    print("Median:\n", grouped.median())
    print("Min:\n", grouped.min())
    print("Max:\n", grouped.max())
    print("Standard Deviation:\n", grouped.std())

# Create a dictionary with list of values per species
species_data = {species: df[df['species'] == species]['sepal length (cm)'].tolist()
                for species in df['species'].unique()}

print("\nList of Sepal Lengths by Species:\n", species_data)



Summary for: sepal length (cm)
Mean:
 species
setosa        5.006
versicolor    5.936
virginica     6.588
Name: sepal length (cm), dtype: float64
Median:
 species
setosa        5.0
versicolor    5.9
virginica     6.5
Name: sepal length (cm), dtype: float64
Min:
 species
setosa        4.3
versicolor    4.9
virginica     4.9
Name: sepal length (cm), dtype: float64
Max:
 species
setosa        5.8
versicolor    7.0
virginica     7.9
Name: sepal length (cm), dtype: float64
Standard Deviation:
 species
setosa        0.352490
versicolor    0.516171
virginica     0.635880
Name: sepal length (cm), dtype: float64

Summary for: sepal width (cm)
Mean:
 species
setosa        3.428
versicolor    2.770
virginica     2.974
Name: sepal width (cm), dtype: float64
Median:
 species
setosa        3.4
versicolor    2.8
virginica     3.0
Name: sepal width (cm), dtype: float64
Min:
 species
setosa        2.3
versicolor    2.0
virginica     2.2
Name: sepal width (cm), dtype: float64
Max:
 species
setosa      

  grouped = df.groupby('species')[col]


In [8]:
def basic_stats_by_species(df, species_name):
    species_df = df[df['species'] == species_name]
    print(f"\n🔍 Statistics for: {species_name}")
    for col in numeric_columns:
        values = species_df[col]
        print(f"\n{col}")
        print("Mean:", values.mean())
        print("Standard Deviation:", values.std())
        print("25th Percentile:", np.percentile(values, 25))
        print("50th Percentile (Median):", np.percentile(values, 50))
        print("75th Percentile:", np.percentile(values, 75))
        print("Min:", values.min())
        print("Max:", values.max())

# Call function for each species
for species in df['species'].unique():
    basic_stats_by_species(df, species)



🔍 Statistics for: setosa

sepal length (cm)
Mean: 5.006
Standard Deviation: 0.35248968721345136
25th Percentile: 4.8
50th Percentile (Median): 5.0
75th Percentile: 5.2
Min: 4.3
Max: 5.8

sepal width (cm)
Mean: 3.428
Standard Deviation: 0.3790643690962887
25th Percentile: 3.2
50th Percentile (Median): 3.4
75th Percentile: 3.6750000000000003
Min: 2.3
Max: 4.4

petal length (cm)
Mean: 1.4620000000000002
Standard Deviation: 0.17366399648018407
25th Percentile: 1.4
50th Percentile (Median): 1.5
75th Percentile: 1.5750000000000002
Min: 1.0
Max: 1.9

petal width (cm)
Mean: 0.24599999999999997
Standard Deviation: 0.10538558938004565
25th Percentile: 0.2
50th Percentile (Median): 0.2
75th Percentile: 0.3
Min: 0.1
Max: 0.6

🔍 Statistics for: versicolor

sepal length (cm)
Mean: 5.936
Standard Deviation: 0.5161711470638634
25th Percentile: 5.6
50th Percentile (Median): 5.9
75th Percentile: 6.3
Min: 4.9
Max: 7.0

sepal width (cm)
Mean: 2.7700000000000005
Standard Deviation: 0.3137983233784114
25th