## SMAI Assignment - 1

### Q1.0 Dataset Generation

In [None]:
# Example: compute required seed from IIITH username
import hashlib, numpy as np

import numpy as np
import pandas as pd
import hashlib

class StudentDataset:
    """
    Synthetic student records drawn from a seeded random generator.

    Every record has four fields: gender, major, program and GPA. The
    whole table is generated once, when the object is constructed.

    Attributes:
        num_students (int): How many records the table holds.
        seed (int): Seed handed to the random generator.
        rng (np.random.Generator): Generator all draws are taken from.
        df (pd.DataFrame): The generated table.
    """

    def __init__(self, num_students: int, seed: int):
        """
        Store the configuration, create the generator and build the table.
        """
        self.num_students = num_students
        self.seed = seed
        self.rng = np.random.default_rng(seed)

        # The table is built eagerly so every later call sees the same data.
        self.df = self.assemble_dataframe()

    def get_full_dataframe(self) -> pd.DataFrame:
        """
        Return the generated table (the stored object, not a copy).
        """
        return self.df

    def generate_gender(self) -> list[str]:
        """
        Draw one gender label per student.
        Probabilities: Male (65%), Female (33%), Other (2%).
        """
        labels = ["Male", "Female", "Other"]
        weights = [0.65, 0.33, 0.02]
        draws = self.rng.choice(labels, size=self.num_students, p=weights)
        return draws.tolist()

    def generate_major(self) -> list[str]:
        """
        Draw one major label per student.
        Probabilities: B.Tech (70%), MS (20%), PhD (10%).
        """
        labels = ["B.Tech", "MS", "PhD"]
        weights = [0.70, 0.20, 0.10]
        draws = self.rng.choice(labels, size=self.num_students, p=weights)
        return draws.tolist()

    def generate_program(self, majors: list[str]) -> list[str]:
        """
        Draw one program per student, with probabilities that depend on the
        student's major. One generator call is made per student, in order.
        """
        options = ["CSE", "ECE", "CHD", "CND"]
        weights_by_major = {
            "B.Tech": [0.40, 0.40, 0.10, 0.10],
            "MS": [0.30, 0.30, 0.20, 0.20],
        }
        # Any major not listed above (i.e. PhD) uses the uniform weights.
        phd_weights = [0.25, 0.25, 0.25, 0.25]
        return [
            self.rng.choice(options, p=weights_by_major.get(major, phd_weights))
            for major in majors
        ]

    def generate_gpa(self, majors: list[str]) -> list[float]:
        """
        Draw one GPA per student from a normal distribution chosen by major
        (mean, standard deviation):
        - B.Tech: (7.0, 1.0)
        - MS: (8.0, 0.7)
        - PhD: (8.3, 0.5)
        Each value is clipped to [4.0, 10.0] and rounded to two decimals.
        """
        params_by_major = {
            "B.Tech": (7.0, 1.0),
            "MS": (8.0, 0.7),
        }
        # Any major not listed above (i.e. PhD) uses these parameters.
        phd_params = (8.3, 0.5)

        gpas = []
        for major in majors:
            mu, sigma = params_by_major.get(major, phd_params)
            raw = self.rng.normal(mu, sigma)
            bounded = np.clip(raw, 4.0, 10.0)
            gpas.append(round(bounded, 2))
        return gpas

    def assemble_dataframe(self) -> pd.DataFrame:
        """
        Generate all four columns and combine them into one DataFrame.
        """
        # The generation order (gender, major, program, GPA) fixes how the
        # random stream is consumed, so it must not be rearranged.
        genders = self.generate_gender()
        majors = self.generate_major()
        programs = self.generate_program(majors)
        gpas = self.generate_gpa(majors)

        columns = {
            "Gender": genders,
            "Major": majors,
            "Program": programs,
            "GPA": gpas,
        }
        return pd.DataFrame(columns)

# IIITH username: the part of the e-mail address before the @.
username = "hiten.garg"   # <-- replace with the part before @
# Reproducible 32-bit seed: SHA-256 digest of the username, reduced modulo 2**32.
seed = int(hashlib.sha256(username.encode()).hexdigest(), 16) % (2**32)
# No generator is created here; StudentDataset builds its own from `seed`.

# Create dataset (10,000 students); generation happens inside the constructor.
dataset = StudentDataset(num_students=10000, seed=seed)

# Quick sanity check: first rows, then summary statistics for every column.
df = dataset.get_full_dataframe()
print(df.head())
print()
print(df.describe(include="all"))

### Q1.1 Dataset Analysis

#### (a) Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def add_username_watermark(username="hiten.garg", ax=None):
    """
    Stamp a small gray username label in the top-right corner of an axes.

    Args:
        username (str): Text to draw. Defaults to the notebook author's
            username, so existing zero-argument calls behave as before.
        ax (matplotlib.axes.Axes | None): Axes to draw on. When None, the
            current axes (``plt.gca()``) is used, as before.
    """
    if ax is None:
        ax = plt.gca()
    # Coordinates are in axes fractions (transAxes), so the label keeps its
    # position regardless of the data limits of the plot.
    ax.text(
        0.95, 0.95, username,
        ha="right", va="top",
        transform=ax.transAxes,
        fontsize=10, color="gray", alpha=0.7
    )

# --- Visualization methods ---
def plot_gender_distribution(self):
    """Show a bar chart of the number of students per gender."""
    _, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(
        data=self.df,
        x="Gender",
        order=["Male", "Female", "Other"],
        ax=ax,
    )
    ax.set_title("Gender Distribution")
    # The watermark helper draws on the current axes, which is `ax` here.
    add_username_watermark()
    plt.show()

def plot_major_distribution(self):
    """Show a bar chart of the number of students per major."""
    _, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(
        data=self.df,
        x="Major",
        order=["B.Tech", "MS", "PhD"],
        ax=ax,
    )
    ax.set_title("Major Distribution")
    # The watermark helper draws on the current axes, which is `ax` here.
    add_username_watermark()
    plt.show()

def plot_program_distribution(self):
    """Show a bar chart of the number of students per program."""
    _, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(
        data=self.df,
        x="Program",
        order=["CSE", "ECE", "CHD", "CND"],
        ax=ax,
    )
    ax.set_title("Program Distribution")
    # The watermark helper draws on the current axes, which is `ax` here.
    add_username_watermark()
    plt.show()

def plot_gpa_distribution(self, bins=20):
    """
    Show a histogram of GPA with a KDE curve overlaid.

    Args:
        bins: Bin specification forwarded to ``sns.histplot``.
    """
    _, ax = plt.subplots(figsize=(8, 4))
    gpa_values = self.df["GPA"]
    sns.histplot(gpa_values, bins=bins, kde=True, ax=ax)
    ax.set_title("GPA Distribution")
    ax.set_xlabel("GPA")
    ax.set_ylabel("Count")
    # The watermark helper draws on the current axes, which is `ax` here.
    add_username_watermark()
    plt.show()

def plot_program_by_major(self):
    """
    Show grouped bars of student counts per major, split by program.

    The major and program orders are pinned so the layout matches the
    single-variable count plots and does not depend on the order in which
    categories happen to appear in the data.
    """
    plt.figure(figsize=(8,5))
    sns.countplot(
        x="Major", hue="Program", data=self.df,
        order=["B.Tech","MS","PhD"],
        hue_order=["CSE","ECE","CHD","CND"],
    )
    plt.title("Program Distribution Conditioned on Major")
    add_username_watermark()
    plt.show()

def plot_gpa_by_major(self):
    """
    Show one GPA box plot per major.

    The major order is pinned so the layout matches the major count plot
    and does not depend on the order in which majors appear in the data.
    """
    plt.figure(figsize=(8,5))
    sns.boxplot(x="Major", y="GPA", data=self.df, order=["B.Tech","MS","PhD"])
    plt.title("GPA by Major")
    add_username_watermark()
    plt.show()

def plot_gpa_by_program(self):
    """
    Show one GPA box plot per program.

    The program order is pinned so the layout matches the program count
    plot and does not depend on the order in which programs appear in the
    data.
    """
    plt.figure(figsize=(8,5))
    sns.boxplot(
        x="Program", y="GPA", data=self.df,
        order=["CSE","ECE","CHD","CND"],
    )
    plt.title("GPA by Program")
    add_username_watermark()
    plt.show()

def plot_gpa_by_program_and_major(self):
    """
    Show GPA box plots per program, split by major within each program.

    Both the program order and the major (hue) order are pinned so the
    layout and legend match the other plots and do not depend on the order
    in which categories appear in the data.
    """
    plt.figure(figsize=(10,6))
    sns.boxplot(
        x="Program", y="GPA", hue="Major", data=self.df,
        order=["CSE","ECE","CHD","CND"],
        hue_order=["B.Tech","MS","PhD"],
    )
    plt.title("GPA by Program and Major")
    add_username_watermark()
    plt.show()

def plot_sampled_dataset(self, n=100):
    """
    Show a random sample of `n` rows from the dataset.

    The sample is drawn with ``random_state=self.seed``, so repeated calls
    with the same `n` show the same rows.

    Args:
        n (int): Number of rows to sample.
    """
    # `display` exists as a bare global only inside IPython/Jupyter. Import it
    # explicitly and fall back to `print` so the method also works in a plain
    # Python session instead of raising NameError.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    sample_df = self.df.sample(n, random_state=self.seed)
    show(sample_df)

def plot_entire_dataset_summary(self):
    """Show a seaborn pair plot of the dataset, colored by major."""
    # pairplot picks up the numeric columns of the frame on its own.
    sns.pairplot(data=self.df, hue="Major")
    # Watermark goes on whichever axes is current after the pair plot.
    add_username_watermark()
    plt.show()


# --- Attach the methods to your existing class ---
# --- Attach the methods to your existing class ---
# The plotting functions above take `self` as their first argument, so
# assigning them as class attributes makes them ordinary instance methods,
# also on the already-created `dataset` object.
StudentDataset.plot_gender_distribution = plot_gender_distribution
StudentDataset.plot_major_distribution = plot_major_distribution
StudentDataset.plot_program_distribution = plot_program_distribution
StudentDataset.plot_gpa_distribution = plot_gpa_distribution
StudentDataset.plot_program_by_major = plot_program_by_major
StudentDataset.plot_gpa_by_major = plot_gpa_by_major
StudentDataset.plot_gpa_by_program = plot_gpa_by_program
StudentDataset.plot_gpa_by_program_and_major = plot_gpa_by_program_and_major
StudentDataset.plot_sampled_dataset = plot_sampled_dataset
StudentDataset.plot_entire_dataset_summary = plot_entire_dataset_summary

# --- Example usage ---
# Each call below renders one figure (or, for plot_sampled_dataset, a table).
dataset.plot_gender_distribution()
dataset.plot_major_distribution()
dataset.plot_program_distribution()
dataset.plot_gpa_distribution()
dataset.plot_program_by_major()
dataset.plot_gpa_by_major()
dataset.plot_gpa_by_program()
dataset.plot_gpa_by_program_and_major()
dataset.plot_sampled_dataset()
dataset.plot_entire_dataset_summary()


#### (b) GPA Summary Statistics

In [None]:
def gpa_mean_std(self) -> tuple[float, float]:
    """
    Summarize the GPA column of the dataset.

    Returns:
        tuple: (mean_gpa, std_gpa), where the standard deviation is the
        pandas default (sample standard deviation).
    """
    gpa_column = self.df["GPA"]
    return gpa_column.mean(), gpa_column.std()

# Attach the function as an instance method of StudentDataset.
StudentDataset.gpa_mean_std = gpa_mean_std

# Compute the GPA statistics of the full dataset and print them to two decimals.
mean_gpa, std_gpa = dataset.gpa_mean_std()
print(f"Mean GPA: {mean_gpa:.2f}")
print(f"Standard Deviation of GPA: {std_gpa:.2f}")


**Mean GPA:** 7.33  
**Standard Deviation of GPA:** 1.04  

**Observations:**

- On average, students in the dataset have a GPA of 7.33 out of 10, indicating that most students are performing slightly above the midpoint but below the top end.  
- The GPA values have a standard deviation of 1.04, showing moderate variation in student performance; most GPAs lie roughly between 6.3 and 8.4 (mean ± 1 std, i.e. 7.33 ± 1.04).  
- The dataset exhibits fairly consistent academic performance with a moderate spread.  
- There may be some outliers at the lower end (clipped at 4.0) and higher end (clipped at 10.0), but the majority of students are clustered around the mean.


#### (c) Program-Major Combinations

In [None]:
def count_students_per_program_major_pair(self) -> pd.DataFrame:
    """
    Tabulate how many students fall into each (Program, Major) pair.

    Returns:
        pd.DataFrame: One row per Program and one column per Major; each
                      cell is the student count, with 0 for pairs that do
                      not occur.
    """
    # Long form: one count per (Program, Major) pair that actually occurs.
    pair_sizes = self.df.groupby(["Program", "Major"]).size()
    # Wide form: pivot Major into columns and fill the missing pairs with 0.
    return pair_sizes.unstack(fill_value=0)


def visualize_students_per_program_major_pair(self, counts_df: pd.DataFrame) -> None:
    """
    Draw an annotated heatmap of student counts per (Program, Major) pair.

    Args:
        counts_df (pd.DataFrame): Counts with Programs as rows and Majors as
            columns. Cells are formatted as integers.
    """
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(counts_df, annot=True, fmt="d", cmap="Blues", cbar=True, ax=ax)

    ax.set_title("Number of Students per Program and Major", fontsize=14)
    ax.set_xlabel("Major", fontsize=12)
    ax.set_ylabel("Program", fontsize=12)

    # The heatmap mesh is the first collection on the axes; its colorbar
    # gets a label describing what the colors encode.
    heatmap_mesh = ax.collections[0]
    heatmap_mesh.colorbar.set_label("Number of Students")

    # Username watermark on the current axes.
    add_username_watermark()

    plt.show()

# Attach the counting function as an instance method of StudentDataset.
StudentDataset.count_students_per_program_major_pair = count_students_per_program_major_pair

# Build the Program x Major count table and print it.
counts_df = dataset.count_students_per_program_major_pair()
print(counts_df)

# Attach the heatmap function as an instance method of StudentDataset.
StudentDataset.visualize_students_per_program_major_pair = visualize_students_per_program_major_pair

# Render the heatmap from the table computed above.
dataset.visualize_students_per_program_major_pair(counts_df)

### Q1.2 Simple vs Stratified Sampling

In [None]:
import numpy as np
import pandas as pd

def get_gpa_mean_std_random(self, n: int = 500, repeats: int = 50) -> tuple[float, float]:
    """
    Simple random sampling (without replacement).
    Repeats `repeats` times, each time sampling `n` students uniformly from the full dataset.
    For each sample, compute the sample mean and sample std (ddof=1) of GPA.

    Samples are drawn from `self.rng`, so results depend on how much of the
    random stream has already been consumed.

    Args:
        n: Number of students per sample; must satisfy 2 <= n <= dataset size.
        repeats: Number of independent samples to draw; must be at least 1.

    Returns:
        (mean_of_means, mean_of_stds)
        - mean_of_means: average of the `repeats` sample means (estimator of population mean)
        - mean_of_stds : average of the `repeats` sample standard deviations (typical within-sample spread)

    Raises:
        ValueError: If `n` exceeds the dataset size, `n` < 2, or `repeats` < 1.
    """
    N = len(self.df)
    if n > N:
        raise ValueError("n cannot exceed the dataset size.")
    # With ddof=1 a sample of fewer than two values has no defined std, and
    # averaging over zero repeats has no defined result; fail loudly instead
    # of returning NaN.
    if n < 2:
        raise ValueError("n must be at least 2 to compute a sample std (ddof=1).")
    if repeats < 1:
        raise ValueError("repeats must be at least 1.")

    gpas = self.df["GPA"].to_numpy()
    means, stds = [], []

    for _ in range(repeats):
        # Positions into `gpas`, drawn uniformly without replacement.
        idx = self.rng.choice(N, size=n, replace=False)
        sample = gpas[idx]
        means.append(float(sample.mean()))
        stds.append(float(sample.std(ddof=1)))

    mean_of_means = float(np.mean(means))
    mean_of_stds  = float(np.mean(stds))
    return mean_of_means, mean_of_stds


def _proportional_allocations(self, n: int, groups: pd.Series) -> dict:
    """
    Helper: compute integer allocations per group proportional to group sizes.
    Ensures allocations sum exactly to n using largest-remainder (Hamilton) rounding.
    """
    counts = groups.value_counts().sort_index()
    proportions = counts / counts.sum()
    raw = proportions * n
    floor_alloc = np.floor(raw).astype(int)
    remainder = n - floor_alloc.sum()
    
    # Distribute leftover by largest fractional parts
    frac = (raw - floor_alloc).sort_values(ascending=False)
    alloc = floor_alloc.copy()
    for grp in frac.index[:remainder]:
        alloc.loc[grp] += 1
    return alloc.to_dict()


def get_gpa_mean_std_stratified(self, n: int = 500, repeats: int = 50) -> tuple[float, float]:
    """
    Stratified sampling by 'Major' (without replacement within each stratum).
    Uses proportional allocation (rounded via largest-remainder) each repetition.
    For each stratified sample, compute sample mean and sample std (ddof=1) of GPA.

    Samples are drawn from `self.rng`, so results depend on how much of the
    random stream has already been consumed.

    Args:
        n: Number of students per sample; must satisfy 2 <= n <= dataset size.
        repeats: Number of independent stratified samples; must be at least 1.

    Returns:
        (mean_of_means, mean_of_stds)
        - mean_of_means: average of the `repeats` stratified sample means
        - mean_of_stds : average of the `repeats` stratified sample standard deviations

    Raises:
        ValueError: If the 'Major' column is missing, `n` exceeds the dataset
            size, `n` < 2, `repeats` < 1, or an allocation exceeds its
            stratum size.
    """
    if "Major" not in self.df.columns:
        raise ValueError("Column 'Major' not found in dataset.")
    N = len(self.df)
    if n > N:
        raise ValueError("n cannot exceed the dataset size.")
    # With ddof=1 a sample of fewer than two values has no defined std, and
    # averaging over zero repeats has no defined result; fail loudly instead
    # of returning NaN.
    if n < 2:
        raise ValueError("n must be at least 2 to compute a sample std (ddof=1).")
    if repeats < 1:
        raise ValueError("repeats must be at least 1.")

    majors = self.df["Major"]
    # Compute integer allocations per major (fixed across repeats given fixed n & population)
    alloc = self._proportional_allocations(n, majors)
    # Row *positions* per major. Positions (not index labels) are required
    # because they are used below to index the plain NumPy GPA array; labels
    # only coincide with positions for a default 0..N-1 index.
    by_major_positions = {
        m: np.flatnonzero((majors == m).to_numpy()) for m in alloc
    }

    gpas = self.df["GPA"].to_numpy()
    means, stds = [], []

    for _ in range(repeats):
        chosen_idx_list = []
        for m, k in alloc.items():
            pool = by_major_positions[m]
            if k > len(pool):
                raise ValueError(f"Allocation {k} exceeds stratum size for major {m}.")
            pick = self.rng.choice(pool, size=k, replace=False)
            chosen_idx_list.append(pick)

        chosen_idx = np.concatenate(chosen_idx_list)
        sample = gpas[chosen_idx]
        means.append(float(sample.mean()))
        stds.append(float(sample.std(ddof=1)))

    mean_of_means = float(np.mean(means))
    mean_of_stds  = float(np.mean(stds))
    return mean_of_means, mean_of_stds


# Attach to class
# Attach the sampling functions as instance methods of StudentDataset.
StudentDataset.get_gpa_mean_std_random = get_gpa_mean_std_random
StudentDataset._proportional_allocations = _proportional_allocations  # helper
StudentDataset.get_gpa_mean_std_stratified = get_gpa_mean_std_stratified

# Example usage
# Both calls draw from the same generator (dataset.rng), so the printed
# numbers depend on the order in which these calls are executed.
random_means, random_stds = dataset.get_gpa_mean_std_random()
print("Random Sampling:")
print(f"Mean GPA: {random_means}, Std GPA: {random_stds}")

stratified_means, stratified_stds = dataset.get_gpa_mean_std_stratified()
print("\nStratified Sampling:")
print(f"Mean GPA: {stratified_means}, Std GPA: {stratified_stds}")

#### Q1.2 Simple vs Stratified Sampling

We conducted two procedures, each repeated 50 times with sample size $n=500$:

1. **Simple Random Sampling (SRS)**: 500 students drawn uniformly without replacement from the entire dataset.
2. **Stratified Sampling by Major**: Each sample of 500 was formed by sampling within each major in proportion to that major’s population share (with largest-remainder rounding), without replacement within strata.

For every repetition, we computed:
- **Sample mean GPA**
- **Sample standard deviation of GPA (ddof=1)**

We then reported, for each method:
- **Mean of sample means** (estimator of the population mean GPA)
- **Mean of sample standard deviations** (typical within-sample spread)

**Results (example format):**
- SRS → mean of means: …, mean of stds: …
- Stratified → mean of means: …, mean of stds: …

**Which method has lower std deviation and why?**  
Stratified sampling typically yields **lower variance of the mean estimator** when the strata (here, majors) have **different means and/or different within-stratum variances**. By ensuring each major is represented proportionally in every sample, stratification reduces the sampling variability due to fluctuations in the composition of majors. Since our data generation assigns different GPA distributions by major (mean, standard deviation: B.Tech $\sim \mathcal{N}(7.0, 1.0)$, MS $\sim \mathcal{N}(8.0, 0.7)$, PhD $\sim \mathcal{N}(8.3, 0.5)$), stratification controls for this heterogeneity, leading to a **lower standard error of the sample mean** across repeats compared to simple random sampling. Note that this reduction concerns the spread of the sample means across the 50 repetitions; the "mean of sample standard deviations" reported above measures the spread of GPAs *within* a sample, which is expected to be similar for both methods.


### Q1.3 Gender-Balanced Cohort