In [None]:
# Example: compute required seed from IIITH username
import hashlib, numpy as np

import numpy as np
import pandas as pd
import hashlib

class StudentDataset:
    """
    Synthetic student dataset with gender, major, program and GPA columns.

    Every record is generated once, in the constructor, from a NumPy
    ``Generator`` seeded with ``seed``. The generation order is fixed
    (genders, majors, programs, GPAs), so the same ``(num_students, seed)``
    pair reproduces the same DataFrame.

    Attributes:
        num_students (int): Number of student records to generate.
        seed (int): Random seed for reproducibility.
        rng (np.random.Generator): Random number generator instance.
        df (pd.DataFrame): Generated dataset stored as a DataFrame.
    """

    # Category labels with their marginal probabilities (same order).
    GENDERS = ("Male", "Female", "Other")
    GENDER_PROBS = (0.65, 0.33, 0.02)
    MAJORS = ("B.Tech", "MS", "PhD")
    MAJOR_PROBS = (0.70, 0.20, 0.10)

    # Program labels and the program probabilities conditioned on major.
    PROGRAMS = ("CSE", "ECE", "CHD", "CND")
    PROGRAM_PROBS_BY_MAJOR = {
        "B.Tech": (0.40, 0.40, 0.10, 0.10),
        "MS": (0.30, 0.30, 0.20, 0.20),
        "PhD": (0.25, 0.25, 0.25, 0.25),
    }

    # (mean, standard deviation) of the normal GPA distribution per major.
    GPA_PARAMS_BY_MAJOR = {
        "B.Tech": (7.0, 1.0),
        "MS": (8.0, 0.7),
        "PhD": (8.3, 0.5),
    }

    # Any major that is not listed above is treated like this one.
    FALLBACK_MAJOR = "PhD"

    # GPAs are clipped into this closed interval before rounding.
    GPA_MIN = 4.0
    GPA_MAX = 10.0

    def __init__(self, num_students: int, seed: int):
        """
        Initializes the dataset and generates all records immediately.

        Args:
            num_students: Number of student records to generate (>= 0).
            seed: Seed passed to ``np.random.default_rng``.

        Raises:
            ValueError: If ``num_students`` is negative.
        """
        if num_students < 0:
            raise ValueError(
                f"num_students must be non-negative, got {num_students}"
            )

        self.num_students = num_students
        self.seed = seed
        self.rng = np.random.default_rng(seed)

        # Generate dataset
        self.df = self.assemble_dataframe()

    def get_full_dataframe(self) -> pd.DataFrame:
        """
        Returns the complete dataset as a pandas DataFrame.

        The stored frame itself is returned (not a copy), so in-place
        modifications made by the caller are visible on ``self.df``.
        """
        return self.df

    def generate_gender(self) -> list[str]:
        """
        Draws one gender label per student.

        Probabilities: Male (65%), Female (33%), Other (2%).

        Returns:
            A list of ``num_students`` gender labels.
        """
        return self.rng.choice(
            list(self.GENDERS),
            size=self.num_students,
            p=list(self.GENDER_PROBS),
        ).tolist()

    def generate_major(self) -> list[str]:
        """
        Draws one major label per student.

        Probabilities: B.Tech (70%), MS (20%), PhD (10%).

        Returns:
            A list of ``num_students`` major labels.
        """
        return self.rng.choice(
            list(self.MAJORS),
            size=self.num_students,
            p=list(self.MAJOR_PROBS),
        ).tolist()

    def generate_program(self, majors: list[str]) -> list[str]:
        """
        Draws one program per entry of ``majors``, conditioned on the major.

        Program probabilities (CSE, ECE, CHD, CND):
        - B.Tech: 40%, 40%, 10%, 10%
        - MS: 30%, 30%, 20%, 20%
        - PhD (and any unrecognised major): 25% each

        Args:
            majors: Major label of each student, in record order.

        Returns:
            A list of program labels, one per entry of ``majors``.
        """
        # Convert the loop-invariant inputs once instead of on every draw.
        labels = np.array(self.PROGRAMS)
        probs_by_major = {
            major: np.array(probs, dtype=float)
            for major, probs in self.PROGRAM_PROBS_BY_MAJOR.items()
        }
        fallback = probs_by_major[self.FALLBACK_MAJOR]

        # One draw per student, in order: this keeps the random stream (and
        # therefore the seeded output) identical to a plain sequential loop.
        return [
            self.rng.choice(labels, p=probs_by_major.get(major, fallback))
            for major in majors
        ]

    def generate_gpa(self, majors: list[str]) -> list[float]:
        """
        Draws one GPA per entry of ``majors`` from a per-major normal
        distribution:
        - B.Tech: N(7.0, 1.0)
        - MS: N(8.0, 0.7)
        - PhD (and any unrecognised major): N(8.3, 0.5)

        Values are clipped to [4.0, 10.0] and rounded to two decimals.

        Args:
            majors: Major label of each student, in record order.

        Returns:
            A list of GPA values, one per entry of ``majors``.
        """
        fallback = self.GPA_PARAMS_BY_MAJOR[self.FALLBACK_MAJOR]
        params = [
            self.GPA_PARAMS_BY_MAJOR.get(major, fallback) for major in majors
        ]
        if not params:
            return []

        means = np.array([mean for mean, _ in params], dtype=float)
        stds = np.array([std for _, std in params], dtype=float)

        # A single broadcast draw samples element by element in order, so it
        # yields the same values as one scalar ``normal`` call per student.
        raw = self.rng.normal(means, stds)
        clipped = np.clip(raw, self.GPA_MIN, self.GPA_MAX)

        # Round for readability.
        return list(np.round(clipped, 2))

    def assemble_dataframe(self) -> pd.DataFrame:
        """
        Assembles the full dataset with gender, major, program, and GPA.

        The four generators are called in a fixed order because they share
        one random generator; reordering them would change the seeded output.

        Returns:
            A DataFrame with columns ``Gender``, ``Major``, ``Program``, ``GPA``.
        """
        genders = self.generate_gender()
        majors = self.generate_major()
        programs = self.generate_program(majors)
        gpas = self.generate_gpa(majors)

        return pd.DataFrame({
            "Gender": genders,
            "Major": majors,
            "Program": programs,
            "GPA": gpas,
        })


# Derive a reproducible 32-bit seed from the username, i.e. the part of the
# e-mail address before the "@". Replace the value below with your own.
username = "hiten.garg"
digest_hex = hashlib.sha256(username.encode()).hexdigest()
seed = int(digest_hex, 16) % (2**32)

# Build the dataset; all records are generated inside the constructor.
dataset = StudentDataset(num_students=10000, seed=seed)
df = dataset.get_full_dataframe()

# Show the first rows, a blank line, then summary statistics for all columns.
print(df.head(), end="\n\n")
print(df.describe(include="all"))


   Gender   Major Program   GPA
0  Female  B.Tech     ECE  7.31
1    Male  B.Tech     ECE  7.68
2    Male  B.Tech     CSE  7.97
3    Male  B.Tech     ECE  6.75
4  Female      MS     ECE  7.61

       Gender   Major Program           GPA
count   10000   10000   10000  10000.000000
unique      3       3       4           NaN
top      Male  B.Tech     CSE           NaN
freq     6472    7027    3732           NaN
mean      NaN     NaN     NaN      7.328968
std       NaN     NaN     NaN      1.035183
min       NaN     NaN     NaN      4.000000
25%       NaN     NaN     NaN      6.610000
50%       NaN     NaN     NaN      7.390000
75%       NaN     NaN     NaN      8.100000
max       NaN     NaN     NaN     10.000000
