In [None]:
import numpy as np
import pandas as pd
import os
import kagglehub # type: ignore
import shutil

# Define paths
destination_dir = 'input'
# Assuming the dataset will always contain 'Iris.csv'
file_name = 'Iris.csv'
destination_file = os.path.join(destination_dir, file_name)

# Create destination directory (no-op when it already exists)
os.makedirs(destination_dir, exist_ok=True)

# The cell is safe to re-run: the download only happens while the CSV is missing.
if os.path.exists(destination_file):
    print(f"File '{destination_file}' already exists. Skipping download and copy.")
else:
    # Download, then copy the CSV into the input dir
    print("Downloading dataset...")
    try:
        path = kagglehub.dataset_download("endofnight17j03/iris-classification")
        print(f"Dataset downloaded to: {path}")

        source_file = os.path.join(path, file_name)

        if not os.path.exists(source_file):
            raise FileNotFoundError(f"CSV file '{file_name}' not found in downloaded path: {path}")

        # Copy rather than move: `path` is kagglehub's cache directory. Moving the
        # file out would leave a cache entry that still looks complete but has no
        # CSV, so a later download (e.g. after deleting `input/`) would fail above.
        print(f"Copying '{source_file}' to '{destination_file}'...")
        shutil.copy2(source_file, destination_file)
        print("Copy complete.")

    except Exception as e:
        # Best effort: report the problem and let the listing below show what is there.
        print(f"An error occurred: {e}")

# Verify the file is in the new directory
print(f"\nFiles in the '{destination_dir}' directory:")
if os.path.exists(destination_dir):
    for dirname, _, filenames in os.walk(destination_dir):
        for filename in filenames:
            print(os.path.join(dirname, filename))
else:
    print("Input directory does not exist.")

File 'input\Iris.csv' already exists. Skipping download and move.

Files in the 'input' directory:
input\Iris.csv


  from .autonotebook import tqdm as notebook_tqdm


# Softmax Regression

The purpose of this notebook is to build a classifier using softmax regression.

What is softmax regression? 

Softmax regression is a generalization of logistic regression that supports multiclass prediction natively. Whereas logistic regression gives us, for every class in our target, the probability of an instance being in that class or not, softmax regression provides a single set of probabilities over all classes, and the predicted class is the one with the highest probability. Softmax regression is recommended when the classes we are trying to predict are mutually exclusive.

We would like to implement mini-batch gradient descent to train our classifier. It is identical to batch gradient descent, the only difference being that in each iteration we train on a batch of m instances instead of the whole dataset. This is useful when implementing out-of-core learning and when dealing with very large datasets. We will start with mini-batch gradient descent before taking a look at the softmax algorithm itself.

To properly implement, illustrate and take advantage of mini-batch learning, we download our dataset as a CSV file (from Kaggle, via `kagglehub`) instead of just importing it from sklearn; I guess that if a dataset is large enough it is downloaded anyway. This way we do not have it directly in RAM and must read it from file. The dataset I will be using is the iris dataset, which is comically small compared to real-world datasets, but the fundamentals still apply, so let us get into it.

In [None]:
from collections import defaultdict
import pathlib
from sklearn.preprocessing import StandardScaler



class MiniBatchGD():
    """
    Implementation of the Batch Gradient Descent algorithm with automatic calling of the 
    MiniBachSample class for the computation of MiniBatches
    """
    def __init__(self,path:str|pathlib.Path):
        self.coef_: np.ndarray
        self.n_features_in_: int
        self.feature_names_in_: np.ndarray
        self.path:str|pathlib.Path

    def fit(self, X_index:np.ndarray,y_index:np.ndarray, path: str|pathlib.Path, lines:int, epochs: int = 100, eta:float=0.1, gain:float=0.1):
        '''Assuming that all features are numeric, just a fine assumption for the iris dataset,
        don't feel like extending it right now, may implement it in a differen way in the future possibly
        
        Should have used my own standard scaler since itis pure numpy, will save me some time 

        '''
        sampler = MiniBatchSampler()
        df = sampler.sample(path,lines)
        if df.dtypes.iloc[y_index] == 'object':
            categories = df.iloc[:,y_index]
            category_map: dict[str,int] = defaultdict(int)
            for index, cat in enumerate(categories):
                # We assign the first index we see with the value as the ordinal encoding
                # We will have to normalize this in the end
                if str(cat) in category_map:
                    df.iloc[index, y_index] = category_map[str(cat)]
                else:
                    category_map[str(cat)] = index
                    df.iloc[index, y_index] = category_map[str(cat)]
        
        # Scale features and target
        # I will permit myself to use the standard scaler here as I have already implemented it and don't 
        # want to create a new package, neither do I want to copy the code here

        sclr = StandardScaler()

        # Ok, so granted I hadn't implemented partial fit yet, 
        # And I am tired, I'll give myself this
        dataset = df.to_numpy()
        sclr.partial_fit(dataset)
        dataset = sclr.transform(dataset)
        X,y = dataset[:,:y_index],df.to_numpy()[:,y_index]
        rng = np.random.default_rng()
        theta = rng.random(size=(X.shape[1],1))
        

        





class MiniBatchSampler():
    """
    MiniBatchSampler class for mini-batch sampling from large CSV files.

    This class enables loading random subsets (mini-batches) of data from a CSV file
    without reading the entire file into memory. It uses reservoir sampling for the
    initial batch and generates random samples for subsequent batches.

    Attributes:
        sample_ (pd.DataFrame): The current mini-batch sample.
        columns (pd.Index): Column names of the dataset.
        DataFrame (pd.DataFrame): DataFrame holding the current batch.
        length (int): Total number of data lines in the CSV (excluding header).

    Methods:
        init_df(data): Initialize or update the DataFrame with new data.
        sample(path, lines, header): Sample a mini-batch of lines from the CSV file.
    """
    # The goal of mini-batch is to load a random subset of lines.
    # We approach the problem with a slightly modified version of reservoir sampling.
    # In the first pass, we create a sample and count the total lines simultaneously.
    # In subsequent passes, we use the known line count to efficiently seek to random lines.

    def __init__(self):
        self.columns: pd.Index|None = None
        self.DataFrame: pd.DataFrame = pd.DataFrame()
        self.length: int = 0
        self.path: str|pathlib.Path = str()
        self.header: bool = True
        self.lines: int = 0
        self.rng:np.random.Generator = np.random.default_rng(42)


    def init_df(self, data: list[list[str]]):
        '''
        Initialize or update the DataFrame with new data.
        '''
        self.DataFrame = pd.DataFrame(data, columns=self.columns)

    def sample(self, path: str|pathlib.Path, lines: int, header: bool = True) -> pd.DataFrame:
        '''
        This is an alright approach for my dataset, for others just use pd.read_csv
        for more robust string parsing.
        '''
        rng = self.rng
        self.path = path
        self.header = header
        self.lines = lines
        first_pass = self.DataFrame.empty or self.length == 0
        with open(path, 'r', encoding="utf-8") as f:
            # First pass: Use Reservoir Sampling for a true one-pass initial sample.
            if first_pass:
                if header:
                    columns_line = f.readline()
                    self.columns = pd.Index(columns_line.strip().split(','))
                
                reservoir: list[list[str]] = []
                line_count = 0
                for line in f:
                    line_count += 1
                    # Fill the reservoir with the first `lines` items
                    if len(reservoir) < lines:
                        reservoir.append(line.strip().split(','))
                    else:
                        # For subsequent items, replace an existing item with a decreasing probability
                        j = rng.integers(0, line_count)
                        if j < lines:
                            reservoir[j] = line.strip().split(',')
                
                self.length = line_count
                self.init_df(reservoir)
                return self.DataFrame
            else:
                # We know how many lines we have, so it is more efficient to just generate random numbers
                # and go to them (in the best case). We will implement a one-pass sampling to fetch the required lines.

                # 1. Generate `lines` unique random indices to fetch from the `self.length` total data lines.
                # We sort them to be able to read the file in a single forward pass.
                indices_to_read = sorted(rng.choice(self.length, size=lines, replace=False))

                new_data: list[list[str]] = []
                indices_ptr = 0

                # 2. Go to the start of the file and read the lines at the chosen indices.
                f.seek(0)
                if header:
                    f.readline()  # Skip the header row

                for i, line in enumerate(f):
                    if indices_ptr < len(indices_to_read) and i == indices_to_read[indices_ptr]:
                        # This is a line we want, so we parse it and add it to our new data
                        line_data = line.strip().split(',')
                        new_data.append(line_data)
                        indices_ptr += 1
                    
                    if indices_ptr == len(indices_to_read):
                        # If we have found all our lines, we can stop reading the file.
                        break
                
                # 3. Replace the content of the DataFrame with the new batch.
                self.init_df(new_data)
                return self.DataFrame

    def resample(self) -> pd.DataFrame:
        if self.length < self.lines:
            return self.DataFrame

        with open(self.path,'r',encoding='utf-8') as f:
            # 1. Generate `lines` unique random indices to fetch from the `self.length` total data lines.
            # We sort them to be able to read the file in a single forward pass.
            indices_to_read = sorted(self.rng.choice(self.length, size=self.lines, replace=False))
            
            new_data: list[list[str]] = []
            indices_ptr = 0
            # 2. Go to the start of the file and read the lines at the chosen indices.
            f.seek(0)
            if self.header:
                f.readline()  # Skip the header row
            for i, line in enumerate(f):
                if indices_ptr < len(indices_to_read) and i == indices_to_read[indices_ptr]:
                    # This is a line we want, so we parse it and add it to our new data
                    line_data = line.strip().split(',')
                    new_data.append(line_data)
                    indices_ptr += 1
                
                if indices_ptr == len(indices_to_read):
                    # If we have found all our lines, we can stop reading the file.
                    break
            
            # 3. Replace the content of the DataFrame with the new batch.
            self.init_df(new_data)
            return self.DataFrame

        

