# Lab 1: Data Splitting, Preprocessing, and Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## SplitData Function

This function splits a NumPy array into training and validation sets.

In [None]:
def SplitData(data, split_ratio):
    """
    Partition an array into a leading training part and a trailing validation part.

    The entries are kept in their existing order (nothing is shuffled): the
    first ``int(len(data) * split_ratio)`` entries along the first axis become
    the training set and every remaining entry becomes the validation set.

    Args:
      data: The numpy.ndarray to split. Any other input is first converted
            with ``np.array``.
      split_ratio: Fraction of entries assigned to training (e.g., 0.8 for 80%).

    Returns:
      A tuple of two numpy.ndarrays: (training_data, validation_data).
    """
    array = data if isinstance(data, np.ndarray) else np.array(data)
    # int() truncates, so a fractional boundary leaves the extra entry in validation.
    cut = int(len(array) * split_ratio)
    return array[:cut], array[cut:]

### Example Usage for SplitData

In [None]:
# Small 5x3 demo array (values 1..15, row by row) used to exercise SplitData
sample_data_split = np.arange(1, 16).reshape(5, 3)
split_ratio_val = 0.8

# With 5 rows and a 0.8 ratio the first 4 rows go to training, the last row to validation
training_set, validation_set = SplitData(data=sample_data_split, split_ratio=split_ratio_val)

# Show the input next to both partitions
print("Original Data for SplitData:\n", sample_data_split)
print("Training Data:\n", training_set)
print("Validation Data:\n", validation_set)

## PreprocessData Function

This function preprocesses a numerical NumPy array by removing every row that contains a missing value (NaN) and then every row that has an outlier in any column, as judged by the 1.5 × IQR rule.

In [None]:
def PreprocessData(data):
    """
    Clean a numerical array by dropping incomplete rows and IQR outliers.

    Rows containing at least one NaN are removed first. Quartiles are then
    computed per column on the remaining rows, and any row holding a value
    outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` in at least one column is
    removed as well. A warning is printed when either step leaves no data.

    Args:
      data: A numpy.ndarray (or array-like) of numerical values.
            Can be 1D (single feature) or 2D (one row per sample).

    Returns:
      A float numpy.ndarray containing only the rows that survived both steps.
      A 1D input is returned as a 2D array with a single column.
    """
    if isinstance(data, np.ndarray):
        values = data.astype(float)
    else:
        values = np.array(data, dtype=float)

    # Treat a flat vector as one feature column so the row-wise logic below applies.
    if values.ndim == 1:
        values = values.reshape(-1, 1)

    rows_with_nan = np.isnan(values).any(axis=1)
    complete_rows = values[~rows_with_nan]
    if complete_rows.size == 0:
        print("Warning: All rows removed after NaN handling or data was initially empty.")
        return complete_rows

    # Per-column quartiles and the classic 1.5 * IQR fences.
    first_quartile = np.percentile(complete_rows, 25, axis=0)
    third_quartile = np.percentile(complete_rows, 75, axis=0)
    fence = 1.5 * (third_quartile - first_quartile)
    lowest_allowed = first_quartile - fence
    highest_allowed = third_quartile + fence

    # A row is kept only if every one of its columns lies inside the fences.
    inside_fences = (complete_rows >= lowest_allowed) & (complete_rows <= highest_allowed)
    kept_rows = complete_rows[inside_fences.all(axis=1)]

    # complete_rows is known to be non-empty here, so an empty result means
    # the outlier filter removed everything.
    if kept_rows.size == 0:
        print("Warning: All rows removed after outlier handling.")

    return kept_rows

### Example Usage for PreprocessData

In [None]:
# 2D demo: one row with a NaN, one with a very large value (100) and one with a very low value (-50)
sample_data_preprocess = np.array(
    [
        [1.0, 2.0, 3.0],
        [4.0, np.nan, 6.0],
        [7.0, 8.0, 9.0],
        [10.0, 11.0, 100.0],
        [13.0, 14.0, 15.0],
        [-50.0, 16.0, 17.0],
    ]
)
# 1D demo: a single feature containing one extreme value and one NaN
sample_data_1d = np.array([1, 2, 3, 4, 5, 100, np.nan])

print("Original Data for Preprocessing:\n", sample_data_preprocess)
preprocessed_set = PreprocessData(sample_data_preprocess)
print("Preprocessed Data (2D):\n", preprocessed_set)

print("\nOriginal 1D Data for Preprocessing:\n", sample_data_1d)
preprocessed_set_1d = PreprocessData(sample_data_1d)
print("Preprocessed 1D Data (becomes 2D):\n", preprocessed_set_1d)

## Regression Function (Polynomial with Gradient Descent)

This function performs polynomial regression using gradient descent.

In [None]:
def Regression(dataset, degree=2, num_iteration=1000, learning_rate=0.01):
    """
    Fit a single-feature polynomial regression with batch gradient descent.

    The first column of ``dataset`` is used as the feature x and the second
    column as the target y; any further columns are ignored. The model is
    ``y ~ w[0] + w[1] * x + ... + w[degree] * x**degree`` and the weights are
    found by minimising the mean squared error, starting from all zeros.
    The cost is printed roughly ten times over the run (every iteration when
    ``num_iteration`` is 10 or less).

    Args:
      dataset: 2D array-like with at least two columns (feature, target).
      degree: Highest power of x in the polynomial.
      num_iteration: Number of gradient-descent steps.
      learning_rate: Step size applied to the gradient.

    Returns:
      A 1D numpy.ndarray of ``degree + 1`` weights, intercept first.

    Raises:
      ValueError: If ``dataset`` is not 2D or has fewer than two columns.
    """
    # Always work in floating point. An integer ndarray used to be kept as-is,
    # so x**p could silently wrap around for large values or high degrees.
    dataset = np.asarray(dataset, dtype=float)
    if dataset.ndim != 2 or dataset.shape[1] < 2:
        raise ValueError("Dataset for regression must be 2D with at least two columns (features and target).")

    X_feature = dataset[:, 0]
    y = dataset[:, 1]
    m = len(y)

    # Design matrix [1, x, x^2, ..., x^degree]; x**0 supplies the intercept column.
    X_poly = np.column_stack([X_feature**p for p in range(degree + 1)])

    # Report progress about ten times; for short runs report every iteration.
    report_every = max(num_iteration // 10, 1)

    w = np.zeros(degree + 1)
    for i in range(num_iteration):
        y_pred = X_poly @ w
        cost = np.mean((y_pred - y)**2)
        # Gradient of the mean squared error with respect to w.
        gradient = (2/m) * X_poly.T @ (y_pred - y)
        w = w - learning_rate * gradient
        # The reported cost is the one measured before this iteration's update.
        if (i + 1) % report_every == 0 or i == num_iteration - 1:
            print(f"Iteration {i+1}/{num_iteration}, Cost: {cost:.4f}")
    return w

### Example Usage for Regression

In [None]:
# Reproducible synthetic data: a noisy quadratic sampled at 50 sorted points in [-5, 5)
np.random.seed(42)
X_sample_reg = np.sort(np.random.rand(50) * 10 - 5)
y_sample_reg = 0.5 * X_sample_reg**2 + X_sample_reg + 2 + np.random.randn(50) * 5
# Regression expects the feature in column 0 and the target in column 1
sample_reg_dataset = np.column_stack((X_sample_reg, y_sample_reg))
print("Sample Regression Dataset shape:", sample_reg_dataset.shape)

# Hyper-parameters for the demo fit
degree_val_ex = 2
iterations_val_ex = 10000
lr_val_ex = 0.001
learned_weights_ex = Regression(
    sample_reg_dataset,
    degree=degree_val_ex,
    num_iteration=iterations_val_ex,
    learning_rate=lr_val_ex,
)
print("\nLearned Weights (example):")
print(learned_weights_ex)

# Evaluate the fitted polynomial on a dense grid spanning the sampled x range
X_plot_ex = np.linspace(X_sample_reg.min(), X_sample_reg.max(), 100)
X_plot_poly_ex = np.ones((X_plot_ex.size, 1))
for p_ex in range(1, degree_val_ex + 1):
    X_p_plot_ex = X_plot_ex**p_ex
    X_plot_poly_ex = np.hstack((X_plot_poly_ex, X_p_plot_ex[:, np.newaxis]))
y_plot_ex = X_plot_poly_ex @ learned_weights_ex

# Scatter of the raw samples with the fitted curve drawn on top
plt.figure(figsize=(10, 6))
plt.scatter(X_sample_reg, y_sample_reg, label='Original Data', color='blue', alpha=0.7)
plt.plot(X_plot_ex, y_plot_ex, label=f'Fitted Polynomial (Degree {degree_val_ex})', color='red', linewidth=2)
plt.xlabel('X')
plt.ylabel('y')
plt.title('Polynomial Regression with Gradient Descent Example')
plt.legend()
plt.grid(True)
plt.show()

## MakePrediction Function (for Polynomial Regression)

Uses learned weights from `Regression` to make predictions.

In [None]:
def MakePrediction(w, test_features):
    """
    Evaluate a fitted single-feature polynomial at the given feature values.

    ``w`` holds the polynomial coefficients with the intercept first, so the
    polynomial degree is ``len(w) - 1`` and each prediction is
    ``w[0] + w[1] * x + ... + w[degree] * x**degree``.

    Args:
      w: 1D array-like of weights, intercept first (as returned by ``Regression``).
      test_features: Feature values x to predict for. May be a 1D array-like
                     or a single scalar, which is treated as one sample.

    Returns:
      A 1D numpy.ndarray with one prediction per feature value.
    """
    w = np.asarray(w)
    # Cast to float so integer inputs cannot overflow in x**p, and promote a
    # scalar (Python number or 0-D array) to a one-element vector. Previously a
    # plain Python scalar reached len() as a 0-D array and raised TypeError.
    test_features = np.atleast_1d(np.asarray(test_features, dtype=float))

    degree = len(w) - 1

    # Design matrix [1, x, x^2, ..., x^degree]; x**0 supplies the intercept column.
    X_poly_test = np.column_stack([test_features**p for p in range(degree + 1)])

    predictions = X_poly_test @ w
    return predictions

### Example Usage for MakePrediction

In [None]:
# This cell needs the weights and sample data produced by the Regression example cell.
if not all(name in globals() for name in ('learned_weights_ex', 'X_sample_reg', 'y_sample_reg')):
    print("Please run the Regression example cell first to generate 'learned_weights_ex'.")
else:
    # Predict at three new x positions using the weights learned above
    test_data_points_ex = np.array([-4, 0, 4])
    predictions_on_test_ex = MakePrediction(learned_weights_ex, test_data_points_ex)
    print(f"Test Data Points (example): {test_data_points_ex}")
    print(f"Predictions on Test Data (example): {predictions_on_test_ex}")

    plt.figure(figsize=(10, 6))
    plt.scatter(X_sample_reg, y_sample_reg, label='Original Data', color='blue', alpha=0.7)
    # Re-draw the fitted curve only when the plotting grid from the earlier cell is available
    if all(name in globals() for name in ('X_plot_ex', 'y_plot_ex')):
        plt.plot(
            X_plot_ex,
            y_plot_ex,
            label=f'Fitted Polynomial (Degree {len(learned_weights_ex)-1})',
            color='red',
            linewidth=2,
        )
    # zorder keeps the new-prediction markers on top of the other artists
    plt.scatter(
        test_data_points_ex,
        predictions_on_test_ex,
        label='New Predictions',
        color='green',
        s=100,
        marker='x',
        zorder=5,
    )
    plt.xlabel('X')
    plt.ylabel('y')
    plt.title('Polynomial Regression Example: Fitted Curve and New Predictions')
    plt.legend()
    plt.grid(True)
    plt.show()

## Train Model and Generate Result (Basic Part)

This section orchestrates the loading of data, preprocessing, model training, prediction, and result generation for the basic part of the lab.

### 1. Load Data

In [None]:
# Input CSV locations and the destination of the basic prediction file
training_file_basic = 'lab1_basic_training.csv'
testing_file_basic = 'lab1_basic_testing.csv'
output_dataroot_basic = 'lab1_basic.csv'

try:
    # Both CSVs carry one header line, which is skipped
    training_datalist_basic = np.genfromtxt(training_file_basic, delimiter=',', skip_header=1)
    raw_testing_datalist_basic = np.genfromtxt(testing_file_basic, delimiter=',', skip_header=1)
    # A multi-column test file contributes its first column as the feature;
    # anything else is used exactly as loaded.
    testing_datalist_basic_features = (
        raw_testing_datalist_basic[:, 0]
        if raw_testing_datalist_basic.ndim > 1 and raw_testing_datalist_basic.shape[1] > 0
        else raw_testing_datalist_basic
    )
    print(f"Successfully loaded {training_file_basic} and {testing_file_basic}.")
except FileNotFoundError:
    # Fall back to a small built-in dataset so the remaining cells can still run
    print("Warning: Basic data files not found. Using dummy data for basic part.")
    training_datalist_basic = np.array([
        [1, 2.5], [2, 3.5], [3, 4.8], [4, 6.1], [5, 7.0],
        [6, 8.2], [7, 9.5], [8, 10.3], [9, 11.8], [10, 13.2],
    ])
    testing_datalist_basic_features = np.array([11, 12, 13.5, 14.2, 15.0])

print("Initial basic training_datalist shape:", training_datalist_basic.shape)
print("Initial basic testing_datalist features shape:", testing_datalist_basic_features.shape)

### 2. Split Data (Basic)

In [None]:
# Hold out the trailing 20% of the basic training rows for validation
split_ratio_basic = 0.8
train_data_basic, val_data_basic = SplitData(
    data=training_datalist_basic,
    split_ratio=split_ratio_basic,
)
print(f"Basic training data shape: {train_data_basic.shape}")
print(f"Basic validation data shape: {val_data_basic.shape}")

### 3. Preprocess Data (Basic)

In [None]:
# Training rows: drop rows with NaN and rows flagged as IQR outliers.
processed_train_data_basic = PreprocessData(train_data_basic)
print(f"Processed basic training data shape: {processed_train_data_basic.shape}")
if processed_train_data_basic.size == 0:
    raise ValueError("Basic training data is empty after preprocessing.")

# Validation rows are cleaned the same way (using their own quartiles).
processed_val_data_basic = PreprocessData(val_data_basic)
print(f"Processed basic validation data shape: {processed_val_data_basic.shape}")

# Test features must keep every row: the output file numbers its predictions
# 1..N in order, so removing NaN/outlier rows here would shift the Ids away
# from the rows of the test file. The features are only converted to a float
# column (the 2D one-column layout the prediction cell indexes with [:, 0]).
processed_testing_datalist_basic_features = np.array(testing_datalist_basic_features, dtype=float).reshape(-1, 1)
missing_test_mask_basic = np.isnan(processed_testing_datalist_basic_features[:, 0])
if missing_test_mask_basic.any():
    # A missing feature cannot be predicted from; substitute the median of the
    # cleaned training feature so the row still receives a finite prediction.
    processed_testing_datalist_basic_features[missing_test_mask_basic, 0] = np.median(processed_train_data_basic[:, 0])
    print(f"Note: {int(missing_test_mask_basic.sum())} missing basic testing feature value(s) replaced with the training median.")
print(f"Processed basic testing features shape: {processed_testing_datalist_basic_features.shape}")
if processed_testing_datalist_basic_features.size == 0:
    raise ValueError("Basic testing data is empty after preprocessing.")

### 4. Train Regression Model (Basic)

In [None]:
# Hyper-parameters of the basic polynomial fit
degree_basic_train = 2
iterations_basic_train = 10000
lr_basic_train = 0.001

print("Training basic regression model...")
# Fit on the cleaned training rows (column 0 = feature, column 1 = target)
w_basic = Regression(
    processed_train_data_basic,
    degree=degree_basic_train,
    num_iteration=iterations_basic_train,
    learning_rate=lr_basic_train,
)
print("Learned weights for basic model (w_basic):", w_basic, sep="\n")

### 5. Predict on Validation Set and Calculate MAPE (Basic)

In [None]:
# MAPE needs at least one validation row that has both a feature and a label column.
if processed_val_data_basic.size == 0 or processed_val_data_basic.shape[1] < 2:
    print("Basic validation data is empty or invalid. Skipping MAPE calculation.")
else:
    val_features_basic = processed_val_data_basic[:, 0]
    val_labels_basic = processed_val_data_basic[:, 1]
    val_predictions_basic = MakePrediction(w_basic, val_features_basic)

    try:
        # Zero labels would divide by zero in the percentage error, so they are left out.
        non_zero_mask_basic = val_labels_basic != 0
        if not np.any(non_zero_mask_basic):
            print("Warning: All basic validation labels are zero. MAPE is undefined.")
        else:
            # Mean absolute percentage error over the non-zero labels
            mape_basic = 100 * np.mean(np.abs(
                (val_labels_basic - val_predictions_basic)[non_zero_mask_basic]
                / val_labels_basic[non_zero_mask_basic]
            ))
            print(f"MAPE on Basic Validation Set (excluding zero labels): {mape_basic:.2f}%")
        if not np.all(non_zero_mask_basic):
            print("Note: Some basic validation labels were zero and excluded from MAPE calculation.")
    except Exception as exc:
        print(f"An error occurred during basic MAPE calculation: {exc}")

### 6. Make Prediction on Testing Dataset (Basic)

In [None]:
# The processed test features are stored as a single column; flatten it to 1D for prediction
test_features_basic_final = processed_testing_datalist_basic_features[:, 0]
output_datalist_basic = MakePrediction(w=w_basic, test_features=test_features_basic_final)
print("Predictions on basic testing dataset (output_datalist_basic):", output_datalist_basic, sep="\n")

### 7. Write Output File (Basic)

In [None]:
# Writing requires both the predictions and the output path from the earlier cells.
if not {'output_datalist_basic', 'output_dataroot_basic'}.issubset(globals()):
    print("Error: Basic output data not defined. Cannot save basic predictions.")
else:
    # Ids are 1-based and follow the order of the predictions
    ids_basic = np.arange(1, len(output_datalist_basic) + 1)
    output_array_basic = np.column_stack((ids_basic, output_datalist_basic))
    # comments='' stops savetxt from prefixing the header line with '# '
    np.savetxt(
        output_dataroot_basic,
        output_array_basic,
        delimiter=',',
        header='Id,gripForce',
        fmt=['%d', '%.6f'],
        comments='',
    )
    print(f"Basic predictions saved to {output_dataroot_basic}")

## Train Model and Generate Result (Advanced Part)

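This section handles multi-feature linear regression solved in closed form (the least-squares solution of the normal equation, rather than gradient descent), including a categorical feature (gender) that is assumed to be already encoded numerically.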

### 1. Load Advanced Data

In [None]:
# Input CSV locations and the destination of the advanced prediction file
training_file_adv = 'lab1_advanced_training.csv'
testing_file_adv = 'lab1_advanced_testing.csv'
output_dataroot_adv = 'lab1_advanced.csv'

try:
    # Training file: expected columns are feature1, feature2, gender, gripForce
    training_datalist_adv = np.genfromtxt(training_file_adv, delimiter=',', skip_header=1)
    # Testing file: expected columns are feature1, feature2, gender
    testing_datalist_adv = np.genfromtxt(testing_file_adv, delimiter=',', skip_header=1)
    print(f"Successfully loaded {training_file_adv} and {testing_file_adv}.")
except FileNotFoundError:
    # Fall back to a small built-in dataset so the remaining cells can still run
    print("Warning: Advanced data files not found. Using dummy data for advanced part.")
    # Dummy training rows: feature1, feature2, gender (0/1), gripForce
    training_datalist_adv = np.array([
        [25, 170, 0, 30.5],
        [30, 165, 1, 25.2],
        [22, 175, 0, 35.0],
        [45, 155, 1, 22.8],
        [28, 180, 0, 38.1],
        [35, 160, 1, 28.0],
        [40, 170, 0, 33.5],
        [20, 185, 0, 40.2],
        [50, 150, 1, 20.0],
        [33, 172, 0, 32.8],
        [np.nan, 160, 1, 21.0],  # row with a NaN
        [30, 250, 0, 55.0],      # row with an outlier
    ])
    # Dummy testing rows: feature1, feature2, gender (0/1)
    testing_datalist_adv = np.array([
        [26, 172, 0],
        [32, 168, 1],
        [24, 178, 0],
        [48, 158, 1],
        [30, 182, 0],
        [150, 160, 0],  # row with an outlier
    ])

print("Initial advanced training_datalist shape:", training_datalist_adv.shape)
print("Initial advanced testing_datalist shape:", testing_datalist_adv.shape)

### 2. Preprocess Data (Advanced)

In [None]:
def PreprocessDataAdvanced(data, gender_column_index=2):
    """
    Clean the advanced dataset: drop rows with NaN, then drop IQR outliers.

    NaN removal looks at every column. The outlier filter (1.5 * IQR fences,
    computed per column) looks at every column except the gender column,
    which is assumed to be numerically encoded already (e.g., 0/1). When
    ``gender_column_index`` is not a valid non-negative column index, all
    columns take part in the outlier filter.

    Args:
      data: 2D numpy.ndarray (or array-like) with one row per sample.
      gender_column_index: Index of the column excluded from outlier detection.

    Returns:
      A float numpy.ndarray holding the rows that survived both steps,
      with all original columns (including gender) intact.
    """
    values = data.astype(float) if isinstance(data, np.ndarray) else np.array(data, dtype=float)

    # Step 1: a NaN anywhere in a row disqualifies the whole row.
    complete_rows = values[~np.isnan(values).any(axis=1)]
    if complete_rows.size == 0:
        print("Warning: All rows removed after NaN handling or data was initially empty.")
        return complete_rows

    # Step 2: choose the columns that take part in the outlier filter.
    column_count = complete_rows.shape[1]
    use_for_iqr = np.ones(column_count, dtype=bool)
    if 0 <= gender_column_index < column_count:
        use_for_iqr[gender_column_index] = False
    screened = complete_rows[:, use_for_iqr]

    # Nothing left to screen (e.g., gender is the only column): keep every complete row.
    if screened.size == 0:
        return complete_rows

    # Step 3: per-column 1.5 * IQR fences on the screened columns only.
    first_quartile = np.percentile(screened, 25, axis=0)
    third_quartile = np.percentile(screened, 75, axis=0)
    fence = 1.5 * (third_quartile - first_quartile)
    inside_fences = (screened >= first_quartile - fence) & (screened <= third_quartile + fence)

    # Keep whole rows (gender included) whose screened columns are all inside the fences.
    kept_rows = complete_rows[inside_fences.all(axis=1)]

    # complete_rows is non-empty here, so an empty result is due to the outlier filter.
    if kept_rows.size == 0:
        print("Warning: All rows removed after outlier handling.")

    return kept_rows

# Assuming gender is the 3rd column (index 2) in training_datalist_adv and testing_datalist_adv
gender_col_idx_adv = 2
# Training rows: drop rows with NaN and IQR outliers (gender is excluded from the outlier check).
processed_training_adv = PreprocessDataAdvanced(training_datalist_adv, gender_column_index=gender_col_idx_adv)
print(f"Processed advanced training data shape: {processed_training_adv.shape}")
if processed_training_adv.size == 0:
    raise ValueError("Advanced training data is empty after preprocessing.")

# Test rows must all be kept: the output file numbers its predictions 1..N in
# order, so removing NaN/outlier rows here would shift the Ids away from the
# rows of the test file. The data is only converted to a float 2D array.
processed_testing_adv = np.array(testing_datalist_adv, dtype=float)
if processed_testing_adv.ndim == 1:
    # NOTE(review): a 1D array is taken to be a single test row — confirm against the test file.
    processed_testing_adv = processed_testing_adv.reshape(1, -1)
missing_test_mask_adv = np.isnan(processed_testing_adv)
if missing_test_mask_adv.any():
    # Fill missing test values with the median of the matching cleaned training
    # column. This assumes the training columns are the same features in the
    # same order, followed by the target. For a 0/1 gender column the median is
    # the majority class (0.5 on an exact tie).
    train_feature_medians_adv = np.median(processed_training_adv[:, :processed_testing_adv.shape[1]], axis=0)
    processed_testing_adv = np.where(missing_test_mask_adv, train_feature_medians_adv, processed_testing_adv)
    print(f"Note: {int(missing_test_mask_adv.sum())} missing advanced testing value(s) replaced with training medians.")
print(f"Processed advanced testing data shape: {processed_testing_adv.shape}")
if processed_testing_adv.size == 0:
    raise ValueError("Advanced testing data is empty after preprocessing.")

### 3. Split Data (Advanced)

In [None]:
# Hold out the trailing 20% of the cleaned advanced training rows for validation
split_ratio_adv = 0.8
train_data_adv, val_data_adv = SplitData(
    data=processed_training_adv,
    split_ratio=split_ratio_adv,
)
print(f"Advanced training data shape: {train_data_adv.shape}")
print(f"Advanced validation data shape: {val_data_adv.shape}")

### 4. Train Advanced Regression Model (Matrix Inversion)

In [None]:
def RegressionMatrixInversion(dataset):
    """
    Fit a multiple linear regression in closed form (least squares).

    All columns except the last are used as features and the last column is
    the target y. A column of ones is prepended to the features so that the
    first returned weight is the intercept.

    The weights are the minimum-norm least-squares solution of
    ``X_intercept @ w ~ y``. Mathematically this equals the normal-equation
    result ``pinv(X^T X) @ X^T @ y``, but it is computed directly from
    ``X_intercept``: forming ``X^T X`` squares the condition number, which
    makes the pseudo-inverse discard real information when features are on
    large or un-centred scales.

    Args:
      dataset: 2D array-like with at least two columns (features..., target).

    Returns:
      A 1D numpy.ndarray of weights: intercept first, then one weight per
      feature column in order.

    Raises:
      ValueError: If ``dataset`` is not 2D or has fewer than two columns.
      numpy.linalg.LinAlgError: If the least-squares computation does not converge.
    """
    dataset = np.asarray(dataset, dtype=float)
    if dataset.ndim != 2 or dataset.shape[1] < 2:
        raise ValueError("Dataset must be a 2D NumPy array with at least two columns (features + target).")

    X = dataset[:, :-1]  # All columns except the last are features
    y = dataset[:, -1]   # The last column is the target

    # Add intercept term (column of ones) to X
    X_intercept = np.column_stack((np.ones(X.shape[0]), X))

    # rcond=None selects NumPy's machine-precision-based singular-value cutoff.
    w, _residuals, _rank, _singular_values = np.linalg.lstsq(X_intercept, y, rcond=None)
    return w

# Training needs at least one row and at least one feature column besides the target.
if train_data_adv.size == 0 or train_data_adv.shape[1] <= 1:
    print("Advanced training data is empty or invalid. Skipping model training.")
    w_advanced = None  # Keep the name defined so later cells can test for it
else:
    print("Training advanced regression model using matrix inversion...")
    w_advanced = RegressionMatrixInversion(train_data_adv)
    print("Learned weights for advanced model (w_advanced - intercept first):", w_advanced, sep="\n")

### 5. Predict on Validation Set and Calculate MAPE (Advanced)

In [None]:
def MakePredictionAdvanced(w, test_features_dataset):
    """
    Predict targets with the weights of the closed-form linear regression.

    Args:
      w: Weight vector with the intercept first, followed by one weight per feature.
      test_features_dataset: Features only (no target column), one row per
                             sample. A 1D input is treated as a single sample
                             with several features.

    Returns:
      A numpy.ndarray with one prediction per sample.
    """
    features = test_features_dataset
    if not isinstance(features, np.ndarray):
        features = np.array(features, dtype=float)
    # Promote a flat feature vector to a single-row matrix.
    if features.ndim == 1:
        features = features[np.newaxis, :]

    # Prepend the column of ones that pairs with the intercept weight.
    intercept_column = np.ones((features.shape[0], 1))
    return np.hstack((intercept_column, features)) @ w

# MAPE needs trained weights plus at least one validation row with features and a label.
if w_advanced is None or val_data_adv.size == 0 or val_data_adv.shape[1] <= 1:
    print("Advanced validation data/weights invalid. Skipping MAPE calculation.")
else:
    val_features_adv = val_data_adv[:, :-1]  # every column but the last
    val_labels_adv = val_data_adv[:, -1]     # the last column holds the target

    val_predictions_adv = MakePredictionAdvanced(w_advanced, val_features_adv)

    try:
        # Zero labels would divide by zero in the percentage error, so they are left out.
        non_zero_mask_adv = val_labels_adv != 0
        if not np.any(non_zero_mask_adv):
            print("Warning: All advanced validation labels are zero. MAPE is undefined.")
        else:
            # Mean absolute percentage error over the non-zero labels
            mape_adv = 100 * np.mean(np.abs(
                (val_labels_adv - val_predictions_adv)[non_zero_mask_adv]
                / val_labels_adv[non_zero_mask_adv]
            ))
            print(f"MAPE on Advanced Validation Set (excluding zero labels): {mape_adv:.2f}%")
        if not np.all(non_zero_mask_adv):
            print("Note: Some advanced validation labels were zero and excluded from MAPE calculation.")
    except Exception as exc:
        print(f"An error occurred during advanced MAPE calculation: {exc}")

### 6. Make Prediction on Advanced Testing Dataset

In [None]:
# Predictions need both trained weights and a non-empty test set.
if w_advanced is None or processed_testing_adv.size == 0:
    print("Advanced testing data or weights invalid. Cannot make predictions.")
    output_datalist_advanced = np.array([])  # Keep the name defined for the saving step
else:
    # processed_testing_adv holds feature columns only (the test file has no target column)
    output_datalist_advanced = MakePredictionAdvanced(w_advanced, processed_testing_adv)
    print("Predictions on advanced testing dataset (output_datalist_advanced):", output_datalist_advanced, sep="\n")

### 7. Write Output File (Advanced)

In [None]:
# Decide between: predictions missing, predictions empty, or ready to be written.
if 'output_datalist_advanced' not in globals():
    print("Error: Advanced output data not defined. Cannot save advanced predictions.")
elif output_datalist_advanced.size == 0:
    print("No advanced predictions to save (output_datalist_advanced is empty).")
elif 'output_dataroot_adv' in globals():
    # Ids are 1-based and follow the order of the predictions
    ids_adv = np.arange(1, len(output_datalist_advanced) + 1)
    output_array_adv = np.column_stack((ids_adv, output_datalist_advanced))
    # comments='' stops savetxt from prefixing the header line with '# '
    np.savetxt(
        output_dataroot_adv,
        output_array_adv,
        delimiter=',',
        header='Id,gripForce',
        fmt=['%d', '%.6f'],
        comments='',
    )
    print(f"Advanced predictions saved to {output_dataroot_adv}")
else:
    # Predictions exist but the output path was never defined
    print("Error: Advanced output data not defined. Cannot save advanced predictions.")