In [1]:
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
from implementations import *

# Pipeline switches, read as module-level globals by data_processing().
# PCA: when True, the processed features are projected with pca().
PCA = True
# HINGE: not read anywhere in the visible cells — presumably selects the hinge
# model for the final training; TODO confirm.
HINGE = True
# NN_METHOD: when True, features are min-max normalized and outlier removal is
# skipped; when False, features are standardized and z-score outliers are removed.
NN_METHOD = False
# NOTE(review): the seed is disabled, so the random search below (np.random.uniform)
# and the random weight initialisation give different results on every run.
# np.random.seed(10)

## Data Loading

In [2]:
# Load the raw train/test features, the training labels and the sample ids.
# NOTE(review): the dataset location is an absolute, machine-specific path; change
# it to your local copy of the dataset before running.
# sub_sample=True presumably loads only a subset of the rows — TODO confirm in load_csv_data.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("/home/zewzhang/Course/ML/ML_course/projects/project1/data/dataset_to_release", sub_sample=True)

## Data preprocessing

Before feeding data into the regressor, we preprocessed the data through the following pipeline:
- **Data dropping**: We dropped the rows and columns which had more than 50%~70% (possible to tune) missing values.

- **Missing value imputation**: We imputed missing values with the column mean for non-categorical features, and with the most frequent category in the column for categorical features.

- **One-hot encoding**: We one-hot encoded the categorical features in the dataset.

- **Feature scaling**: We scaled the features according to the model we used. For logistic/hinge regression, we used a standard scaler. For the neural network, we used a min-max scaler.

- **Outlier detection**: We detected outliers in the dataset using Z-score. We removed the outliers from the dataset for non-neural network models. For neural network, we did not remove the outliers.

- **Feature selection**: We selected the features using PCA for non-neural network models. For neural network, we did not do feature selection.

In [3]:
def data_processing(x_train, y_train, x_test, row_nan, feature_nan, z_threshold, feature_threshold,
                    threshold_cat=10, n_components=150):
    """Run the preprocessing pipeline on the training and test data.

    Steps, in order: label re-encoding, dropping rows and features with too many
    NaNs, NaN imputation, one-hot encoding, scaling (plus outlier removal for
    non-NN models) and, optionally, PCA. The module-level flags NN_METHOD and
    PCA select the scaling branch and enable the projection.

    Args:
        x_train: training feature matrix (copied, not modified in place).
        y_train: training labels (copied, not modified in place).
        x_test: test feature matrix.
        row_nan: NaN threshold passed to drop_rows.
        feature_nan: NaN threshold passed to drop_features.
        z_threshold: z-score threshold passed to z_outlier_removal.
        feature_threshold: outlier-feature threshold passed to z_outlier_removal.
        threshold_cat: threshold passed to check_categorical to decide which
            features are categorical (default 10, the previously hard-coded value).
        n_components: number of components requested from pca (default 150,
            the previously hard-coded value).

    Returns:
        (x_train_processed, y_train_processed, x_test_processed)
    """

    x_train_processed = x_train.copy()
    y_train_processed = y_train.copy()

    # transform y to 0-1 encoding
    y_train_processed = process_y(y_train_processed)

    # Remove rows with too many nans; the returned row indices are not needed here.
    # (original note: 0.55 remains 6101 rows)
    x_train_processed, y_train_processed, _ = drop_rows(x_train_processed, y_train_processed, row_nan)
    # Remove features with too many nans (original note: 0.5 remains 174 features)
    x_train_processed, nan_indices = drop_features(x_train_processed, feature_nan)
    # Select the same columns from the test set.
    # NOTE(review): this assumes nan_indices holds the indices of the KEPT
    # features — verify against drop_features.
    x_test_processed = x_test[:, nan_indices].copy()

    # get categorical feature indices (decided on the training data only)
    cat_indices = check_categorical(x_train_processed, threshold_cat)
    # handling remaining nans
    # NOTE(review): fillna is called separately on the test set; presumably it
    # imputes with the test set's own statistics — confirm this is intended.
    x_train_processed = fillna(x_train_processed, cat_indices)
    x_test_processed = fillna(x_test_processed, cat_indices)

    # One hot encoding for categorical features
    x_train_processed, x_test_processed = one_hot_encoding(x_train_processed, x_test_processed, cat_indices)

    if NN_METHOD:
        # normalize data if using NN; the test set reuses the training max/min
        x_train_processed, train_max, train_min = normalization(x_train_processed)
        x_test_processed, _, _ = normalization(x_test_processed, train_max, train_min)
    else:
        # standardize data and remove outliers if using other regression methods;
        # the test set is scaled with the training mean/std
        x_train_processed, train_mean, train_std = standardize(x_train_processed)
        x_test_processed = (x_test_processed - train_mean) / train_std
        # Remove outliers (training samples only)
        x_train_processed, y_train_processed = z_outlier_removal(x_train_processed, y_train_processed, z_threshold, feature_threshold)

    if PCA:
        x_train_processed, eig_vec, _, _ = pca(x_train_processed, n_components)
        # keep only the real part of the projections
        x_train_processed = np.real(x_train_processed)
        # project the test set onto the eigenvectors fitted on the training set
        x_test_processed = np.real(x_test_processed @ eig_vec)

    return x_train_processed, y_train_processed, x_test_processed

Process the raw datasets using the predefined threshold values below.

In [4]:
# Preprocessing thresholds passed to data_processing().
# threshold percentage used to remove rows (samples) with too many NaN
row_nan = 0.5 
# threshold percentage used to remove features with too many NaN
feature_nan = 0.4 
# threshold of z score used to remove outliers
z_threshold = 2  
# threshold of percentage of outlier features used to remove samples
feature_threshold = 0.3 
# Run the full preprocessing pipeline on the raw train/test data.
x_train_processed, y_train_processed, x_test_processed = data_processing(x_train, y_train, x_test, row_nan, feature_nan, z_threshold, feature_threshold)

## Hyperparameter tuning 

We applied random search to three models: regularized logistic regression, hinge regression, and ridge regression. We tuned the hyperparameters of the models over the following ranges: $\lambda \in [0.0001, 1]$, $\gamma \in [0.0001, 1]$. For the ridge model, only $\lambda$ is optimized. We used 5-fold cross-validation and the average of the best F1 scores across folds to evaluate the performance of the models.

The best F1 score is the highest score among all the prediction thresholds. The prediction threshold is used to convert the predicted probabilities into predicted labels. For the regularized logistic regression and ridge regression models, we searched thresholds in the range [0.1, 0.9] with a step size of 0.1. For the hinge regression model, we searched thresholds in the range [0.1, 3] with a step size of 0.1. The best threshold is the one that gives the highest F1 score.

In [12]:
def random_search(params, reg_name, objective_func, n_iters=50):
    """Random search for hyperparameter tuning.

    Args:
        params: dict mapping each hyperparameter name to a sequence whose first
            two items are the lower and upper bound; one value is drawn
            uniformly from that range at every iteration.
        reg_name: model identifier, passed through unchanged to objective_func.
        objective_func: callable ``(param_dict, reg_name) -> score``; a higher
            score is better.
        n_iters: number of random draws to evaluate (default 50).

    Returns:
        ``(best_params, best_score)``. best_params is None (and best_score is
        ``-inf``) only when no draw produced a comparable score, e.g.
        ``n_iters == 0`` or every score is NaN.
    """
    best_params = None
    # Start below any real score so that a run whose scores are all 0 still
    # reports the parameters that achieved it instead of returning None.
    best_score = -np.inf
    for i in range(n_iters):
        param = {name: np.random.uniform(bounds[0], bounds[1]) for name, bounds in params.items()}
        score = objective_func(param, reg_name)
        if score > best_score:
            best_score = score
            best_params = param
        print("Iteration {}: parameters = {}, score = {}".format(i, param, score))
        print("Best score = {} and best params = {}".format(best_score, best_params))
    return best_params, best_score

def cross_validation(model, x, y, k_fold=10):
    """K-fold cross-validation for regression models.

    The data is split into k_fold parts by split_cross_validation. Each part
    serves once as the validation set while the remaining parts are stacked
    into the training set, which is then passed through data_augmentation
    before fitting.

    Args:
        model: callable ``(x_train, y_train, x_val, y_val)`` returning
            ``(w, loss, best_f1, best_threshold)``.
        x: feature matrix.
        y: labels.
        k_fold: number of folds.

    Returns:
        Three lists with one entry per fold: best f1 scores, losses, and best
        thresholds.
    """
    folds_x, folds_y = split_cross_validation(x, y, k_fold)
    f1s, losses, thresholds = [], [], []

    for held_out in range(k_fold):
        # the held-out fold is the validation set ...
        x_val, y_val = folds_x[held_out], folds_y[held_out]
        # ... and every other fold goes into the training set
        rest_x = [fold for idx, fold in enumerate(folds_x) if idx != held_out]
        rest_y = [fold for idx, fold in enumerate(folds_y) if idx != held_out]
        x_train = np.vstack(rest_x)
        y_train = np.hstack(rest_y)
        # data augmentation for unbalanced data (training part only)
        x_train, y_train = data_augmentation(x_train, y_train)
        # the fitted weights are not needed for scoring
        _, loss, best_f1, best_threshold = model(x_train, y_train, x_val, y_val)
        f1s.append(best_f1)
        losses.append(loss)
        thresholds.append(best_threshold)

    return f1s, losses, thresholds

def object_function(params, x_train, y_train, reg_name, max_iters=500, n_pat=20):
    """Objective function for random search: mean best f1 over 5-fold CV.

    Args:
        params: dict of sampled hyperparameters. 'reg_log' and 'hinge' read
            'lamb' and 'gamma'; 'ridge' reads only 'lamb'.
        x_train: processed training features.
        y_train: processed training labels.
        reg_name: one of 'reg_log', 'hinge', 'ridge'.
        max_iters: forwarded as max_iters to the iterative models.
        n_pat: forwarded as n_pat to the iterative models (presumably the
            early-stopping patience — confirm in the implementations).

    Returns:
        Mean of the per-fold best f1 scores.

    Raises:
        ValueError: if reg_name is not one of the supported model names.
    """
    if reg_name == 'reg_log':
        lamb = params['lamb']
        gamma = params['gamma']

        # adapt the model to the (x_train, y_train, x_val, y_val) signature that
        # cross_validation expects; a bias column is added to both feature sets
        def model(x_tr, y_tr, x_val, y_val):
            return reg_logistic_regression_var(y_tr, add_bias(x_tr), y_val, add_bias(x_val),
                                               lambda_=lamb, initial_w=np.random.randn(x_tr.shape[1] + 1) * 0.01,
                                               max_iters=max_iters, gamma=gamma, n_pat=n_pat)
    elif reg_name == 'hinge':
        lamb = params['lamb']
        gamma = params['gamma']

        # no bias column is added for the hinge model
        def model(x_tr, y_tr, x_val, y_val):
            return hinge_regression(y_tr, x_tr, y_val, x_val,
                                    lambda_=lamb, initial_w=np.random.randn(x_tr.shape[1]) * 0.01,
                                    max_iters=max_iters, gamma=gamma, n_pat=n_pat)
    elif reg_name == 'ridge':
        lamb = params['lamb']

        def model(x_tr, y_tr, x_val, y_val):
            return ridge_regression_var(y_tr, add_bias(x_tr), y_val, add_bias(x_val), lambda_=lamb)
    else:
        # fail early with a clear message instead of an unbound `model` later on
        raise ValueError(
            "Unknown reg_name {!r}; expected 'reg_log', 'hinge' or 'ridge'".format(reg_name)
        )

    # cross-validation; only the f1 scores are used for the objective
    f1s, _, _ = cross_validation(model, x_train, y_train, k_fold=5)

    return np.mean(f1s)

# Bind the processed training data so that random_search only needs to pass
# (params, reg_name) to the objective.
obj_func = lambda params, reg_name: object_function(params, x_train_processed, y_train_processed, reg_name)
# Best hyperparameters found so far, keyed by model name ("hinge", "reg_log", "ridge").
best_params = {}

#### Hinge regression:

In [13]:
# Search ranges [low, high]; random_search samples each value uniformly.
params = {'lamb': [0.0001, 1], 'gamma': [0.0001, 1]}
# Tune the hinge model and store its best parameters under "hinge".
best_params["hinge"], best_f1 = random_search(params, "hinge", obj_func)
# Note: this prints the whole best_params dict (every model tuned so far).
print("Best params: ", best_params)
print("Best f1 score: ", best_f1)

Early stopping at iteration  21
Early stopping at iteration  21
Early stopping at iteration  21
Early stopping at iteration  21
Early stopping at iteration  21
Iteration 0: parameters = {'lamb': 0.9197405980398594, 'gamma': 0.6163948640348041}, score = 0.3055215445324232
Best score = 0.3055215445324232 and best params = {'lamb': 0.9197405980398594, 'gamma': 0.6163948640348041}
Early stopping at iteration  37
Early stopping at iteration  33
Early stopping at iteration  24
Early stopping at iteration  24
Early stopping at iteration  23
Iteration 1: parameters = {'lamb': 0.2803927684009535, 'gamma': 0.3434348696049686}, score = 0.4197665594513785
Best score = 0.4197665594513785 and best params = {'lamb': 0.2803927684009535, 'gamma': 0.3434348696049686}
Early stopping at iteration  60
Early stopping at iteration  38
Early stopping at iteration  48
Early stopping at iteration  22
Early stopping at iteration  48
Iteration 2: parameters = {'lamb': 0.8898737351039147, 'gamma': 0.36320209927237

#### Regularized logistic regression:

In [14]:
# Search ranges [low, high]; random_search samples each value uniformly.
params = {'lamb': [0.0001, 1], 'gamma': [0.0001, 1]}
# Tune regularized logistic regression and store its best parameters under "reg_log".
best_params["reg_log"], best_f1 = random_search(params, "reg_log", obj_func)
# Note: this prints the whole best_params dict (every model tuned so far).
print("Best params: ", best_params)
print("Best f1 score: ", best_f1)

  total_loss = -np.sum(y * np.log(pred_probs) + (1 - y) * np.log(1 - pred_probs))
  total_loss = -np.sum(y * np.log(pred_probs) + (1 - y) * np.log(1 - pred_probs))


Early stopping at iteration  21
Early stopping at iteration  21
Early stopping at iteration  21
Early stopping at iteration  21
Early stopping at iteration  21
Iteration 0: parameters = {'lamb': 0.8428640078058134, 'gamma': 0.6506076419499655}, score = 0.3260682829048598
Best score = 0.3260682829048598 and best params = {'lamb': 0.8428640078058134, 'gamma': 0.6506076419499655}
Early stopping at iteration  21
Early stopping at iteration  21
Early stopping at iteration  21
Early stopping at iteration  21
Early stopping at iteration  21
Iteration 1: parameters = {'lamb': 0.8084891885149289, 'gamma': 0.6027607166015813}, score = 0.32927826222380957
Best score = 0.32927826222380957 and best params = {'lamb': 0.8084891885149289, 'gamma': 0.6027607166015813}
Early stopping at iteration  29
Early stopping at iteration  24
Early stopping at iteration  23
Early stopping at iteration  42
Early stopping at iteration  27
Iteration 2: parameters = {'lamb': 0.35757656337473886, 'gamma': 0.30939832709

#### Ridge regression:

In [15]:
# Ridge regression only reads 'lamb', so that is the only range searched.
params = {'lamb': [0.0001, 1]}
# Tune ridge regression and store its best parameters under "ridge".
best_params["ridge"], best_f1 = random_search(params, "ridge", obj_func)
# Note: this prints the whole best_params dict (every model tuned so far).
print("Best params: ", best_params)
print("Best f1 score: ", best_f1)

Iteration 0: parameters = {'lamb': 0.9326232038576826}, score = 0.34981217732806247
Best score = 0.34981217732806247 and best params = {'lamb': 0.9326232038576826}
Iteration 1: parameters = {'lamb': 0.26811808044418484}, score = 0.3530608662106287
Best score = 0.3530608662106287 and best params = {'lamb': 0.26811808044418484}
Iteration 2: parameters = {'lamb': 0.19905678659654344}, score = 0.35001337878986644
Best score = 0.3530608662106287 and best params = {'lamb': 0.26811808044418484}
Iteration 3: parameters = {'lamb': 0.3108396328635337}, score = 0.3467136379190978
Best score = 0.3530608662106287 and best params = {'lamb': 0.26811808044418484}
Iteration 4: parameters = {'lamb': 0.5550607324932595}, score = 0.3524183795683931
Best score = 0.3530608662106287 and best params = {'lamb': 0.26811808044418484}
Iteration 5: parameters = {'lamb': 0.8008686833484968}, score = 0.3520641899047573
Best score = 0.3530608662106287 and best params = {'lamb': 0.26811808044418484}
Iteration 6: param

All the optimal hyperparameters are saved in `best_params` and can be used for the final training later.

#### Neural network:

To improve the prediction accuracy, we also implemented a neural network model, because a neural network can capture non-linear relationships between the features and the target.