# Example for creating Submission CSV

In [2]:
import pandas as pd
import cudf # GPU-accelerated dataframe library
import cuml # GPU-accelerated machine learning library
from cuml.linear_model import LogisticRegression # GPU-accelerated logistic regression
from cuml.preprocessing import StandardScaler # GPU-accelerated standard scaler
from cuml.metrics import accuracy_score # GPU-accelerated accuracy score
import optuna # Hyperparameter optimization framework

## Setup Train/Test Data

In [3]:
# Retrieve Training and Testing Data 
df_train = pd.read_csv('data/trainClean.csv')
df_test = pd.read_csv('data/testClean.csv')

## Setup features and target

In [4]:
# Convert pandas dataframes to cudf dataframes
df_train = cudf.from_pandas(df_train)
df_test = cudf.from_pandas(df_test)

X_train = df_train.drop(columns='Transported')
y_train = df_train['Transported']

## Use Algorithm

In [6]:
# Create a StandardScaler object and fit it on the training data
scaler = StandardScaler()
scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = scaler.transform(X_train)

def objective(trial):
    # Define hyperparameters to optimize
    C = trial.suggest_loguniform('C', 1e-5, 1e2) # Inverse of regularization strength
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2']) # Regularization type
    l1_ratio = trial.suggest_uniform('l1_ratio', 0, 1) # Ratio of L1 to L2 regularization
    
    # Create LogisticRegression object with suggested hyperparameters
    logreg = LogisticRegression(
        C=C,
        penalty=penalty,
        l1_ratio=l1_ratio,
        fit_intercept=True,
        max_iter=100
    )

    # Perform cross-validation with the LogisticRegression model
    scores = cross_val_score(logreg, X_train_scaled, y_train, cv=5, n_jobs=-1).mean()
    
    return scores.mean()  # Return the mean score from cross-validation

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # You can adjust the number of trials

# Print best hyperparameters and best score
best_params = study.best_params
best_score = study.best_value
print("Best parameters found:", best_params)
print("Best score found:", best_score)



[I 2023-11-20 19:57:56,405] A new study created in memory with name: no-name-56d84a64-be21-4940-9ac6-4f42fa73dbee
  C = trial.suggest_loguniform('C', 1e-5, 1e2) # Inverse of regularization strength
  l1_ratio = trial.suggest_uniform('l1_ratio', 0, 1) # Ratio of L1 to L2 regularization
[W 2023-11-20 19:57:56,406] Trial 0 failed with parameters: {'C': 1.161289510337848, 'penalty': 'l1', 'l1_ratio': 0.7533942836393356} because of the following error: NameError("name 'LogisticRegression' is not defined").
Traceback (most recent call last):
  File "C:\Users\Matt\AppData\Roaming\Python\Python311\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Matt\AppData\Local\Temp\ipykernel_13436\2413358181.py", line 11, in objective
    logreg = LogisticRegression(
             ^^^^^^^^^^^^^^^^^^
NameError: name 'LogisticRegression' is not defined
[W 2023-11-20 19:57:56,410] Trial 0 failed with value Non

NameError: name 'LogisticRegression' is not defined

## Create CSV containing predictions

In [6]:
tfMap = { 0: False, 1: True }

kaggleCSV = pd.DataFrame(columns=['PassengerId', 'Transported'])
kaggleCSV['PassengerId'] = df_test['PassengerId']
kaggleCSV['Transported'] = [tfMap[i] for i in pred]

kaggleCSV.to_csv('results/example_results.csv', index=False)