In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py


In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

# Memory reduction function
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Mem. usage decreased to {end_mem:.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

# Combined preprocessing and UID detection
def preprocess_and_detect_uid(train_df, test_df, train_identity=None, test_identity=None):
    print("Starting preprocessing...")

    # Memory reduction
    train_df = reduce_mem_usage(train_df)
    test_df = reduce_mem_usage(test_df)

    # Merge with identity datasets if provided
    if train_identity is not None and test_identity is not None:
        train_df = pd.merge(train_df, train_identity, on="TransactionID", how="left")
        test_df = pd.merge(test_df, test_identity, on="TransactionID", how="left")

    # Add full address feature
    train_df['full_addr'] = train_df['addr1'].astype(str) + '_' + train_df['addr2'].astype(str)
    test_df['full_addr'] = test_df['addr1'].astype(str) + '_' + test_df['addr2'].astype(str)

    # Add time-based UID features
    for col in ['D1', 'D2', 'D3', 'D5', 'D10', 'D11', 'D15']:
        if col in train_df.columns:
            new_col = 'uid_td_' + col
            train_df[new_col] = train_df['TransactionDT'] / (24 * 60 * 60)
            train_df[new_col] = np.floor(train_df[new_col] - train_df[col]) + 1000

            test_df[new_col] = test_df['TransactionDT'] / (24 * 60 * 60)
            test_df[new_col] = np.floor(test_df[new_col] - test_df[col]) + 1000

    # Add normalized day feature
    train_df['DT_day'] = np.floor(train_df['TransactionDT'] / (24 * 60 * 60)) + 1000
    test_df['DT_day'] = np.floor(test_df['TransactionDT'] / (24 * 60 * 60)) + 1000

    # Round transaction amounts for feature creation
    train_df['TransactionAmt_fix'] = np.round(train_df['TransactionAmt'], 2)
    test_df['TransactionAmt_fix'] = np.round(test_df['TransactionAmt'], 2)

    # UID detection
    train_df['uid'] = train_df['card1'].astype(str) + '_' + train_df['card2'].astype(str) + '_' + train_df['addr1'].astype(str)
    test_df['uid'] = test_df['card1'].astype(str) + '_' + test_df['card2'].astype(str) + '_' + test_df['addr1'].astype(str)

    # Frequency encoding for UID
    uid_freq = pd.concat([train_df['uid'], test_df['uid']]).value_counts()
    train_df['uid_count'] = train_df['uid'].map(uid_freq)
    test_df['uid_count'] = test_df['uid'].map(uid_freq)

    print("Preprocessing and UID detection complete.")
    return train_df, test_df





In [2]:
train_df = pd.read_csv('drive/MyDrive/DSC_final_project/train_transaction.csv')
train_identity = pd.read_csv('drive/MyDrive/DSC_final_project/train_identity.csv')
test_df = pd.read_csv('drive/MyDrive/DSC_final_project/test_transaction.csv')
test_identity = pd.read_csv('drive/MyDrive/DSC_final_project/test_identity.csv')

In [3]:
train_processed, test_processed = preprocess_and_detect_uid(train_df, test_df, train_identity, test_identity)

Starting preprocessing...
Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 472.59 Mb (68.9% reduction)
Preprocessing and UID detection complete.


In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
import scipy.sparse as sp

def preprocess_for_baseline_with_onehot_sparse(input_train_df, input_test_df, target_column='IsFraud'):
    # Work on local copies to avoid modifying global variables
    train_df = input_train_df.copy()
    test_df = input_test_df.copy()

    # Drop target column from training data
    if target_column in train_df.columns:
        train_df = train_df.drop(columns=[target_column])

    # Identify numeric and categorical columns that exist in the datasets
    numeric_columns = train_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = train_df.select_dtypes(include=['object']).columns

    # Ensure the columns exist in both train and test
    numeric_columns = [col for col in numeric_columns if col in test_df.columns]
    categorical_columns = [col for col in categorical_columns if col in test_df.columns]

    # Handle missing values
    train_df[numeric_columns] = train_df[numeric_columns].fillna(train_df[numeric_columns].median())
    test_df[numeric_columns] = test_df[numeric_columns].fillna(test_df[numeric_columns].median())
    train_df[categorical_columns] = train_df[categorical_columns].fillna('missing')
    test_df[categorical_columns] = test_df[categorical_columns].fillna('missing')

    # OneHotEncoder for categorical columns
    if categorical_columns:
        ohe = OneHotEncoder(handle_unknown='ignore')  # Use sparse matrices
        train_categorical_sparse = ohe.fit_transform(train_df[categorical_columns])
        test_categorical_sparse = ohe.transform(test_df[categorical_columns])

        # Save memory by retaining sparse matrices
        train_df = train_df.drop(columns=categorical_columns)
        test_df = test_df.drop(columns=categorical_columns)

    # Standardize numeric columns
    if numeric_columns:
        scaler = StandardScaler()
        train_df[numeric_columns] = scaler.fit_transform(train_df[numeric_columns])
        test_df[numeric_columns] = scaler.transform(test_df[numeric_columns])

    # Convert numeric columns to sparse format and concatenate
    if numeric_columns:
        train_numeric_sparse = sp.csr_matrix(train_df[numeric_columns].values.astype(np.float32))
        test_numeric_sparse = sp.csr_matrix(test_df[numeric_columns].values.astype(np.float32))

        train_sparse = sp.hstack([train_numeric_sparse, train_categorical_sparse], format='csr')
        test_sparse = sp.hstack([test_numeric_sparse, test_categorical_sparse], format='csr')
    else:
        train_sparse = train_categorical_sparse
        test_sparse = test_categorical_sparse

    return train_sparse, test_sparse


In [8]:
!pip install optuna



In [8]:
prepocessed_train, prepocessed_test = preprocess_for_baseline_with_onehot_sparse(train_processed, test_processed)

In [None]:
import optuna
import cupy as cp
import cupyx.scipy.sparse as cps
from cuml.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Target variable
y_train = train_processed['isFraud']

# Objective function for Optuna
def objective(trial):
    # Define hyperparameters to tune
    param = {
        "C": trial.suggest_loguniform("C", 1e-4, 1e2),  # Regularization strength
        "solver": "qn",  # Use quasi-Newton solver (cuML only supports this)
        "penalty": "l2",  # L2 regularization only
        "class_weight": "balanced",  # Handle imbalanced data
        "max_iter": 10000,
        "tol": 1e-4
    }

    # Logistic Regression using cuML (GPU)
    model = LogisticRegression(**param)

    # Stratified K-Fold Cross-Validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_losses = []

    for train_idx, val_idx in cv.split(prepocessed_train, y_train):
        # Convert to GPU sparse matrices
        X_train = cps.csr_matrix(prepocessed_train[train_idx])
        X_val = cps.csr_matrix(prepocessed_train[val_idx])
        y_train_fold = cp.array(y_train.iloc[train_idx])
        y_val_fold = cp.array(y_train.iloc[val_idx])

        # Train the model
        model.fit(X_train, y_train_fold)

        # Predict probabilities
        y_val_pred_proba = model.predict_proba(X_val)[:, 1]

        # Calculate log loss
        log_losses.append(log_loss(y_val_fold.get(), y_val_pred_proba.get()))
        print(np.mean(log_losses))
    # Return mean log loss
    return np.mean(log_losses)

# Optuna study
study = optuna.create_study(direction="minimize", study_name="GPU Logistic Regression Tuning")
study.optimize(objective, n_trials=20)

# Best trial parameters
print("Best trial:")
print(study.best_trial.params)

# Cross-validation with Best Parameters
best_params = study.best_trial.params
model = LogisticRegression(**best_params)

[I 2024-12-06 22:48:29,737] A new study created in memory with name: GPU Logistic Regression Tuning


0.34131738696261177
0.34239190083931176
0.3419132316833835
0.3426304954744556


[I 2024-12-06 22:49:01,933] Trial 0 finished with value: 0.34279517543590854 and parameters: {'C': 1.01974693075021}. Best is trial 0 with value: 0.34279517543590854.


0.34279517543590854
0.5423706705689512
0.5436765141459714
0.5418103523528651
0.5421215506057802


[I 2024-12-06 22:49:04,745] Trial 1 finished with value: 0.542082529078044 and parameters: {'C': 0.0003798170803813578}. Best is trial 0 with value: 0.34279517543590854.


0.542082529078044
0.524332550162172
0.5255865391604699
0.5238089754980525
0.5241377068823354


[I 2024-12-06 22:49:08,835] Trial 2 finished with value: 0.5240172889562312 and parameters: {'C': 0.0015206657059152293}. Best is trial 0 with value: 0.34279517543590854.


0.5240172889562312
0.5530617154143186
0.5543585477337452
0.5525429869472152
0.5528422895021006


[I 2024-12-06 22:49:10,834] Trial 3 finished with value: 0.5528319916233693 and parameters: {'C': 0.00012380435805592387}. Best is trial 0 with value: 0.34279517543590854.


0.5528319916233693
0.3323631435453769
0.33329342072318413
0.33281221895952334
0.3334859214261799


[I 2024-12-06 22:49:51,893] Trial 4 finished with value: 0.3337574715007993 and parameters: {'C': 3.2630018283171065}. Best is trial 4 with value: 0.3337574715007993.


0.3337574715007993
0.459134745275502
0.46049079493125533
0.45902838284708086
0.4594165649148779


[I 2024-12-06 22:50:00,067] Trial 5 finished with value: 0.4591334236035164 and parameters: {'C': 0.018725874984155462}. Best is trial 4 with value: 0.3337574715007993.


0.4591334236035164
0.5435457388690567
0.5448655813372146
0.5430198807671651
0.5433453850074158


[I 2024-12-06 22:50:02,705] Trial 6 finished with value: 0.5433286683091199 and parameters: {'C': 0.0003339021920351738}. Best is trial 4 with value: 0.3337574715007993.


0.5433286683091199
0.4029764542959232
0.40401890991743944
0.40304004648185604
0.4036594683626384


[I 2024-12-06 22:50:16,589] Trial 7 finished with value: 0.40346242556842765 and parameters: {'C': 0.08780659878691065}. Best is trial 4 with value: 0.3337574715007993.


0.40346242556842765
0.44020551772909233
0.44134749653267913
0.43987112851762494
0.44048108915888656


[I 2024-12-06 22:50:26,828] Trial 8 finished with value: 0.44022465727124027 and parameters: {'C': 0.03179939398625025}. Best is trial 4 with value: 0.3337574715007993.


0.44022465727124027
0.5421813597768301
0.5435136789509618
0.5416640691871586
0.5419964967591778


[I 2024-12-06 22:50:29,698] Trial 9 finished with value: 0.5419718725590539 and parameters: {'C': 0.0003839211814373898}. Best is trial 4 with value: 0.3337574715007993.


0.5419718725590539
0.33213982623007443
0.33245041836116185
0.3321308975505203
0.3328353203993377


[I 2024-12-06 22:51:29,612] Trial 10 finished with value: 0.33314724297784404 and parameters: {'C': 40.864817467513994}. Best is trial 10 with value: 0.33314724297784404.


0.33314724297784404
0.3320909269962667
0.33244488823492835
0.3324241051195297
0.3331939808200488


[I 2024-12-06 22:52:32,239] Trial 11 finished with value: 0.3335528535729252 and parameters: {'C': 56.958407270148115}. Best is trial 10 with value: 0.33314724297784404.


0.3335528535729252
0.3322659734684514
0.3330359640542784
0.33268020786901703
0.3332435467895212


[I 2024-12-06 22:53:40,853] Trial 12 finished with value: 0.33378008942094745 and parameters: {'C': 78.56796146426251}. Best is trial 10 with value: 0.33314724297784404.


0.33378008942094745
0.33155518042842486
0.33205988317720786
0.3317900519920378
0.33247565169674076


[I 2024-12-06 22:54:35,997] Trial 13 finished with value: 0.3329529804087489 and parameters: {'C': 42.85890095129514}. Best is trial 13 with value: 0.3329529804087489.


0.3329529804087489
0.33095166285936584
0.33180816948140246
0.33149857885943124
0.3322780237950773


[I 2024-12-06 22:55:22,972] Trial 14 finished with value: 0.33258721210939185 and parameters: {'C': 4.485989414870252}. Best is trial 14 with value: 0.33258721210939185.


0.33258721210939185
0.33100999481912097
0.33208499033060634
0.33169108653888135
0.3324039419569283


[I 2024-12-06 22:56:06,486] Trial 15 finished with value: 0.3327363756026286 and parameters: {'C': 4.373708550421695}. Best is trial 14 with value: 0.33258721210939185.


0.3327363756026286
0.33304114828307774
0.33402101642539
0.33381573982657425
0.33466190273304497


[I 2024-12-06 22:56:45,657] Trial 16 finished with value: 0.3349150317571402 and parameters: {'C': 2.521610413748298}. Best is trial 14 with value: 0.33258721210939185.


0.3349150317571402
0.33027674084955
0.3309702902125936
0.33071998484902454
0.3314977431849833


[I 2024-12-06 22:57:32,568] Trial 17 finished with value: 0.3318781557968956 and parameters: {'C': 7.523470284010317}. Best is trial 17 with value: 0.3318781557968956.


0.3318781557968956
0.3678279997668923
0.36903353472062717
0.3685479855929628
0.3692728424075452


[I 2024-12-06 22:57:53,701] Trial 18 finished with value: 0.36926760198165354 and parameters: {'C': 0.26361324769032213}. Best is trial 17 with value: 0.3318781557968956.


0.36926760198165354
0.33057453024747585
0.3310828940586367
0.3307358527172471
0.33150242241198435


[I 2024-12-06 22:58:46,346] Trial 19 finished with value: 0.33191341496367394 and parameters: {'C': 12.121541808000616}. Best is trial 17 with value: 0.3318781557968956.


0.33191341496367394
Best trial:
{'C': 7.523470284010317}


In [10]:
# Cross-validation with Best Parameters
best_params = study.best_trial.params
model = LogisticRegression(**best_params)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
log_losses = []

for train_idx, val_idx in cv.split(prepocessed_train, y_train):
    X_train = cps.csr_matrix(prepocessed_train[train_idx])
    X_val = cps.csr_matrix(prepocessed_train[val_idx])
    y_train_fold = cp.array(y_train.iloc[train_idx])
    y_val_fold = cp.array(y_train.iloc[val_idx])

    # Train with best parameters
    model.fit(X_train, y_train_fold)

    # Predict probabilities and classes
    y_val_pred_proba = model.predict_proba(X_val)[:, 1]
    y_val_pred = (y_val_pred_proba > 0.5).astype(int)  # Threshold for binary classification

    # Metrics
    log_losses.append(log_loss(y_val_fold.get(), y_val_pred_proba.get()))
    auc_scores.append(roc_auc_score(y_val_fold.get(), y_val_pred_proba.get()))
    f1_scores.append(f1_score(y_val_fold.get(), y_val_pred.get()))
    precision_scores.append(precision_score(y_val_fold.get(), y_val_pred.get()))
    recall_scores.append(recall_score(y_val_fold.get(), y_val_pred.get()))

# Print Final Metrics
print("Final Cross-Validation Metrics with Best Parameters:")
print(f"Mean Log Loss: {np.mean(log_losses):.4f}")
print(f"Mean AUC Score: {np.mean(auc_scores):.4f}")
print(f"Mean F1 Score: {np.mean(f1_scores):.4f}")
print(f"Mean Precision: {np.mean(precision_scores):.4f}")
print(f"Mean Recall: {np.mean(recall_scores):.4f}")


[W] [23:00:34.943146] L-BFGS: max iterations reached
[W] [23:00:34.944683] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:00:41.482084] L-BFGS: max iterations reached
[W] [23:00:41.483473] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:00:47.090894] L-BFGS: max iterations reached
[W] [23:00:47.092088] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [23:00:53.070335] L-BFGS: max iterations reached
[W] [23:00:53.071594] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the inpu

In [29]:
from scipy.sparse import save_npz, load_npz

# Save sparse matrices
save_npz('drive/MyDrive/DSC_final_project/train_sparse_BASELINE_LOGISTIC.npz', prepocessed_train)
save_npz('drive/MyDrive/DSC_final_project/test_sparse_BASELINE_LOGISTIC.npz', prepocessed_test)

# Load sparse matrices
train_sparse = load_npz('drive/MyDrive/DSC_final_project/train_sparse_BASELINE_LOGISTIC.npz')
test_sparse = load_npz('drive/MyDrive/DSC_final_project/test_sparse_BASELINE_LOGISTIC.npz')


In [18]:
y_pred = model.predict_proba(cps.csr_matrix(prepocessed_test))

In [None]:
submission = pd.DataFrame({
    "TransactionID": test_df["TransactionID"],  # Replace with your test set's TransactionID column
    "isFraud": predicted_proba  # Convert CuPy array to NumPy
})

# Save to CSV
submission.to_csv("drive/MyDrive/DSC_final_project/log_reg_initial_submission.csv", index=False)