In [None]:
import os
# Set the locale-related environment variables in one call; subprocesses started
# later (e.g. `!` shell commands) inherit these values.
os.environ.update(
    {
        "LC_ALL": "C.UTF-8",
        "LANG": "C.UTF-8",
        "PYTHONIOENCODING": "utf-8",
    }
)

In [None]:
import cupy as cp

# Sanity-check the CuPy / CUDA runtime before any GPU work is attempted.
print("CuPy Version:", cp.__version__)
cuda_available = cp.cuda.is_available()
print("CUDA Available:", cuda_available)
if cuda_available:
    # Device(0) can only be queried when a GPU is actually visible; without this
    # guard the cell raises a CUDA runtime error right after reporting "False".
    print("GPU Compute Capability:", cp.cuda.Device(0).compute_capability)
else:
    print("GPU Compute Capability:", "n/a (no CUDA device visible)")

In [None]:
# System-level locale setup (shell commands): refresh the package index and
# install the `locales` package, generate the en_US.UTF-8 locale, then record
# it as the default LANG via update-locale.
# NOTE(review): the first cell exports C.UTF-8 while this one generates
# en_US.UTF-8 — confirm which locale is actually intended.
!apt-get update && apt-get install -y locales
!locale-gen en_US.UTF-8
!update-locale LANG=en_US.UTF-8

In [None]:
!pip install -q "cuda-python<13.0a0,>=12.0" "numba>=0.59.1,<0.60"

In [None]:
!pip install -q "cupy-cuda12x" "cudf-cu12" "cuml-cu12" "rmm-cu12" "pylibraft-cu12" "pylibcudf-cu12" \
             "dask" "dask-cuda" "dask-ml" -f https://pypi.nvidia.com


In [None]:
!pip install -q pandas numpy scipy scikit-learn

In [None]:
import cupy as cp
import cudf
import cuml

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from cuml.svm import SVC
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from cuml.model_selection import StratifiedKFold
from cuml.preprocessing import LabelEncoder

In [None]:
import locale

# Force UTF-8 as the reported preferred encoding.
# The replacement keeps the standard-library signature (`do_setlocale=True`):
# library code commonly calls `locale.getpreferredencoding(False)`, which a
# zero-argument lambda would reject with a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"
print("Preferred Encoding:", locale.getpreferredencoding())

In [None]:
# Path of the input CSV, relative to the notebook's working directory.
# NOTE(review): the extension is doubled (".csv.csv") — confirm this matches the file on disk.
file_path = "loan_data.csv.csv"

In [None]:
# Read the CSV at `file_path` with cuDF; every later cell derives its data from this frame.
loan_dataset_gpu = cudf.read_csv(file_path)

In [None]:
# Ordinal encoding of the grade letters: 'A' -> 0, 'B' -> 1, ... 'G' -> 6.
grade_mapping = {letter: rank for rank, letter in enumerate("ABCDEFG")}

# Replace the letter grades with their integer codes (the column is overwritten).
loan_dataset_gpu['grade'] = loan_dataset_gpu['grade'].map(grade_mapping)

In [None]:
# Display the number of rows per encoded grade value (class balance check).
loan_dataset_gpu['grade'].value_counts()

In [None]:
# Remove the `sub_grade` and `id` columns from the frame.
loan_dataset_gpu = loan_dataset_gpu.drop(['sub_grade', 'id'], axis=1)

In [None]:
# Work on an independent (deep) copy so the loaded frame itself is left untouched below.
loan_dataset_gpu_processed = loan_dataset_gpu.copy(deep=True)

In [None]:
# Preview the first rows of the working copy.
loan_dataset_gpu_processed.head()

In [None]:
# Features are every column except the target; the target is the encoded `grade`.
X = loan_dataset_gpu_processed.drop('grade', axis=1)
y = loan_dataset_gpu_processed['grade']

In [None]:
# Column labels grouped by dtype: object-typed columns are treated as
# categorical, 64-bit integer/float columns as numerical.
categorical_columns = X.select_dtypes(include='object').columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns


In [None]:
import cupy as cp
import cudf
from cuml.preprocessing import StandardScaler
from cuml.decomposition import PCA
from imblearn.over_sampling import SMOTE  # CPU-based

# Preprocessing pipeline: one-hot encode -> train/test split -> SMOTE (CPU) -> scale.
# NOTE: this cell rebinds `X` and `y` from cuDF objects to CuPy arrays; re-run the
# cell that defines `X`/`y` before executing this cell a second time.

# One-hot encode the categorical columns.
X = cudf.get_dummies(X, columns=categorical_columns)

bool_columns = X.select_dtypes(include=['bool']).columns
X[bool_columns] = X[bool_columns].astype(int)

# Promote every feature to float64 so that
#  * the dense matrix below is floating point even if all inputs are integers
#    (assigning scaled values into an integer matrix would silently truncate them), and
#  * `numerical_columns` covers every column regardless of the dummy/input dtypes,
#    which the later `cudf.DataFrame(..., columns=numerical_columns)` cell relies on.
X = X.astype('float64')

numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
numerical_indices = [X.columns.get_loc(col) for col in numerical_columns]

# Stay on the GPU: `to_cupy()` avoids the device -> host -> device round trip
# of `cp.array(X.to_numpy())`.
X = X.to_cupy()
y = y.to_cupy().ravel()

# Seeded, shuffled 80/20 split. Taking the first 80% of rows as-is would bake any
# ordering present in the CSV into the train/test split. Fancy indexing also yields
# copies, so scaling X_test below no longer writes through into `X`.
num_samples = X.shape[0]
split_idx = int(num_samples * 0.8)
shuffled_idx = cp.random.RandomState(42).permutation(num_samples)
train_idx, test_idx = shuffled_idx[:split_idx], shuffled_idx[split_idx:]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# SMOTE (imbalanced-learn) runs on the CPU: copy the training split to host memory,
# oversample the training data only, then move the result back to the GPU.
X_train_cpu = cp.asnumpy(X_train)
y_train_cpu = cp.asnumpy(y_train)

smote = SMOTE(random_state=42)
X_train_resampled_cpu, y_train_resampled_cpu = smote.fit_resample(X_train_cpu, y_train_cpu)

X_train_resampled = cp.asarray(X_train_resampled_cpu)
y_train_resampled = cp.asarray(y_train_resampled_cpu)

# Fit the scaler on the (resampled) training data only and reuse its statistics
# for the test split, so no test information leaks into the scaling.
scaler = StandardScaler()
X_train_resampled[:, numerical_indices] = scaler.fit_transform(X_train_resampled[:, numerical_indices])
X_test[:, numerical_indices] = scaler.transform(X_test[:, numerical_indices])

In [None]:
# Re-attach the column labels: wrap both feature matrices in cuDF DataFrames
# whose columns are named by `numerical_columns`.
X_train_resampled, X_test = (
    cudf.DataFrame(matrix, columns=numerical_columns)
    for matrix in (X_train_resampled, X_test)
)


In [None]:
from cuml.decomposition import PCA
# Fit a PCA with default settings on the training features, then project the
# test features with the same fitted model (the test split is never used for fitting).
# NOTE(review): `X_train_pca` / `X_test_pca` are not referenced by the later cells
# visible here — only the fitted `pca` object is; confirm whether they are needed.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_resampled)
X_test_pca = pca.transform(X_test)

In [None]:


# Rank the original features by how strongly they load on the principal components.
feature_columns = numerical_columns
pca_components_np = pca.components_.to_numpy()
explained_variance_np = pca.explained_variance_ratio_.to_numpy()

# Loadings table: one row per original feature, one column per principal component.
feature_importance_df = pd.DataFrame(
    data=pca_components_np.T,
    index=feature_columns,
    columns=[f'PC{i+1}' for i in range(pca_components_np.shape[0])]
)

# Weight each component's absolute loadings by that component's explained-variance
# ratio before summing. An unweighted sum counts the weakest component exactly as
# much as PC1 and left `explained_variance_np` unused.
feature_importance = (
    feature_importance_df.abs()
    .mul(np.ravel(explained_variance_np), axis=1)
    .sum(axis=1)
)

In [None]:
# Labels of the (up to) 40 highest-scoring features, highest first.
top_40_features = feature_importance.nlargest(40).index.tolist()
print(top_40_features)

In [None]:
# Restrict both feature frames to the selected columns (same columns, same order).
X_train_resampled, X_test = X_train_resampled[top_40_features], X_test[top_40_features]

In [None]:
# Model inputs: cuDF frames first, then float32 CuPy matrices for the classifier.
X_train_cudf, X_test_cudf = cudf.DataFrame(X_train_resampled), cudf.DataFrame(X_test)

X_train_bal_cp, X_test_bal_cp = (
    frame.to_cupy().astype(cp.float32) for frame in (X_train_cudf, X_test_cudf)
)

In [None]:
# Targets: wrap as cuDF Series, then expose them as CuPy vectors for the classifier.
y_train_cudf, y_test_cudf = cudf.Series(y_train_resampled), cudf.Series(y_test)

y_train_bal_cp, y_test_bal_cp = y_train_cudf.to_cupy(), y_test_cudf.to_cupy()

In [None]:
# 10-fold stratified cross-validation setup.
num_folds_bal = 10
# shuffle=True without a seed produces different folds (and scores) on every run;
# a fixed random_state keeps the notebook reproducible, matching the seed used for SMOTE.
kf_bal = StratifiedKFold(n_splits=num_folds_bal, shuffle=True, random_state=42)
scores_bal = []      # per-fold validation accuracy
log_loss_bal = []    # per-fold validation log loss

In [None]:
from cuml.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import log_loss
from scipy.special import softmax


# One-vs-rest RBF SVM evaluated with the stratified folds defined in `kf_bal`.
pca_bal_svm_model = SVC(kernel="rbf",  C=1.0, class_weight="balanced", gamma='scale')
pca_bal_svm_gpu = OneVsRestClassifier(pca_bal_svm_model)

# Start from empty result lists so re-running this cell does not append to a previous run.
scores_bal.clear()
log_loss_bal.clear()

# Sorted label set of the whole training target. Using it for every fold keeps the
# log-loss label order fixed even if a validation fold happens to miss a class.
# NOTE(review): assumes decision_function columns follow this sorted class order — confirm.
cv_classes = np.unique(cp.asnumpy(y_train_bal_cp)).astype(int)
n_classes = len(cv_classes)

for train_index_bal, val_index_bal in kf_bal.split(X_train_bal_cp, y_train_bal_cp):
    X_train_fold_bal, X_val_fold_bal = X_train_bal_cp[train_index_bal], X_train_bal_cp[val_index_bal]
    y_train_fold_bal, y_val_fold_bal = y_train_bal_cp[train_index_bal], y_train_bal_cp[val_index_bal]

    pca_bal_svm_gpu.fit(X_train_fold_bal, y_train_fold_bal)
    y_val_fold_bal = y_val_fold_bal.astype(int)

    # Decision scores are turned into pseudo-probabilities with a softmax so that a
    # log loss can be computed; they are not calibrated probabilities.
    y_val_fold_scores = pca_bal_svm_gpu.decision_function(X_val_fold_bal)
    y_val_fold_scores_np = cp.asnumpy(y_val_fold_scores)
    y_val_fold_prob = softmax(y_val_fold_scores_np, axis=1)

    # Host copy of the labels; log_loss takes integer labels plus the explicit label set.
    y_val_fold_bal_np = cp.asnumpy(y_val_fold_bal)
    log_loss_value = log_loss(y_val_fold_bal_np, y_val_fold_prob, labels=cv_classes)
    log_loss_bal.append(log_loss_value)

    # float() normalises a device/NumPy scalar so np.mean below works on plain numbers.
    accuracy_bal = float(pca_bal_svm_gpu.score(X_val_fold_bal, y_val_fold_bal))
    scores_bal.append(accuracy_bal)

    print(f"Fold Accuracy: {accuracy_bal:.4f}" , f"Fold Log Loss:{log_loss_value:.4f}")


print(f"\nMean CV Accuracy: {np.mean(scores_bal):.4f}")
print(f"Mean CV Log Loss: {np.mean(log_loss_bal):.4f}")

# After the loop the estimator only holds the fit from the last fold (a subset of the
# training data). Refit on the full training set so the later test-set cells evaluate
# a model trained on all available training data.
pca_bal_svm_gpu.fit(X_train_bal_cp, y_train_bal_cp)

In [None]:
# Predict the held-out test set and keep the predictions as a cuDF Series.
y_pred_bal = cudf.Series(pca_bal_svm_gpu.predict(X_test_bal_cp))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# scikit-learn metrics run on the host: bring true labels and predictions over as NumPy arrays.
y_test_np_bal = y_test_bal_cp.get()
y_pred_np_bal = y_pred_bal.to_pandas().to_numpy()
cm_bal = confusion_matrix(y_true=y_test_np_bal, y_pred=y_pred_np_bal)

In [None]:
# Show the confusion matrix computed from y_test_np_bal (true) and y_pred_np_bal (predicted).
print(cm_bal)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test set, printed with 4 decimal places.
report_bal = classification_report(
    y_true=y_test_np_bal,
    y_pred=y_pred_np_bal,
    digits=4,
)
print(report_bal)

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize

# One-vs-rest precision-recall curves on the test set, one curve per class.
y_test_np_bal = y_test_np_bal.astype(int)

# Despite the variable name these are raw decision scores, not probabilities;
# precision_recall_curve only needs a ranking score, so that is sufficient.
y_pred_prob_bal = pca_bal_svm_gpu.decision_function(X_test_bal_cp)

# Copy the scores to the host once instead of once per class inside the loop.
y_pred_scores_np = cp.asnumpy(y_pred_prob_bal)

# One score column per class: take the class count from the scores so the binarized
# labels always line up with the columns, even if the test split lacks a class.
n_classes = y_pred_scores_np.shape[1]

y_test_bin = label_binarize(y_test_np_bal, classes=np.arange(n_classes))

fig, ax = plt.subplots(figsize=(8, 6))
for i in range(n_classes):
    precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_pred_scores_np[:, i])
    ax.plot(recall, precision, label=f'Class {i}')

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()