In [25]:
import pandas as pd
import wandb
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split

In [2]:
train_transaction = pd.read_csv("data/fraud-detection-wandb/train_transaction.csv", low_memory=False)
train_identity = pd.read_csv("data/fraud-detection-wandb/train_identity.csv", low_memory=False)

In [4]:
train_transaction.shape

(590540, 394)

In [5]:
train_identity.shape

(144233, 41)

In [17]:
columns_to_train = [
    "TransactionID",
    "TransactionDT",
    "TransactionAmt",
    "ProductCD",
    "isFraud",
]


In [18]:
# merging
train_set = pd.merge(
    train_transaction,
    train_identity,
    on="TransactionID",
    how="outer",
)
train_set.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [19]:
train_set_to_use = train_set[columns_to_train]
train_set_to_use.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,isFraud
0,2987000,86400,68.5,W,0
1,2987001,86401,29.0,W,0
2,2987002,86469,59.0,W,0
3,2987003,86499,50.0,W,0
4,2987004,86506,50.0,H,0


In [23]:
train_set_to_use.ProductCD.replace({"W": 1, "C": 2, "R": 3, "H": 4, "S": 5}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set_to_use.ProductCD.replace({"W": 1, "C": 2, "R": 3, "H": 4, "S": 5}, inplace=True)


In [26]:
# split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    train_set_to_use.drop("isFraud", axis=1),
    train_set_to_use["isFraud"],
    test_size=0.2,
    random_state=42,
)


In [13]:
def random_forest_classifier_training(X_train, y_train, X_test, y_test):
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    pred_prob = clf.predict_proba(X_test)
    print(metrics.classification_report(y_test, preds))

    # log with Wandb
    wandb.log({"accuracy_score": metrics.accuracy_score(y_test, preds)})
    wandb.sklearn.plot_learning_curve(clf, X_train, y_train)
    wandb.termlog("Logged learning curve.")
    wandb.sklearn.plot_confusion_matrix(y_test, preds, clf.classes_)
    wandb.termlog("Logged confusion matrix.")
    wandb.sklearn.plot_summary_metrics(
        clf, X=X_train, y=y_train, X_test=X_test, y_test=y_test
    )
    wandb.termlog("Logged summary metrics.")
    wandb.sklearn.plot_class_proportions(y_train, y_test, clf.classes_)
    wandb.termlog("Logged class proportions.")
    if not isinstance(clf, naive_bayes.MultinomialNB):
        wandb.sklearn.plot_calibration_curve(
            clf, X_train, y_train, "randomForestClassifier"
        )
    wandb.termlog("Logged calibration curve.")
    wandb.sklearn.plot_roc(y_test, pred_prob, clf.classes_)
    wandb.termlog("Logged roc curve.")
    wandb.sklearn.plot_precision_recall(y_test, pred_prob, clf.classes_)
    wandb.termlog("Logged precision recall curve.")


In [27]:
sweep_config = {
    'method': 'random', #grid, random
    'metric': {
      'name': 'accuracy_score',
      'goal': 'maximize'   
    },
    'parameters': {

        'model':{
            'values':['randomForest']
        }
    }
}
config_defaults = {

        'model' : 'randomForest'
    }
sweep_id = wandb.sweep(sweep_config)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: x2ue0h9j
Sweep URL: https://wandb.ai/destruct/uncategorized/sweeps/x2ue0h9j


In [32]:
def call_trainer():
    if wandb.config.model == 'xgboost':
        print("NA")
    if wandb.config.model == 'logistic':
        print("NA")
    if wandb.config.model == 'randomForest':
        random_forest_classifier_training(X_train, y_train, X_test, y_test)

In [33]:
def train():
    wandb.init()
    if wandb.config.model == "logistic":
        print("NA")
    if wandb.config.model == "randomForest":
        random_forest_classifier_training(X_train, y_train, X_test, y_test)
    if wandb.config.model == "xgboost":
        print("NA")


In [None]:
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: qvhfrue0 with config:
[34m[1mwandb[0m: 	model: randomForest
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mluba[0m ([33mdestruct[0m). Use [1m`wandb login --relogin`[0m to force relogin


              precision    recall  f1-score   support

           0       0.98      0.99      0.99    113866
           1       0.66      0.43      0.52      4242

    accuracy                           0.97    118108
   macro avg       0.82      0.71      0.75    118108
weighted avg       0.97      0.97      0.97    118108



[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision recall curve.


0,1
accuracy_score,▁

0,1
accuracy_score,0.97152


[34m[1mwandb[0m: Agent Starting Run: 3ifed1m6 with config:
[34m[1mwandb[0m: 	model: randomForest
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


              precision    recall  f1-score   support

           0       0.98      0.99      0.99    113866
           1       0.66      0.42      0.51      4242

    accuracy                           0.97    118108
   macro avg       0.82      0.71      0.75    118108
weighted avg       0.97      0.97      0.97    118108



[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision recall curve.


0,1
accuracy_score,▁

0,1
accuracy_score,0.97154


[34m[1mwandb[0m: Agent Starting Run: 0q7abvym with config:
[34m[1mwandb[0m: 	model: randomForest
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


              precision    recall  f1-score   support

           0       0.98      0.99      0.99    113866
           1       0.66      0.42      0.52      4242

    accuracy                           0.97    118108
   macro avg       0.82      0.71      0.75    118108
weighted avg       0.97      0.97      0.97    118108



[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision recall curve.


0,1
accuracy_score,▁

0,1
accuracy_score,0.97156


[34m[1mwandb[0m: Agent Starting Run: d1hahiw8 with config:
[34m[1mwandb[0m: 	model: randomForest
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


              precision    recall  f1-score   support

           0       0.98      0.99      0.99    113866
           1       0.67      0.42      0.52      4242

    accuracy                           0.97    118108
   macro avg       0.83      0.71      0.75    118108
weighted avg       0.97      0.97      0.97    118108



[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision recall curve.


0,1
accuracy_score,▁

0,1
accuracy_score,0.97194
