#### 1. Load reqruired libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

import wandb
import params

from feature_engine.encoding import OrdinalEncoder
from sklearn.model_selection import StratifiedShuffleSplit

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.pandas.set_option('display.max_columns', None)

#### 2. Load dataset

In [2]:
def load_data(data_at):
    df = pd.read_csv(data_at)
    return df

#### 3. Encoding the target variable

In [3]:
def log_data(X_train, X_valid, X_test, y_train, y_valid, y_test):
    y_train = y_train.reshape(-1, 1)
    y_valid = y_valid.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    train_data = pd.DataFrame(np.concatenate((X_train, y_train), axis=1))
    valid_data = pd.DataFrame(np.concatenate((X_valid, y_valid), axis=1))
    test_data = pd.DataFrame(np.concatenate((X_test, y_test), axis=1))

    train_data_at = wandb.Artifact(params.TRAIN_DATA_AT, type='train_data')
    train_data_at.add(wandb.Table(dataframe=train_data_at), 'train_data')

    valid_data_at = wandb.Artifact(params.VALID_DATA_AT, type='valid_data')
    valid_data_at.add(wandb.Table(dataframe=valid_data_at), 'valid_data')

    test_data_at = wandb.Artifact(params.TEST_DATA_AT, type='test_data')
    test_data_at.add(wandb.Table(dataframe=test_data_at), 'test_data')

    wandb.log_artifact(train_data_at)
    wandb.log_artifact(valid_data_at)
    wandb.log_artifact(test_data_at)

def preprocess_data(df):
    target_encoder = OrdinalEncoder(encoding_method='arbitrary', variables='Accident_severity')
    df = target_encoder.fit_transform(df)

    X = df.drop('Accident_severity', axis=1).values
    y = df['Accident_severity'].values

    # Initialize the StratifiedShuffleSplit object
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    # Split for train and temp (which will be further divided into validation and test)
    for train_index, temp_index in sss.split(X, y):
        X_train, X_temp = X[train_index], X[temp_index]
        y_train, y_temp = y[train_index], y[temp_index]

    # Now split the temp data into validation and test sets
    sss_valid_test = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

    for valid_index, test_index in sss_valid_test.split(X_temp, y_temp):
        X_valid, X_test = X_temp[valid_index], X_temp[test_index]
        y_valid, y_test = y_temp[valid_index], y_temp[test_index]


    return X_train, X_valid, X_test, y_train, y_valid, y_test

#### 5. Model Training

In [4]:
def log_predictions(y_true, y_pred, name):
    df = pd.DataFrame({
        "y_true": y_true,
        "y_pred": y_pred
    })
    # Create a wandb.Table
    table = wandb.Table(dataframe=df)
    # Log the table
    wandb.log({name: table})


def log_metrics(y_true, y_pred, name):

    rmse = np.sqrt(np.mean((y_true - y_pred)**2))
    classification_report_ = classification_report(y_true, y_pred, output_dict=True)

    # Create a wandb.Table
    table = wandb.Table(dataframe=pd.DataFrame(classification_report_).transpose())
    # Log the table
    wandb.log({name: table})
    wandb.log({name + "_rmse": rmse})

In [5]:
# Define a config dictionary object
config = {
  "random_state": 2022,
    "max_depth": 2
}

def train(config):

    # WANDB RUN
    run = wandb.init(project=params.WANDB_PROJECT, entity=params.ENTITY, job_type="training-xgboost", config=config)
    config = wandb.config

    # Load the data
    df = load_data("data/RTA Dataset Transformed.csv")
    
    # Preprocess the data
    X_train, X_valid, X_test, y_train, y_valid, y_test = preprocess_data(df)
    
    # Train the model
    xgboost = xgb.XGBClassifier(random_state=wandb.config['random_state'], 
                                max_depth=wandb.config['max_depth'])
    
    xgboost = xgboost.fit(X_train, y_train)
    
    # Validation predictions
    y_pred = xgboost.predict(X_valid)
    # log the predictions
    log_predictions(y_valid, y_pred, name='valid')

    # Test predictions
    y_pred = xgboost.predict(X_test)
    # log the predictions 
    log_predictions(y_test, y_pred, name='test')

    # Log the metrics
    log_metrics(y_valid, y_pred, name='valid')
    log_metrics(y_test, y_pred, name='test')

    wandb.finish()

train(config)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkrishnatasya[0m ([33mblack-order[0m). Use [1m`wandb login --relogin`[0m to force relogin


0,1
test_rmse,▁
valid_rmse,▁

0,1
test_rmse,0.43019
valid_rmse,0.44594
