# Basic Cost function

## Data Preparation 

In [1]:
import pandas as pd
import numpy as np
import matplotlib

In [43]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [44]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
x_total = pd.get_dummies(train.drop('Name',axis=1))

In [46]:
from sklearn.model_selection import train_test_split
# Train test validation Split
labels=x_total['Survived']
X_total= x_total.drop(['Survived'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_total, labels, test_size=0.20, random_state=42)

In [47]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,Ticket_110413,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
331,332,1,45.5,0,0,28.5,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
733,734,2,23.0,0,0,13.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
382,383,3,32.0,0,0,7.925,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
704,705,3,26.0,1,0,7.8542,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
813,814,3,6.0,4,2,31.275,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [48]:
y_train.head()

331    0
733    0
382    0
704    0
813    0
Name: Survived, dtype: int64

## lgbm 

lgbm or other boosting methods use the parameter eval_metric to select the loss metric.

- rmse: square root of mean square error
- mae: mean absolute error
- logloss: cross entropy
- error: wrong samples over all samples for 2 classes
- merror: wrong samples over all samples for more than 2 classes
- mlogloss: cross entropy for more than 2 classes
- auc: area under the ROC curve

The ROC curve is created by plotting the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings.

In [79]:
import lightgbm

# default lightgbm model with sklearn api
gbm = lightgbm.LGBMClassifier() 

# fitting model 
gbm.fit(X_train,y_train,eval_metric='error')

y_pred = gbm.predict(X_test)

In [88]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, auc, roc_curve

def performance(pred,real):
    cmatrix1 = confusion_matrix(pred, real)
    print(cmatrix1)
    accuracy = accuracy_score(pred, real)
    recall = recall_score(pred, real)
    precision = precision_score(pred, real)
    f1 = f1_score(pred, real)
    fpr, tpr, thresholds = roc_curve(real, pred)
    auc_value= auc(fpr,tpr)
    print('accuracy: ', accuracy)
    print('precision: ', precision)
    print('recall: ', recall)
    print('F1: ', f1)
    print('auc: ', auc_value)

In [89]:
performance(y_pred,y_test)

[[88 19]
 [17 55]]
accuracy:  0.7988826815642458
precision:  0.7432432432432432
recall:  0.7638888888888888
F1:  0.7534246575342465
auc:  0.7906692406692406


In [90]:
gbm = lightgbm.LGBMClassifier() 

# fitting model 
gbm.fit(X_train,y_train,eval_metric='auc')

y_pred = gbm.predict(X_test)
performance(y_pred,y_test)

[[88 19]
 [17 55]]
accuracy:  0.7988826815642458
precision:  0.7432432432432432
recall:  0.7638888888888888
F1:  0.7534246575342465
auc:  0.7906692406692406


# Self-defined function

## lgbm and boosting 

1. training loss: In LightGBM, we need to define a function to calculate gradient and hessian values by ourselves. The function needs y_true and y_pred as inputs and outputs the gradient and hessian. 


2. validation loss or test loss: In lgbm, we need to define a function to calculate the average of loss. The function outputs three values: requested string, average loss and a bool value to judge the state. 

In [91]:
def custom_asymmetric_train(y_true, y_pred):
    residual = (y_true - y_pred).astype("float")
    grad = np.where(residual<0, -2*10.0*residual, -2*residual)
    hess = np.where(residual<0, 2*10.0, 2.0)
    return grad, hess

def custom_asymmetric_valid(y_true, y_pred):
    residual = (y_true - y_pred).astype("float")
    loss = np.where(residual < 0, (residual**2)*10.0, residual**2) 
    return "custom_asymmetric_eval", np.mean(loss), False

In [92]:
import lightgbm

# default lightgbm model with sklearn api
gbm = lightgbm.LGBMClassifier() 

# updating objective function to custom
# default is "regression"
# also adding metrics to check different scores
gbm.set_params(**{'objective': custom_asymmetric_train}, metrics = ["mse", 'mae'])

# fitting model 
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=custom_asymmetric_valid,
    verbose=False,
)

y_pred = gbm.predict(X_test)

In [93]:
performance(y_pred,y_test)

[[101  37]
 [  4  37]]
accuracy:  0.770949720670391
precision:  0.5
recall:  0.9024390243902439
F1:  0.6434782608695653
auc:  0.7309523809523809


## xgboost 

In [96]:
import xgboost as xgb
def customObj1(real, predict):
    grad = predict - real
    hess = np.power(np.abs(grad), 0.5)
    return grad, hess

- use the parameter objective to define your own loss function 

In [99]:
model = xgb.XGBClassifier(objective=customObj1,
                         booster="gblinear")

In [100]:
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
performance(y_pred,y_test)

[[102  54]
 [  3  20]]
accuracy:  0.6815642458100558
precision:  0.2702702702702703
recall:  0.8695652173913043
F1:  0.41237113402061853
auc:  0.6208494208494207
