<a href="https://colab.research.google.com/github/Keita-S593/rindoku/blob/main/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

参考: https://www.kaggle.com/code/kawakeee/titanic-lightgbm

# インポート

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score
import lightgbm as lgb

# 学習データ読み込み

In [None]:
# Read the Titanic training CSV (the path assumes Google Drive is mounted at /content/drive).
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/輪読/titanic/train.csv')

# 前処理

In [None]:
def preprocess(df):
    """Return a copy of ``df`` with ``Sex`` and ``Embarked`` encoded as integers.

    ``Sex`` maps male -> 0, female -> 1; ``Embarked`` maps S -> 0, C -> 1, Q -> 2.
    Values that are not in a mapping (including missing values) are left as-is.

    The input frame is NOT modified: a new DataFrame is returned, so the
    function is safe to re-run on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Sex`` and/or ``Embarked`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns.
    """
    encodings = {
        'Sex': {"male": 0, "female": 1},
        'Embarked': {"S": 0, "C": 1, "Q": 2},
    }
    # infer_objects() keeps the encoded columns numeric on pandas versions
    # where replace() no longer downcasts object columns automatically.
    return df.replace(encodings).infer_objects()

In [None]:
# Encode the categorical columns of the training frame.
df = df.pipe(preprocess)

In [None]:
# Model inputs are the seven feature columns below; the target is `Survived`.
features_col = [
    "Pclass",
    "Age",
    "Sex",
    "Fare",
    "SibSp",
    "Parch",
    "Embarked",
]
X = df.loc[:, features_col]
y = df.loc[:, 'Survived']

In [None]:
# Hold out 20% of the rows for validation, stratified on y, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
    stratify=y,
)

# 学習

In [None]:
def each_threshold(y, y_pred):
    """Compute recall, precision and F1 for thresholds 0.01, 0.02, ..., 0.98.

    A sample is predicted positive when its score is strictly greater than
    the threshold. Entries of ``y`` equal to 1 are the positive class.

    Parameters
    ----------
    y : array-like
        Ground-truth binary labels (0/1).
    y_pred : array-like
        Predicted scores, one per label.

    Returns
    -------
    tuple of numpy.ndarray
        ``(thresholds, recalls, precisions, f1s)``, each of length 98.
        A metric whose denominator would be zero is reported as 0.
    """
    eps = 1e-16  # keeps the divisions defined when a denominator is zero

    # Work on plain arrays so pandas index alignment cannot interfere.
    is_positive = np.asarray(y).ravel() == 1
    scores = np.asarray(y_pred).ravel()

    ths = []
    recalls = []
    precisions = []
    f1s = []
    for th_int in range(1, 99):
        th = th_int / 100.
        predicted_positive = scores > th

        # Counting directly (rather than unpacking a confusion matrix) also
        # works when only one class is present in the labels/predictions.
        tp = np.count_nonzero(predicted_positive & is_positive)
        fp = np.count_nonzero(predicted_positive & ~is_positive)
        fn = np.count_nonzero(~predicted_positive & is_positive)

        rec = tp / (tp + fn + eps)
        pre = tp / (tp + fp + eps)

        ths.append(th)
        recalls.append(rec)
        precisions.append(pre)
        # F1 is the harmonic mean of precision and recall: 2PR / (P + R).
        f1s.append(2 * rec * pre / (rec + pre + eps))

    return np.array(ths), np.array(recalls), np.array(precisions), np.array(f1s)

def max_score_with_thre(ths, scores):
    """Return the largest score together with the threshold at the same position.

    Parameters
    ----------
    ths : array-like
        Thresholds, aligned element-for-element with ``scores``.
    scores : array-like
        Metric values, one per threshold.

    Returns
    -------
    tuple
        ``(max_score, threshold_at_max)``; on ties the first maximum is used.
    """
    best_index = np.argmax(scores)
    best_score = np.max(scores)
    return best_score, ths[best_index]

def metrics(y, y_pred):
    """Print a threshold-sweep evaluation report for binary scores.

    Reports the ROC AUC, the maximum recall / precision / f1 value returned by
    ``each_threshold`` with the threshold at which each occurs, and the
    confusion matrix obtained at the f1-maximising threshold.

    Parameters
    ----------
    y : array-like
        Ground-truth binary labels.
    y_pred : numpy.ndarray
        Predicted scores, one per label.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    auc_value = roc_auc_score(y, y_pred)

    thresholds, recall_curve, precision_curve, f1_curve = each_threshold(y, y_pred)

    best_recall, best_recall_th = max_score_with_thre(thresholds, recall_curve)
    best_precision, best_precision_th = max_score_with_thre(thresholds, precision_curve)
    best_f1, best_f1_th = max_score_with_thre(thresholds, f1_curve)

    # Binarise the scores at the f1-maximising threshold for the final matrix.
    labels_at_best_f1 = (y_pred > best_f1_th).astype(int)
    f1_confusion = confusion_matrix(y, labels_at_best_f1)

    print(f"Max recall is {best_recall} at  threshold {best_recall_th}.")
    print(f"Max precision is {best_precision} at  threshold {best_precision_th}.")
    print(f"Max f1 is {best_f1} at  threshold {best_f1_th}.")
    print("roc auc:", auc_value)
    print("f1-based confusion matrix:\n", f1_confusion)

In [None]:
# Columns LightGBM should treat as categorical rather than ordinal.
cat_list = ["Pclass","Sex", "SibSp", "Parch", "Embarked"]

# Categorical columns are declared on the training Dataset; the validation
# Dataset references it so both share the same feature binning.
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=cat_list)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 200,
    'learning_rate': 0.003,
    'feature_fraction': 0.52,
    'bagging_fraction': 0.79,
    'bagging_freq': 7,
    'verbose': 0
}

# The number of boosting rounds is stated once, here. Previously `params`
# carried num_iterations=100 alongside num_boost_round=5000; LightGBM uses the
# value found in params, so training always ran 100 rounds. That effective
# value is kept.
# Early stopping is passed as a callback because the `early_stopping_rounds`
# keyword was removed from lgb.train in LightGBM 4.0. A patience of 1000 cannot
# trigger within 100 rounds; the callback is kept for its best-iteration
# tracking (NOTE(review): confirm predict() should use the best iteration).
lgb_clf = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=1000)])

[1]	valid_0's auc: 0.661528
Training until validation scores don't improve for 1000 rounds.
[2]	valid_0's auc: 0.687879
[3]	valid_0's auc: 0.869763
[4]	valid_0's auc: 0.847892
[5]	valid_0's auc: 0.825494
[6]	valid_0's auc: 0.823123
[7]	valid_0's auc: 0.805995
[8]	valid_0's auc: 0.806588
[9]	valid_0's auc: 0.794466
[10]	valid_0's auc: 0.828524
[11]	valid_0's auc: 0.821146
[12]	valid_0's auc: 0.841436
[13]	valid_0's auc: 0.853557
[14]	valid_0's auc: 0.86502
[15]	valid_0's auc: 0.861331
[16]	valid_0's auc: 0.860408
[17]	valid_0's auc: 0.858037
[18]	valid_0's auc: 0.851713
[19]	valid_0's auc: 0.852108
[20]	valid_0's auc: 0.850264
[21]	valid_0's auc: 0.861858
[22]	valid_0's auc: 0.85527
[23]	valid_0's auc: 0.850922
[24]	valid_0's auc: 0.859618
[25]	valid_0's auc: 0.868841
[26]	valid_0's auc: 0.858827
[27]	valid_0's auc: 0.854875
[28]	valid_0's auc: 0.852635
[29]	valid_0's auc: 0.858169
[30]	valid_0's auc: 0.865942
[31]	valid_0's auc: 0.87332
[32]	valid_0's auc: 0.880698
[33]	valid_0's auc: 

New categorical_feature is ['Embarked', 'Parch', 'Pclass', 'Sex', 'SibSp']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [None]:
# Evaluate the trained booster on the held-out validation split.
metrics(y=y_valid, y_pred=lgb_clf.predict(X_valid))

Max recall is 1.0 at  threshold 0.01.
Max precision is 1.0 at  threshold 0.43.
Max f1 is 0.3868613138686131 at  threshold 0.41.
roc auc: 0.900197628458498
f1-based confusion matrix:
 [[95 15]
 [16 53]]


# 推論

In [None]:
# Read the Kaggle test set and the sample submission used as the output template
# (paths assume Google Drive is mounted at /content/drive).
df_test = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/輪読/titanic/test.csv')
sub = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/輪読/titanic/gender_submission.csv')

In [None]:
# Apply the same encoding as for training, then keep only the model's feature columns.
df_test = df_test.pipe(preprocess)
X_test = df_test.loc[:, features_col]

閾値はf1が最大になる0.41を採用

In [None]:
# Binarise the test scores at 0.41, the threshold chosen from the validation f1 sweep.
y_pred_test = (lgb_clf.predict(X_test) > 0.41).astype(int)

In [None]:
# Replace the template's Survived column with the model predictions, then display the frame.
sub = sub.assign(Survived=y_pred_test)
sub

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf='/content/drive/MyDrive/輪読/submit/submit_baseline.csv', index=False)

スコアは0.7655