# 【第47回_Beginner限定コンペ】医療保険の費用帯予測
- **目的**： 医療データを使って保険の費用帯を予測しよう！
- **評価指標**：「F1score」（マクロ平均）
- **目的変数**： **"charge"**（価格帯0（低）、1（中）、2（高））

## メモ
- **データ概要**
 - **訓練データ**： 1600行、欠損値なし
 - **テストデータ**： 400行、欠損値なし
- **EDA**
 - 目的変数
   - "0"が1256件、それ以外が100件ずつほどの規模感
   - "age"で層別した結果、関連性はあまり強くはなさそう
   - "bmi"で層別したが、関係性はありそう（目的変数によって分布の山が異なる）
- **モデル**
 - **OneVsRestClassifier**という多クラス分類手法
 - 回帰分析にはロジスティック回帰
- **課題**
 - 交差検証
  - 試行タイミングによって

In [37]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [38]:
# Google Colaboratoryで作業する場合はこちらも実行してください。
from google.colab import drive
drive.mount('/content/drive')
# %cd 以降にこのnotebookを置いているディレクトリを指定してください。
%cd "/content/drive/MyDrive"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive


In [39]:
# データの読み込み
# INPUT_DIRにtrain.csvなどのデータを置いているディレクトリを指定してください。
INPUT_DIR = "./Colab Notebooks/SIGNATE/医療保険の費用帯予測_202406/"

df_train = pd.read_csv(INPUT_DIR + "train.csv")
df_test = pd.read_csv(INPUT_DIR + "test.csv")
sample_sub = pd.read_csv(INPUT_DIR + "sample_submit.csv", header=None)
df_train.head()

Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges
0,0,26,male,32.665465,3,no,southeast,0
1,1,41,male,29.798725,1,no,southwest,0
2,2,28,male,32.722029,0,yes,northwest,1
3,3,20,female,38.429831,2,no,southeast,0
4,4,45,female,29.641854,1,no,northwest,0


In [40]:
sample_sub.head()

Unnamed: 0,0,1
0,13,0
1,23,0
2,27,0
3,28,0
4,29,0


In [41]:
sample_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       400 non-null    int64
 1   1       400 non-null    int64
dtypes: int64(2)
memory usage: 6.4 KB


In [42]:
# カテゴリカル変数をダミー化
df_train_dummies = pd.get_dummies(df_train, drop_first=True, dtype=int)
df_train_dummies.head()

Unnamed: 0,id,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,0,26,32.665465,3,0,1,0,0,1,0
1,1,41,29.798725,1,0,1,0,0,0,1
2,2,28,32.722029,0,1,1,1,1,0,0
3,3,20,38.429831,2,0,0,0,0,1,0
4,4,45,29.641854,1,0,0,0,1,0,0


In [43]:
# 特徴量を減らす
df_train_selected_01 = df_train_dummies.drop(columns=[
    'id', 'children'
    # ,'region_northwest', 'region_southwest'
    , 'region_southeast'
    ])
df_train_selected_01.head()

Unnamed: 0,age,bmi,charges,sex_male,smoker_yes,region_northwest,region_southwest
0,26,32.665465,0,1,0,0,0
1,41,29.798725,0,1,0,0,1
2,28,32.722029,1,1,1,1,0
3,20,38.429831,0,0,0,0,0
4,45,29.641854,0,0,0,1,0


In [44]:
# 機械学習ライブラリ
import sklearn
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [45]:
# データを説明変数と目的変数に分割
X = df_train_selected_01.drop(columns=['charges']).values
y = df_train_selected_01['charges'].values

In [46]:
# ライブラリのインポート
# モデル
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# モデル評価
# from sklearn.metrics import plot_roc_curve, roc_curve, precision_recall_curve
from sklearn.metrics import roc_auc_score, accuracy_score, hamming_loss
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report

In [47]:
pip install iterative-stratification



**交差検証しようと思ったが諦める**  
参考
*   https://qiita.com/F8LUUI5kOxLvrmuIAIPwFsUWSKNdgW5N/items/b2a0a70753f47e4e2847
*   https://qiita.com/studio_haneya/items/6cfb25793611b2febc0c

In [48]:
# # OneVsRestClassifierという多クラス分類手法を使用
# # LogisticRegressionという回帰分析のモデルを学習
# # モデル作成
# clf = OneVsRestClassifier(LogisticRegression(random_state=0))

# # 多クラス分類での交差検証
# from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# msss = MultilabelStratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)

# for train_index, test_index in msss.split(X, y):
#   #  print("TRAIN:", train_index, "TEST:", test_index)
#    X_train, X_test = X[train_index], X[test_index]
#    y_train, y_test = y[train_index], y[test_index]
#    clf.fit(X_train, y_train)
#    y_pred = clf.predict(X_test)
#    print(classification_report(y_test, y_pred,target_names=["0","1","2"]))
#    print(classification_report(y_test, y_pred,target_names=label_names))

**シンプルにとりあえずモデルを構成してみる**  
参考：https://qiita.com/jingwora/items/a708f60d34475dd2feef

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
# データ分割
X_train, X_test, y_train, y_test = train_test_split(X,y)
# モデル作成
clf = OneVsRestClassifier(LogisticRegression(random_state=0))
clf.fit(X_train, y_train)
# モデル推論
scores = clf.predict_proba(X_test)
preds = clf.predict(X_test)

print(f"scores sample: {scores[0]}")
print(f"preds sample: {preds[0]}")

scores sample: [0.92420786 0.07064342 0.00514872]
preds sample: 0


In [51]:
# F1スコアで評価
y_pred = clf.predict(X_test)
label_names = ["0", "1", "2"]
print(classification_report(y_test, y_pred,target_names=label_names))

              precision    recall  f1-score   support

           0       0.92      0.96      0.93       316
           1       0.59      0.42      0.49        53
           2       0.73      0.77      0.75        31

    accuracy                           0.87       400
   macro avg       0.75      0.71      0.72       400
weighted avg       0.86      0.87      0.86       400



In [52]:
# テストデータについても同様の処理を行う
df_test_dummies = pd.get_dummies(df_test, drop_first=True, dtype=int)
df_test_dummies.head()

Unnamed: 0,id,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,13,19,24.365178,1,1,0,0,0,0
1,23,59,33.997763,0,1,1,0,0,0
2,27,42,29.28345,0,0,0,0,0,1
3,28,30,24.903725,1,1,0,1,0,0
4,29,41,29.644536,0,0,0,0,0,1


In [53]:
# 次元削除
df_test_selected_01 = df_test_dummies.drop(columns=[
    'id', 'children'
    # , 'region_northwest', 'region_southwest'
    , 'region_southeast'
    ])
df_test_selected_01.head()

Unnamed: 0,age,bmi,sex_male,smoker_yes,region_northwest,region_southwest
0,19,24.365178,1,0,0,0
1,59,33.997763,1,1,0,0
2,42,29.28345,0,0,0,1
3,30,24.903725,1,0,1,0
4,41,29.644536,0,0,0,1


In [54]:
X_submit = df_test_selected_01.values
y_submit = clf.predict(X_submit )
sample_sub.iloc[:, -1] = y_submit
sample_sub.head()

Unnamed: 0,0,1
0,13,0
1,23,2
2,27,0
3,28,0
4,29,0


In [61]:
# 提出用のcsvファイルを作成
sample_sub.to_csv('submission_991_05.csv',index=False, header=False)
from google.colab import files
files.download('submission_991_05.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**提出スコア（一部のみ評価）**   
0.8067549