In [1]:
cd /content/drive/MyDrive/プログラミング/SIGNATE/国勢調査_収入予測

/content/drive/MyDrive/プログラミング/SIGNATE/国勢調査_収入予測


In [2]:
import numpy as np
import pandas as pd
import re
import pickle
import gc

# scikit-learn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [20]:
train_data = pd.read_csv('train.csv')
display(train_data.head())

Unnamed: 0,index,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,native-country,Y
0,322,21,Private,Some-college,10,Divorced,Adm-clerical,Own-child,White,Female,United-States,0
1,11968,29,Private,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,United-States,0
2,10868,19,Private,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,United-States,0
3,3394,17,Private,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,United-States,0
4,15993,47,Private,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States,0


# 特徴量エンジニアリング

In [19]:
train_data.dtypes

Unnamed: 0,0
index,int64
age,int64
workclass,object
education,object
education-num,int64
marital-status,object
occupation,object
relationship,object
race,object
sex,object


In [21]:
# 文字列の列（カテゴリ変数）を自動検出
cat_columns = train_data.select_dtypes(include=['object']).columns

# カテゴリー変数に変換（Label Encoding）
for col in cat_columns:
    train_data[col] = train_data[col].astype('category')  # カテゴリ型に変換
    train_data[col] = train_data[col].cat.codes  # 数値に変換（Label Encoding）

# モデル作成・評価

In [24]:
# データの準備
X = train_data.drop(columns=['Y'])
y = train_data['Y']

# モデルパラメータ
params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "learning_rate": 0.1,
    "num_leaves": 31,
    "max_depth": -1,
    "random_state": 42
}

# 学習の実行
imp = train_lgb(X, y, params, list_nfold=[0,1,2,3,4], n_splits=5)

display(imp)

-------------------- 0 --------------------
(8834, 11) (2209, 11)
[LightGBM] [Info] Number of positive: 2147, number of negative: 6687
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000932 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 362
[LightGBM] [Info] Number of data points in the train set: 8834, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243038 -> initscore=-1.136094
[LightGBM] [Info] Start training from score -1.136094
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.251672	training's accuracy: 0.883292	valid_1's binary_logloss: 0.344212	valid_1's accuracy: 0.836125
Did not meet early stopping. Best iteration is:
[44]	training's binary_logloss: 0.290235	training's accuracy: 0.866086	valid_1's binary_logloss: 0.3399	valid_1's accuracy: 0.835672
-

Unnamed: 0,col,importance,imp_std
3,index,347.6,121.755082
0,age,314.6,54.307458
6,occupation,201.0,44.221036
2,education-num,139.0,20.603398
1,education,93.4,21.801376
4,marital-status,70.4,11.414903
8,relationship,70.4,9.343447
10,workclass,66.0,16.926311
9,sex,16.4,6.426508
7,race,13.2,2.774887


In [23]:
def train_lgb(input_x,
              input_y,
              params,
              list_nfold=[0,1,2,3,4],
              n_splits=5,
             ):
    metrics = []
    imp = pd.DataFrame()

    # cross-validation
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)

        # make dataset
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = input_x.loc[idx_tr, :], input_y[idx_tr]
        x_va, y_va = input_x.loc[idx_va, :], input_y[idx_va]
        print(x_tr.shape, x_va.shape)

        # カスタムメトリック関数
        def accuracy_metric(y_true, y_pred):
            y_pred_binary = (y_pred > 0.5).astype(int)
            accuracy = (y_true == y_pred_binary).mean()
            return 'accuracy', accuracy, True  # (名前, 値, 大きい方が良いか)

        # train
        model = lgb.LGBMClassifier(**params)

        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr,y_tr), (x_va,y_va)],
                  eval_metric=accuracy_metric,  # カスタムメトリックを指定
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=100, verbose=True),
                      lgb.log_evaluation(100),
                  ]
                 )

        fname_lgb = "/content/drive/MyDrive/プログラミング/SIGNATE/国勢調査_収入予測/model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "wb") as f:
            pickle.dump(model, f, protocol=4)

        # 予測と精度の計算
        y_tr_pred = (model.predict_proba(x_tr)[:, 1] > 0.5).astype(int)
        y_va_pred = (model.predict_proba(x_va)[:, 1] > 0.5).astype(int)

        tr_accuracy = accuracy_score(y_tr, y_tr_pred)
        va_accuracy = accuracy_score(y_va, y_va_pred)

        metrics.append([nfold, tr_accuracy, va_accuracy])

        # imp
        _imp = pd.DataFrame({"col":input_x.columns, "importance":model.feature_importances_, "nfold":nfold})
        imp = pd.concat([imp, _imp])

    print("-"*20, "result", "-"*20)
    # metric
    metrics = np.array(metrics)
    print(metrics)
    print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std(),
    ))

    # importance
    imp = imp.groupby("col")["importance"].agg(["mean", "std"]).reset_index(drop=False)
    imp.columns = ["col", "importance", "imp_std"]
    imp = imp.sort_values(by="importance", ascending=False)

    return imp

# 推論提出部分

In [27]:
def predict_lgb(input_x, list_nfold=[0, 1, 2, 3, 4]):
  # 推論値を格納する変数を作成
  test_pred = np.zeros((len(input_x), len(list_nfold)))

  for nfold in list_nfold:
    print('-'*20, nfold, '-'*20)
    with open("/content/drive/MyDrive/プログラミング/SIGNATE/国勢調査_収入予測/model_lgb_fold{}.pickle".format(nfold), "rb") as f:
      model = pickle.load(f)

    # 推論
    test_pred_fold = model.predict(input_x)

    # 1fold目の予測値を格納
    test_pred[:, nfold] = test_pred_fold

  # 各foldの推論値の平均値を算出
  test_pred_mean = test_pred.mean(axis=1)
  final_test_preds = (test_pred_mean > 0.791).astype(int)
  print('Done.')

  return final_test_preds

In [25]:
test_data = pd.read_csv('test.csv')
sample_sub = pd.read_csv('sample_submit.csv')

# 文字列の列（カテゴリ変数）を自動検出
cat_columns = test_data.select_dtypes(include=['object']).columns

# カテゴリー変数に変換（Label Encoding）
for col in cat_columns:
    test_data[col] = test_data[col].astype('category')  # カテゴリ型に変換
    test_data[col] = test_data[col].cat.codes  # 数値に変換（Label Encoding）

In [28]:
test_pred = predict_lgb(test_data, list_nfold=[0,1,2,3,4])

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [30]:
sample_sub

Unnamed: 0,3625,0
0,3028,0
1,13814,0
2,15398,0
3,13244,0
4,2516,0
...,...,...
4750,9821,0
4751,6726,0
4752,1501,0
4753,6391,0


In [32]:
test_pred

array([0, 0, 0, ..., 0, 0, 1])

In [33]:
sub_data = pd.concat([test_data['index'], pd.DataFrame(test_pred)], axis=1)
sub_data.columns = [3625, 0]  # 新しいカラム名をリストで指定
sub_data = sub_data.iloc[1:].reset_index(drop=True)

# ファイル出力
sub_data.to_csv("/content/drive/MyDrive/プログラミング/SIGNATE/国勢調査_収入予測/submission2月15日1回目.csv", index=None)

In [39]:
a = pd.read_csv('submission2月15日1回目.csv')

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
0,3797
1,958
