In [1]:
import lightgbm as lgb
import pandas as pd

# 讀取資料集

In [2]:
dataset = pd.read_csv("train.csv")
X_train = dataset.drop(["PerNo", "PerStatus", "歸屬部門"], axis=1)
y_train = dataset["PerStatus"]

X_test = pd.read_csv("test.csv").drop(["PerNo", "PerStatus", "歸屬部門"], axis=1)

# Fill Missing Data

In [3]:
categorical_feature = ["工作分類", "廠區代碼", "工作地點", "畢業學校類別", "畢業科系類別"]

for cat in categorical_feature:
    X_train[cat] = X_train[cat].fillna(X_train[cat].value_counts().index[0])    # most frequent
    X_test[cat] = X_test[cat].fillna(X_test[cat].value_counts().index[0])

X_train = X_train.fillna(X_train.mean())    # mean
X_test = X_test.fillna(X_test.mean())

# Regularization

In [4]:
from sklearn.preprocessing import StandardScaler

X_train_values = X_train.values
X_test_values = X_test.values
scaler = StandardScaler()

X_train_values = scaler.fit_transform(X_train_values)
X_test_values = scaler.transform(X_test_values)

X_train = pd.DataFrame(data=X_train_values, columns=X_train.columns)
X_test = pd.DataFrame(data=X_test_values, columns=X_test.columns)

# 訓練 LightGBM

In [5]:
weight = []
for i in range(len(y_train)):
    if y_train[i] == 1:
        weight.append(13)
    else:
        weight.append(1)

In [6]:
train_data = lgb.Dataset(data=X_train, label=y_train, feature_name=X_train.columns.to_list(), categorical_feature=categorical_feature, weight=weight)
param = {
    'objective': 'binary',
}

num_round = 10
bst = lgb.train(param, train_data, num_round)

[LightGBM] [Info] Number of positive: 796, number of negative: 13596
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 460
[LightGBM] [Info] Number of data points in the train set: 14392, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.432175 -> initscore=-0.272982
[LightGBM] [Info] Start training from score -0.272982


In [7]:
y_pred = bst.predict(X_test)

In [8]:
for i in range(len(y_pred)):
    y_pred[i] = 1 if y_pred[i] >= 0.5 else 0

y_pred = y_pred.astype("int")

In [9]:
y_pred.sum()

539

# 儲存預測結果

In [10]:
submission = pd.read_csv("submission.csv").drop(["PerStatus"], axis=1)
submission.insert(loc=1, column="PerStatus", value=y_pred)
submission.to_csv("mysubmission.csv", index=False)