In [44]:
!pip install lightgbm



In [45]:
import numpy as np
import pandas as pd

In [46]:
# Read the competition training table into a DataFrame.
train = pd.read_csv(
    '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'
)

In [47]:
# Collect the object-dtype columns that should be one-hot encoded.
# 'id' is a row identifier rather than a feature, so it is excluded by name
# instead of relying on it being the first object column in the table.
object_columns = [
    col for col in train.columns
    if train[col].dtype == 'object' and col != 'id'
]

# One-hot encode the selected columns; every other column (including 'id')
# is passed through unchanged.
cat_col = object_columns
X = pd.get_dummies(train, columns=cat_col)

In [48]:
# Remove the identifier column so only numeric/encoded columns remain.
X = X.drop(columns=['id'])

In [49]:
# Fill missing values with k-nearest-neighbours imputation (5 neighbours).
# X still contains the 'sii' column at this point (it is read back from the
# imputed frame in a later cell), so missing 'sii' values are imputed as well.
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(X)

In [50]:
# Re-attach the original column labels and row index to the imputed values.
columns, index = X.columns, X.index
imputed_df = pd.DataFrame(data=imputed_data, index=index, columns=columns)

In [51]:
# Read the competition test table into a DataFrame.
test = pd.read_csv(
    '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv'
)

In [52]:
# Restrict both tables to the columns they share, so the model is trained only
# on columns that are also present in the test table.
common_columns = train.columns.intersection(test.columns)
train_common = train[common_columns]
test_common = test[common_columns]

# Object-dtype columns to one-hot encode. 'id' is a row identifier, not a
# feature: encoding it would create one indicator column per row, so it is
# excluded here and dropped from the feature matrix below.
object_columns = [
    col for col in train_common.columns
    if train_common[col].dtype == 'object' and col != 'id'
]

# Stack train on top of test and encode them together so that both parts end
# up with exactly the same dummy columns.
data_combined = pd.concat([train_common, test_common], axis=0)
data_combined = data_combined.drop(columns=['id'], errors='ignore')
if object_columns:  # only encode when there is something to encode
    data_combined = pd.get_dummies(data_combined, columns=object_columns)

# Split back by position: the first len(train_common) rows are the train rows.
n_train = len(train_common)
train_data = data_combined.iloc[:n_train, :]
test_data = data_combined.iloc[n_train:, :]


In [53]:
# Replace every remaining NaN in both feature tables with 0.
train_data, test_data = train_data.fillna(0), test_data.fillna(0)

In [54]:
# Append the (KNN-imputed) 'sii' column to the training features; values are
# aligned on the row index.
train_data = train_data.assign(sii=imputed_df['sii'])

In [55]:
#train_data['sii'] = train_data['sii'].astype(int)

In [56]:
# Rebuild the modelling frame from train_data, keeping its labels and index.
columns, index = train_data.columns, train_data.index
X = pd.DataFrame(train_data, columns=columns, index=index)

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Start from the assembled training frame.
df = X

# The target is the 'sii' column; every other column is a feature.
y = df['sii']
X = df.drop(columns=['sii'])

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import mean_squared_error

# Wrap the splits in LightGBM datasets. The training dataset gets its own name
# so the `train_data` feature DataFrame built in an earlier cell is not
# overwritten with a different kind of object (which would break re-running
# those earlier cells).
lgb_train = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, free_raw_data=False)

# Hyper-parameters: squared-error regression evaluated with RMSE.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42
}

# Train for up to 1000 rounds, stopping once the validation score has not
# improved for 50 consecutive rounds; metrics are logged every 100 rounds.
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, val_data],
    num_boost_round=1000,
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)

# Predict on the hold-out rows using the best iteration found.
y_pred = model.predict(X_val, num_iteration=model.best_iteration)

# Compute RMSE as sqrt(MSE). The `squared=False` argument of
# mean_squared_error is deprecated and removed in newer scikit-learn releases,
# so it is avoided here.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019854 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6006
[LightGBM] [Info] Number of data points in the train set: 3168, number of used features: 88
[LightGBM] [Info] Start training from score 0.594571
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[25]	training's rmse: 0.491603	valid_1's rmse: 0.586948
Validation RMSE: 0.5869477317640935


In [59]:
# Use the prepared test feature table (built earlier from the shared columns).
test_df = test_data

# Predict on the test features with the best iteration from early stopping.
predictions = model.predict(test_df, num_iteration=model.best_iteration)

In [60]:
# Wrap the raw predictions in a single-column DataFrame named 'sii'.
predictions = pd.DataFrame(data=predictions, columns=['sii'])

In [61]:
# One-column frame holding the test row identifiers.
df = test['id'].to_frame(name='id')

In [62]:
# Place the ids and the predicted 'sii' side by side (aligned on the row index).
df_predictions = pd.concat(
    [df, predictions],
    axis='columns',
)

In [65]:
# Preview up to the first 50 rows of the submission table.
df_predictions.head(50)

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,0
5,001f3379,0
6,0038ba98,0
7,0068a485,0
8,0069fbed,0
9,0083e397,0


In [64]:
# Convert the continuous regression outputs to integer labels.
# Round to the nearest integer first: a bare astype(int) truncates toward
# zero, so a prediction such as 0.9 would become 0 instead of 1. The result is
# then clipped to the label range observed in the training data.
sii_min, sii_max = train['sii'].min(), train['sii'].max()
df_predictions['sii'] = (
    df_predictions['sii']
    .round()
    .clip(lower=sii_min, upper=sii_max)
    .astype(int)
)

In [67]:
# Write the submission file without the row index.
df_predictions.to_csv("submission.csv", index=False)