In [152]:
import numpy as np
import pandas as pd

In [153]:
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')

In [154]:
# object型の列名を取得するリストを作成
object_columns = []

# for文を使ってobject型の列を取得
for col in train.columns:
    if train[col].dtype == 'object':
        object_columns.append(col)

#取得した列にダミー変数を入れる
del object_columns[0]#先頭の'id'列を削除
cat_col = object_columns
X=pd.get_dummies(train,columns=cat_col)

In [155]:
X = X.drop(['id'],axis=1)

In [156]:
#K近傍法を実施
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(X)

In [157]:
columns = X.columns
index = X.index

imputed_df = pd.DataFrame(imputed_data, columns=columns, index=index)

In [158]:
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

In [159]:
# 共通の列を抽出
common_columns = train.columns.intersection(test.columns)

# 訓練データとテストデータの共通列だけを抽出
train_common = train[common_columns]
test_common = test[common_columns]



# 訓練データ（およびテストデータ）のobject型の列名を取得するリストを作成
object_columns = []

# for文を使ってobject型の列を取得
for col in train_common.columns:
    if train_common[col].dtype == 'object':
        object_columns.append(col)



#訓練データとテストデータを合わせてダミー変数を入れる
data_combined = pd.concat([train_common, test_common], axis=0)
if object_columns:  # 空でない場合のみダミー変数化
    data_combined = pd.get_dummies(data_combined, columns=object_columns)
train_data = data_combined.iloc[:len(train_common), :]
test_data = data_combined.iloc[len(train_common):, :]


In [160]:
#データ中のNaNを0に変換
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [161]:
# 訓練データにsii列を追加
train_data['sii'] = imputed_df['sii']

In [162]:
train_data['sii'] = train_data['sii'].astype(int)

In [163]:
X = train_data
columns = X.columns
index = X.index

X = pd.DataFrame(X, columns=columns, index=index)

In [164]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

np.random.seed(42)

In [165]:
#NaNを0に置き換える
X = np.nan_to_num(X, nan=0)

In [166]:
print(train_data.isnull().sum().sum())  # 0 でなければ欠損値が残っている
print(test_data.isnull().sum().sum())  # 0 でなければ欠損値が残っている

0
0


In [167]:
# 特徴データとラベルをX, yに格納
Y_column = 'sii'
y = train_data[Y_column].values
X = train_data.drop(Y_column, axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)
print('訓練データ数 : {}, テストデータ数 : {}'.format(len(X_train), len(X_test)))

transformer = StandardScaler()
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

訓練データ数 : 2970, テストデータ数 : 990


In [168]:
#ランダムフォレスト
#params = {"max_depth": [2, 4, 6], "n_estimators": [10, 50]}
#params = {"max_depth":[4, 6, 8, 10], "n_estimators":[50, 100, 200]} #探索したいパラメータのdict
params = {"max_depth":[10, 20, 50], "n_estimators":[100, 200, 500]}
rf = RandomForestClassifier(random_state=42) #Random Forestのインスタンス
clf = GridSearchCV(estimator=rf, param_grid=params, scoring='accuracy', cv=5) #CVにより最適なパラメータを探索してくれるインスタンス
clf.fit(X_train, y_train)
pd.DataFrame(clf.cv_results_)[['rank_test_score', 'params', 'mean_test_score', 'std_test_score']].sort_values(by=["rank_test_score"], ascending=True)#CVの結果

Unnamed: 0,rank_test_score,params,mean_test_score,std_test_score
6,1,"{'max_depth': 50, 'n_estimators': 100}",0.628956,0.015155
8,2,"{'max_depth': 50, 'n_estimators': 500}",0.628283,0.010345
0,3,"{'max_depth': 10, 'n_estimators': 100}",0.627273,0.006867
1,4,"{'max_depth': 10, 'n_estimators': 200}",0.626599,0.005039
7,5,"{'max_depth': 50, 'n_estimators': 200}",0.625926,0.011438
2,5,"{'max_depth': 10, 'n_estimators': 500}",0.625926,0.003434
3,7,"{'max_depth': 20, 'n_estimators': 100}",0.625253,0.005387
5,8,"{'max_depth': 20, 'n_estimators': 500}",0.624242,0.005259
4,9,"{'max_depth': 20, 'n_estimators': 200}",0.620202,0.006784


In [169]:
y_pred = clf.predict(X_test) #GridSearchCVはそのまま一番良いパラメータのestimatorとして使える
print(classification_report(y_test, y_pred)) #各指標を計算
print(confusion_matrix(y_test, y_pred)) #コンヒュージョンマトリクスを表示

              precision    recall  f1-score   support

           0       0.66      0.90      0.77       612
           1       0.36      0.19      0.25       279
           2       0.27      0.03      0.06        90
           3       0.00      0.00      0.00         9

    accuracy                           0.62       990
   macro avg       0.32      0.28      0.27       990
weighted avg       0.54      0.62      0.55       990

[[553  55   4   0]
 [223  53   3   0]
 [ 52  35   3   0]
 [  5   3   1   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [170]:
# テストデータに標準化を適用
X_test_standardized = transformer.transform(test_data)



In [171]:
# 予測結果を取得
result = clf.predict(X_test_standardized)

In [172]:
print(result)

[2 0 0 0 0 1 0 0 0 1 1 0 1 1 0 2 0 0 0 0]


In [173]:
df = pd.DataFrame(test['id'], columns=['id'])

In [174]:
result = pd.DataFrame(result, columns=['sii'])

In [175]:
df_result = pd.concat([df, result], axis=1)

In [176]:
df_result.head(50)

Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,0
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,0
9,0083e397,1


In [177]:
df_result.to_csv("submission.csv", index=None)