<a href="https://colab.research.google.com/github/Jakelinecs/Tareas-Machine-Learning/blob/main/N15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Load the train and test application tables from the working directory.
try:
    app_train = pd.read_csv('application_train.csv')
    app_test = pd.read_csv('application_test.csv')
except FileNotFoundError:
    # Print the hint, then re-raise. Calling exit() inside a notebook tears
    # down the kernel (losing all state) and hides the traceback, and the
    # empty placeholder frames assigned before it were never usable anyway.
    print("❌ CSVファイルが見つかりません。ファイルを配置してください。")
    raise

# Keep the ID columns and the label before the frames are merged; they are
# needed later to split the combined frame again and to build submissions.
train_ids = app_train['SK_ID_CURR']
test_ids = app_test['SK_ID_CURR']
y = app_train['TARGET']

# Stack the label-free training rows on top of the test rows so that the
# encoding steps below see the same set of categories for both.
app_train = app_train.drop('TARGET', axis=1)
data = pd.concat(
    [app_train, app_test],
    ignore_index=True,
)

# Object columns with at most two distinct values (NaN counts as a value)
# are label-encoded in place; every remaining object column is one-hot
# encoded by get_dummies afterwards.
le = LabelEncoder()
object_columns = [name for name in data.columns if data[name].dtype == 'object']
for name in object_columns:
    n_distinct = len(data[name].unique())
    if n_distinct > 2:
        continue
    # astype(str) turns missing values into a regular string category.
    data[name] = le.fit_transform(data[name].astype(str))

data = pd.get_dummies(data)

# 365243 is replaced by NaN in DAYS_EMPLOYED so it is imputed like any other
# missing value. The result is assigned back: an in-place call on
# `data[col]` operates on a temporary selection and is a silent no-op under
# pandas Copy-on-Write.
data['DAYS_EMPLOYED'] = data['DAYS_EMPLOYED'].replace({365243: np.nan})

# Fill every non-object column that still has gaps with its own median.
# NOTE(review): `data` holds train and test rows together, so these medians
# are computed over both sets.
for col in data.columns:
    if data[col].dtype != 'object' and data[col].isnull().any():
        data[col] = data[col].fillna(data[col].median())

# Recover the train/test partitions from the combined frame via their IDs.
is_train_row = data['SK_ID_CURR'].isin(train_ids)
is_test_row = data['SK_ID_CURR'].isin(test_ids)
app_train = data.loc[is_train_row].copy()
app_test = data.loc[is_test_row].copy()

# The ID column is an identifier, not a feature, so it is excluded here.
X = app_train.drop('SK_ID_CURR', axis=1).values
X_test = app_test.drop('SK_ID_CURR', axis=1).values

# Scale to [0, 1]; the scaler is fitted on the training matrix only and
# then applied unchanged to the test matrix.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

print(f"訓練データ最終形状: {X_scaled.shape}")

訓練データ最終形状: (250491, 242)


In [4]:
# Hold out 10% of the scaled training rows for validation.
X_train_val, X_val, y_train_val, y_val = train_test_split(
    X_scaled,
    y,
    random_state=42,
    test_size=0.1,
)

# Baseline: logistic regression with a small C (strong regularisation).
logreg = LogisticRegression(
    solver='liblinear',
    C=0.0001,
    random_state=42,
    max_iter=1000,
)
logreg.fit(X_train_val, y_train_val)

# Score the hold-out rows using the second predict_proba column.
val_proba = logreg.predict_proba(X_val)
y_pred_proba_val = val_proba[:, 1]
auc_baseline = roc_auc_score(y_val, y_pred_proba_val)

for report_line in (
    "\n--- ベースラインモデルの検証結果 ---",
    f"モデル: ロジスティック回帰",
    f"検証データ ROC AUC: {auc_baseline:.4f}",
):
    print(report_line)


--- ベースラインモデルの検証結果 ---
モデル: ロジスティック回帰
検証データ ROC AUC: 0.6749


In [5]:
# Predict on the scaled test matrix with the baseline model and keep the
# second predict_proba column as the submitted score.
test_proba = logreg.predict_proba(X_test_scaled)
y_pred_proba_test = test_proba[:, 1]

# Pair every test ID with its predicted score and write the CSV.
submission_baseline = pd.DataFrame({'SK_ID_CURR': test_ids}).assign(
    TARGET=y_pred_proba_test
)
submission_baseline.to_csv('submission_baseline.csv', index=False)

print("\n✅ ベースラインの提出ファイル (submission_baseline.csv) を作成しました。")
print("これをKaggleに提出することで提出フローを確認できます。")
print(submission_baseline.head())


✅ ベースラインの提出ファイル (submission_baseline.csv) を作成しました。
これをKaggleに提出することで提出フローを確認できます。
   SK_ID_CURR    TARGET
0      100001  0.067481
1      100005  0.121575
2      100013  0.088444
3      100028  0.059376
4      100038  0.122836


In [6]:
def feature_engineer(data_df):
    """Return a copy of ``data_df`` with three derived feature columns added.

    Added columns:
        AGE: ``DAYS_BIRTH / -365.25``.
        CREDIT_INCOME_RATIO: ``AMT_CREDIT / (AMT_INCOME_TOTAL + 1e-6)``.
        EXT_SOURCE_PROD: product of ``EXT_SOURCE_1``, ``EXT_SOURCE_2`` and
            ``EXT_SOURCE_3`` (NaN if any factor is NaN).

    In addition, the value 365243 in ``DAYS_EMPLOYED`` is replaced by NaN.

    Args:
        data_df: DataFrame containing the columns named above.

    Returns:
        A new DataFrame; ``data_df`` itself is left unmodified.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    df_fe = data_df.copy()

    df_fe['AGE'] = df_fe['DAYS_BIRTH'] / -365.25

    # The small epsilon keeps the ratio finite when the income is zero.
    df_fe['CREDIT_INCOME_RATIO'] = df_fe['AMT_CREDIT'] / (df_fe['AMT_INCOME_TOTAL'] + 1e-6)

    df_fe['EXT_SOURCE_PROD'] = df_fe['EXT_SOURCE_1'] * df_fe['EXT_SOURCE_2'] * df_fe['EXT_SOURCE_3']

    # Assign the result back instead of using inplace=True on the column
    # selection: the chained in-place form does not update df_fe under
    # pandas Copy-on-Write.
    df_fe['DAYS_EMPLOYED'] = df_fe['DAYS_EMPLOYED'].replace({365243: np.nan})

    return df_fe

# Collects one entry per experiment: AUC, feature description, model name.
fe_results = {}

from sklearn.ensemble import RandomForestClassifier

fe_results['P0_Baseline_LogReg'] = {'AUC': auc_baseline, 'Features': '全特徴量 (ベースライン前処理)', 'Model': 'LogisticRegression'}


# feature_engineer() works on its own copy of the frame, so no extra copy
# is made here.
data_fe = feature_engineer(data)

data_fe = pd.get_dummies(data_fe)
app_train_fe = data_fe[data_fe['SK_ID_CURR'].isin(train_ids)].copy()
app_test_fe = data_fe[data_fe['SK_ID_CURR'].isin(test_ids)].copy()

# Impute both partitions with the *training* median of each column.
# Results are assigned back (a chained `frame[col].fillna(..., inplace=True)`
# is a no-op under pandas Copy-on-Write), and a column is also handled when
# only the test partition has gaps, so no NaN reaches the scaler.
# NOTE(review): `data` appears to be median-imputed already by the earlier
# preprocessing cell, so this loop may find nothing to fill - confirm.
for col in app_train_fe.columns:
    if app_train_fe[col].isnull().any() or app_test_fe[col].isnull().any():
        median_val = app_train_fe[col].median()
        app_train_fe[col] = app_train_fe[col].fillna(median_val)
        app_test_fe[col] = app_test_fe[col].fillna(median_val)

# P1: random forest on every column of the engineered frame except the ID
# (and the label, should it be present).
non_feature_cols = {'SK_ID_CURR', 'TARGET'}
all_cols = [name for name in app_train_fe.columns if name not in non_feature_cols]

X_fe_1 = app_train_fe[all_cols].values
X_test_fe_1 = app_test_fe[all_cols].values

# Refit the shared scaler on this feature matrix, then apply it to test.
X_fe_1_scaled = scaler.fit_transform(X_fe_1)
X_test_fe_1_scaled = scaler.transform(X_test_fe_1)

X_train_val_1, X_val_1, y_train_val_1, y_val_1 = train_test_split(
    X_fe_1_scaled,
    y,
    random_state=42,
    test_size=0.1,
)

model_rf_1 = RandomForestClassifier(
    max_depth=5,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
model_rf_1.fit(X_train_val_1, y_train_val_1)

val_proba_1 = model_rf_1.predict_proba(X_val_1)
y_pred_1 = val_proba_1[:, 1]
auc_1 = roc_auc_score(y_val_1, y_pred_1)
fe_results['P1_All_FE_RF'] = dict(AUC=auc_1, Features='全特徴量 + 3新規特徴量', Model='RandomForest')


# P2: random forest restricted to columns whose name contains 'EXT_SOURCE'
# (which already covers 'EXT_SOURCE_PROD') or 'CREDIT_INCOME_RATIO'.
selected_markers = ('EXT_SOURCE', 'CREDIT_INCOME_RATIO')
ext_source_cols = [
    name for name in all_cols
    if any(marker in name for marker in selected_markers)
]

X_fe_2 = app_train_fe[ext_source_cols].values
X_test_fe_2 = app_test_fe[ext_source_cols].values

# Refit the shared scaler on this subset, then apply it to test.
X_fe_2_scaled = scaler.fit_transform(X_fe_2)
X_test_fe_2_scaled = scaler.transform(X_test_fe_2)

X_train_val_2, X_val_2, y_train_val_2, y_val_2 = train_test_split(
    X_fe_2_scaled,
    y,
    random_state=42,
    test_size=0.1,
)

model_rf_2 = RandomForestClassifier(
    max_depth=5,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
model_rf_2.fit(X_train_val_2, y_train_val_2)

val_proba_2 = model_rf_2.predict_proba(X_val_2)
y_pred_2 = val_proba_2[:, 1]
auc_2 = roc_auc_score(y_val_2, y_pred_2)
fe_results['P2_EXT_Only_FE_RF'] = dict(AUC=auc_2, Features='EXT_SOURCE, CREDIT_INCOME_RATIOのみ', Model='RandomForest')


# P3: logistic regression with the baseline hyper-parameters, trained on
# the P1 split (the split of X_fe_1_scaled).
logreg_2 = LogisticRegression(
    solver='liblinear',
    C=0.0001,
    random_state=42,
    max_iter=1000,
)
logreg_2.fit(X_train_val_1, y_train_val_1)

val_proba_3 = logreg_2.predict_proba(X_val_1)
y_pred_3 = val_proba_3[:, 1]
auc_3 = roc_auc_score(y_val_1, y_pred_3)
fe_results['P3_All_FE_LogReg'] = dict(AUC=auc_3, Features='全特徴量 + 3新規特徴量', Model='LogisticRegression')


# P4: random forest on the three raw EXT_SOURCE columns, taken from the
# baseline frames (app_train / app_test) rather than the engineered ones.
ext_source_base_cols = [f'EXT_SOURCE_{i}' for i in (1, 2, 3)]

X_fe_4 = app_train[ext_source_base_cols].values
X_test_fe_4 = app_test[ext_source_base_cols].values

# Refit the shared scaler on these three columns, then apply it to test.
X_fe_4_scaled = scaler.fit_transform(X_fe_4)
X_test_fe_4_scaled = scaler.transform(X_test_fe_4)

X_train_val_4, X_val_4, y_train_val_4, y_val_4 = train_test_split(
    X_fe_4_scaled,
    y,
    random_state=42,
    test_size=0.1,
)

model_rf_4 = RandomForestClassifier(
    max_depth=5,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
model_rf_4.fit(X_train_val_4, y_train_val_4)

val_proba_4 = model_rf_4.predict_proba(X_val_4)
y_pred_4 = val_proba_4[:, 1]
auc_4 = roc_auc_score(y_val_4, y_pred_4)
fe_results['P4_EXT_Only_Base_RF'] = dict(AUC=auc_4, Features='EXT_SOURCEのみ (ベースライン)', Model='RandomForest')


# One row per experiment, best validation AUC first.
results_by_experiment = pd.DataFrame(fe_results).T
report_columns = ['AUC', 'Features', 'Model']
df_fe_results = results_by_experiment[report_columns].sort_values(by='AUC', ascending=False)

print("\n--- 特徴量エンジニアリングとモデルの性能比較（5パターン以上） ---")
print(df_fe_results)


--- 特徴量エンジニアリングとモデルの性能比較（5パターン以上） ---
                          AUC                           Features  \
P1_All_FE_RF          0.72416                      全特徴量 + 3新規特徴量   
P2_EXT_Only_FE_RF    0.716702  EXT_SOURCE, CREDIT_INCOME_RATIOのみ   
P4_EXT_Only_Base_RF  0.714654              EXT_SOURCEのみ (ベースライン)   
P3_All_FE_LogReg     0.686181                      全特徴量 + 3新規特徴量   
P0_Baseline_LogReg   0.674899                   全特徴量 (ベースライン前処理)   

                                  Model  
P1_All_FE_RF               RandomForest  
P2_EXT_Only_FE_RF          RandomForest  
P4_EXT_Only_Base_RF        RandomForest  
P3_All_FE_LogReg     LogisticRegression  
P0_Baseline_LogReg   LogisticRegression  


In [7]:
# The final submission uses the P1 random forest together with the test
# matrix that was scaled for that experiment.
final_model = model_rf_1
X_test_final = X_test_fe_1_scaled

final_proba = final_model.predict_proba(X_test_final)
y_pred_proba_final = final_proba[:, 1]

# Pair every test ID with its predicted score and write the CSV.
submission_final = pd.DataFrame({'SK_ID_CURR': test_ids}).assign(
    TARGET=y_pred_proba_final
)
submission_final.to_csv('submission_final.csv', index=False)

print("\n--- 最終提出ファイル ---")
print(f"最終モデル: ランダムフォレスト (P1)")
print(f"最終提出ファイル (submission_final.csv) を作成しました。")
print(submission_final.head())


--- 最終提出ファイル ---
最終モデル: ランダムフォレスト (P1)
最終提出ファイル (submission_final.csv) を作成しました。
   SK_ID_CURR    TARGET
0      100001  0.077532
1      100005  0.091831
2      100013  0.063223
3      100028  0.048404
4      100038  0.116927
