In [1]:
# 03_feature_engineering.ipynb

import pandas as pd
import numpy as np

# -------------------------------------------------------------------
# 1) ANA TRAIN VERİSİNİ YÜKLE
# -------------------------------------------------------------------
train = pd.read_csv("../data/raw/application_train.csv")

print("Initial shape:", train.shape)

# -------------------------------------------------------------------
# 2) DAYS KOLONLARININ DÖNÜŞÜMÜ
#    - Negatif günleri pozitif yap
#    - Yaşı yıl cinsine çevir
#    - DAYS_EMPLOYED 365243 -> NaN (özel kod)
# -------------------------------------------------------------------
days_cols = [col for col in train.columns if "DAYS_" in col]

for col in days_cols:
    train[col] = train[col].abs()

# Özel durum: 365243 = bilinmeyen / çalışmıyor
if "DAYS_EMPLOYED" in train.columns:
    train["DAYS_EMPLOYED"].replace(365243, np.nan, inplace=True)

# Yaş (yıl cinsinden)
if "DAYS_BIRTH" in train.columns:
    train["AGE"] = train["DAYS_BIRTH"] / 365

# -------------------------------------------------------------------
# 3) LOG TRANSFORM (Aşırı büyük değerleri bastırmak için)
# -------------------------------------------------------------------
log_cols = [
    "AMT_INCOME_TOTAL",
    "AMT_CREDIT",
    "AMT_ANNUITY",
    "AMT_GOODS_PRICE",
]

for col in log_cols:
    if col in train.columns:
        train[col + "_LOG"] = np.log1p(train[col])

# -------------------------------------------------------------------
# 4) FİNANSAL ORANLAR (RATIOS)
# -------------------------------------------------------------------
# Kredi / Gelir Oranı
if set(["AMT_CREDIT", "AMT_INCOME_TOTAL"]).issubset(train.columns):
    train["DEBT_INCOME_RATIO"] = train["AMT_CREDIT"] / (train["AMT_INCOME_TOTAL"] + 1)

# Kredi / Taksit Oranı
if set(["AMT_CREDIT", "AMT_ANNUITY"]).issubset(train.columns):
    train["CREDIT_ANNUITY_RATIO"] = train["AMT_CREDIT"] / (train["AMT_ANNUITY"] + 1)

# Kişi başı gelir
if set(["AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS"]).issubset(train.columns):
    train["INCOME_PER_PERSON"] = train["AMT_INCOME_TOTAL"] / (train["CNT_FAM_MEMBERS"] + 1)

# Ödeme yükü (taksit / kredi)
if set(["AMT_ANNUITY", "AMT_CREDIT"]).issubset(train.columns):
    train["PAYMENT_RATE"] = train["AMT_ANNUITY"] / (train["AMT_CREDIT"] + 1)

# -------------------------------------------------------------------
# 5) EXTERNAL SCORE FE (ORTALAMA / MIN / MAX)
# -------------------------------------------------------------------
ext_cols = [c for c in ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"] if c in train.columns]

if len(ext_cols) > 0:
    train["EXT_SOURCE_MEAN"] = train[ext_cols].mean(axis=1)
    train["EXT_SOURCE_MIN"] = train[ext_cols].min(axis=1)
    train["EXT_SOURCE_MAX"] = train[ext_cols].max(axis=1)

# -------------------------------------------------------------------
# YARDIMCI FONKSİYON: AGGREGATION YARATIP MERGE ETME
# -------------------------------------------------------------------
def agg_and_merge(df, group_key, agg_dict, prefix, base_df):
    """
    df       : aggregate yapılacak dataframe
    group_key: grup anahtarı (ör: 'SK_ID_CURR')
    agg_dict : {'kolon': ['mean', 'max', ...]}
    prefix   : yeni kolonlara eklenecek prefix
    base_df  : train datası (merge edilecek ana df)
    """
    agg = df.groupby(group_key).agg(agg_dict)
    agg.columns = [prefix + "_".join(col).upper() for col in agg.columns]
    agg.reset_index(inplace=True)
    base_df = base_df.merge(agg, on=group_key, how="left")
    return base_df

# -------------------------------------------------------------------
# 6) BUREAU (Kredi geçmişi) AGGREGATION
# -------------------------------------------------------------------
try:
    bureau = pd.read_csv("../data/raw/bureau.csv")
    print("bureau shape:", bureau.shape)

    bureau_agg_dict = {
        "AMT_CREDIT_SUM": ["mean", "max"],
        "AMT_CREDIT_SUM_DEBT": ["mean", "max"],
        "AMT_CREDIT_SUM_OVERDUE": ["mean", "max"],
        "CREDIT_DAY_OVERDUE": ["mean", "max"],
        "CNT_CREDIT_PROLONG": ["mean", "sum"],
    }

    # Sadece df'de olan kolonları kullan
    bureau_agg_dict = {
        k: v for k, v in bureau_agg_dict.items() if k in bureau.columns
    }

    if len(bureau_agg_dict) > 0:
        train = agg_and_merge(
            bureau,
            group_key="SK_ID_CURR",
            agg_dict=bureau_agg_dict,
            prefix="BUREAU_",
            base_df=train,
        )

except FileNotFoundError:
    print("bureau.csv bulunamadı, bu bölüm atlandı.")

# -------------------------------------------------------------------
# 7) PREVIOUS_APPLICATION (Geçmiş kredi başvuruları) AGGREGATION
# -------------------------------------------------------------------
try:
    prev = pd.read_csv("../data/raw/previous_application.csv")
    print("previous_application shape:", prev.shape)

    prev_agg_dict = {
        "AMT_CREDIT": ["mean", "max"],
        "AMT_ANNUITY": ["mean"],
        "HOUR_APPR_PROCESS_START": ["mean"],
        "RATE_DOWN_PAYMENT": ["mean"],
        "CNT_PAYMENT": ["mean"],
    }

    prev_agg_dict = {
        k: v for k, v in prev_agg_dict.items() if k in prev.columns
    }

    if len(prev_agg_dict) > 0:
        train = agg_and_merge(
            prev,
            group_key="SK_ID_CURR",
            agg_dict=prev_agg_dict,
            prefix="PREV_",
            base_df=train,
        )

except FileNotFoundError:
    print("previous_application.csv bulunamadı, bu bölüm atlandı.")

# -------------------------------------------------------------------
# 8) INSTALLMENTS_PAYMENTS (Taksit ödeme davranışları) AGGREGATION
# -------------------------------------------------------------------
try:
    install = pd.read_csv("../data/raw/installments_payments.csv")
    print("installments_payments shape:", install.shape)

    # Yeni oran ve fark kolonları
    if set(["AMT_PAYMENT", "AMT_INSTALMENT"]).issubset(install.columns):
        install["PAYMENT_DIFF"] = install["AMT_PAYMENT"] - install["AMT_INSTALMENT"]
        install["PAYMENT_RATIO"] = install["AMT_PAYMENT"] / (install["AMT_INSTALMENT"] + 1)

    inst_agg_dict = {
        "PAYMENT_DIFF": ["mean"],
        "PAYMENT_RATIO": ["mean"],
        "DAYS_INSTALMENT": ["mean"],
        "DAYS_ENTRY_PAYMENT": ["mean"],
        "NUM_INSTALMENT_NUMBER": ["max"],
    }

    inst_agg_dict = {
        k: v for k, v in inst_agg_dict.items() if k in install.columns
    }

    if len(inst_agg_dict) > 0:
        train = agg_and_merge(
            install,
            group_key="SK_ID_CURR",
            agg_dict=inst_agg_dict,
            prefix="INST_",
            base_df=train,
        )

except FileNotFoundError:
    print("installments_payments.csv bulunamadı, bu bölüm atlandı.")

# -------------------------------------------------------------------
# 9) CREDIT_CARD_BALANCE (Kredi kartı davranışı) AGGREGATION
# -------------------------------------------------------------------
try:
    ccb = pd.read_csv("../data/raw/credit_card_balance.csv")
    print("credit_card_balance shape:", ccb.shape)

    ccb_agg_dict = {
        "AMT_BALANCE": ["mean", "max"],
        "AMT_CREDIT_LIMIT_ACTUAL": ["mean"],
        "SK_DPD": ["mean", "max"],  # gecikme günleri
        "SK_DPD_DEF": ["mean", "max"],
    }

    ccb_agg_dict = {
        k: v for k, v in ccb_agg_dict.items() if k in ccb.columns
    }

    if len(ccb_agg_dict) > 0:
        train = agg_and_merge(
            ccb,
            group_key="SK_ID_CURR",
            agg_dict=ccb_agg_dict,
            prefix="CCB_",
            base_df=train,
        )

except FileNotFoundError:
    print("credit_card_balance.csv bulunamadı, bu bölüm atlandı.")

# -------------------------------------------------------------------
# 10) POS_CASH_BALANCE (POS / nakit krediler) AGGREGATION – İSTEĞE BAĞLI
# -------------------------------------------------------------------
try:
    pos = pd.read_csv("../data/raw/POS_CASH_balance.csv")
    print("POS_CASH_balance shape:", pos.shape)

    pos_agg_dict = {
        "MONTHS_BALANCE": ["mean", "min"],
        "CNT_INSTALMENT": ["mean", "max"],
        "CNT_INSTALMENT_FUTURE": ["mean", "max"],
        "SK_DPD": ["mean", "max"],
        "SK_DPD_DEF": ["mean", "max"],
    }

    pos_agg_dict = {
        k: v for k, v in pos_agg_dict.items() if k in pos.columns
    }

    if len(pos_agg_dict) > 0:
        train = agg_and_merge(
            pos,
            group_key="SK_ID_CURR",
            agg_dict=pos_agg_dict,
            prefix="POS_",
            base_df=train,
        )

except FileNotFoundError:
    print("POS_CASH_balance.csv bulunamadı, bu bölüm atlandı.")

# -------------------------------------------------------------------
# 11) SON TEMİZLİK: SAYISAL KOLONLARIN NAs'LERİNİ MEDIAN İLE DOLDUR
#     (İstersen bu kısmı 04. notebook'ta da yapabilirsin, ama burada
#      yaparsan train_fe model için direkt hazır olur.)
# -------------------------------------------------------------------
numeric_cols = train.select_dtypes(include=[np.number]).columns
train[numeric_cols] = train[numeric_cols].fillna(train[numeric_cols].median())

print("Final shape after FE:", train.shape)

# -------------------------------------------------------------------
# 12) PROCESSED DOSYAYI KAYDET
# -------------------------------------------------------------------
train.to_csv("../data/processed/train_fe.csv", index=False)
print("Processed FE dataset saved to ../data/processed/train_fe.csv")


Initial shape: (307511, 122)
bureau.csv bulunamadı, bu bölüm atlandı.
previous_application.csv bulunamadı, bu bölüm atlandı.
installments_payments.csv bulunamadı, bu bölüm atlandı.
credit_card_balance.csv bulunamadı, bu bölüm atlandı.
POS_CASH_balance.csv bulunamadı, bu bölüm atlandı.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["DAYS_EMPLOYED"].replace(365243, np.nan, inplace=True)


Final shape after FE: (307511, 134)
Processed FE dataset saved to ../data/processed/train_fe.csv
