In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Mount Google Drive and switch into the competition folder so that the
# relative "input/..." paths below resolve. `%cd` is an IPython magic, so this
# section only runs inside a notebook/IPython session.
drive.mount("gdrive")
%cd gdrive/MyDrive/GCI/competition2

# Raw competition tables; sample_sub is copied later to build the submission.
train = pd.read_csv("input/train.csv")
test = pd.read_csv("input/test.csv")
sample_sub = pd.read_csv("input/sample_submission.csv")

def remove_outliers(train, test):
    """Blank out a sentinel value and drop outlier rows from the training set.

    In both frames, ``DAYS_EMPLOYED == 365243`` is replaced with NaN
    (presumably a placeholder value in the raw data; confirm against the data
    dictionary). Training rows are then kept only when
    ``AMT_INCOME_TOTAL < 1e6`` and ``CODE_GENDER != 'XNA'``. Rows of ``test``
    are never dropped.

    Note:
        ``DAYS_EMPLOYED`` is overwritten on the frames that are passed in, so
        the caller's objects are modified as well.

    Args:
        train: Training frame with ``DAYS_EMPLOYED``, ``AMT_INCOME_TOTAL`` and
            ``CODE_GENDER`` columns.
        test: Test frame with a ``DAYS_EMPLOYED`` column.

    Returns:
        Tuple ``(train, test)``: the filtered training rows and the test frame.
    """
    for frame in (train, test):
        # Assign the result back to the column. Calling
        # ``frame[col].replace(..., inplace=True)`` operates on an intermediate
        # selection (chained assignment) and is not guaranteed to update
        # ``frame`` itself.
        frame["DAYS_EMPLOYED"] = frame["DAYS_EMPLOYED"].replace(365243, np.nan)

    # A missing income compares False and is therefore dropped; a missing
    # gender compares True and is kept.
    keep = (train["AMT_INCOME_TOTAL"] < 1e+6) & (train["CODE_GENDER"] != "XNA")
    return train[keep], test

# Cleaned frames; train_1 can have fewer rows than train, test_1 keeps every row.
train_1, test_1 = remove_outliers(train, test)

def one_hot_encoding(df, prefix):
    """Return indicator columns for ``df``, omitting the first category.

    Args:
        df: Data to encode (the caller below passes a single column).
        prefix: Prefix placed in front of every generated column name.

    Returns:
        The frame produced by ``pd.get_dummies`` with ``drop_first=True``.
    """
    return pd.get_dummies(df, prefix=prefix, drop_first=True)

# Categorical columns that are expanded into indicator columns below.
one_hot_columns = [
    "NAME_CONTRACT_TYPE",
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "NAME_TYPE_SUITE",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "OCCUPATION_TYPE",
    "ORGANIZATION_TYPE",
]

# Train rows first, test rows after; the encoding is computed over both at once.
df = pd.concat([train_1, test_1], axis=0)

for column in one_hot_columns:
    # pop() removes the raw column from df and returns it for encoding; the
    # indicator columns are then appended on the right.
    one_hot_df = one_hot_encoding(df.pop(column), column)
    df = pd.concat([df, one_hot_df], axis=1)


# Ratio / difference features built from the raw amount and day columns.
df['CREDIT_ANNUITY'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
df['CREDIT_GOODS_PRICE'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
df['INCOME_TOTAL_ANNUITY'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
df['INCOME_TOTAL_CREDIT'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
df['DAYS_BIRTH_365_OWN_CAR_AGE'] = (df['DAYS_BIRTH'] / 365) - df['OWN_CAR_AGE']

# Treat a value of 0 as missing. The result is assigned back to the column:
# `df[col].replace(..., inplace=True)` works on an intermediate selection
# (chained assignment) and is not guaranteed to update df.
df['DAYS_LAST_PHONE_CHANGE'] = df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan)

# Shift the external scores by 0.1 before taking a ratio.
# NOTE(review): x3 is not used anywhere in the code visible here.
x1 = df['EXT_SOURCE_1'] + 0.1
x2 = df['EXT_SOURCE_2'] + 0.1
x3 = df['EXT_SOURCE_3'] + 0.1

df["x2/x1"] = x2/x1

df.reset_index(drop=True, inplace=True)

# df was built as concat([train_1, test_1]), so the first len(train_1) rows are
# the training rows. Deriving the boundary from the data (instead of the
# hard-coded 232559 - 61500) keeps the split correct if the input files or the
# outlier filter change.
n_train = len(train_1)
train_2 = df.iloc[:n_train]
test_2 = df.iloc[n_train:]

# Feature matrices exclude the label and the row identifier.
x_train = train_2.drop(columns=['TARGET', 'SK_ID_CURR'])
y_train = train_2['TARGET']
x_test = test_2.drop(columns=["TARGET", 'SK_ID_CURR'])

# The CV loop below indexes with integer position arrays, so use plain arrays.
x_train = x_train.values
y_train = y_train.values
x_test = x_test.values

# Keyword arguments unpacked into XGBClassifier in the CV loop below.
# NOTE(review): the variable name and the long decimals suggest these came from
# an automated hyperparameter search; the search itself is not in this file.
searched_params = {'objective': 'binary:logistic',
        "eval_metric": "auc",
        'max_depth': 3,
        'eta': 0.040098084171830486, 'n_estimators': 1984,
        "booster": "gbtree",
        'alpha': 8.396710484747206,
        'lambda': 0.015796058058019646,
        'min_child_weight': 1}

# XGBoost: stratified 5-fold CV. Each fold's model is kept, and its
# predictions on the held-out rows are written into oof_preds_xgb.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv = list(folds.split(x_train, y_train))
oof_preds_xgb = np.zeros(len(x_train))
models = []

for i, (idx_train, idx_valid) in enumerate(cv):
    clf = XGBClassifier(random_state=0, **searched_params)
    clf.fit(x_train[idx_train], y_train[idx_train])
    models.append(clf)

    # Positive-class probability for the rows this model did not train on.
    oof_preds_xgb[idx_valid] = clf.predict_proba(x_train[idx_valid])[:, 1]
    fold_auc = roc_auc_score(y_train[idx_valid], oof_preds_xgb[idx_valid])
    print('Fold %2d AUC : %.6f' % (i + 1, fold_auc))

score = roc_auc_score(y_train, oof_preds_xgb)
print("Full AUC score {:.6f}".format(score))

# Test prediction = mean of the fold models' positive-class probabilities.
pred_list = [model.predict_proba(x_test)[:, 1] for model in models]
pred_xgb = np.mean(pred_list, axis=0)


# same_delta_count: for each row, the number of OTHER rows that have exactly
# the same three day-differences (birth/registration/ID-publish).
#
# The group size is computed once per distinct triple instead of comparing
# every row against every other row, which is quadratic in the row count.
delta_cols = ["delta_days_1", "delta_days_2", "delta_days_3"]
deltas = pd.DataFrame({
    "delta_days_1": df["DAYS_BIRTH"] - df["DAYS_REGISTRATION"],
    "delta_days_2": df["DAYS_BIRTH"] - df["DAYS_ID_PUBLISH"],
    "delta_days_3": df["DAYS_REGISTRATION"] - df["DAYS_ID_PUBLISH"],
})

# A row with a missing difference never compares equal to anything (not even
# itself), so its match count stays 0 and the final value is -1.
complete = deltas.notna().all(axis=1).to_numpy()
match_counts = np.zeros(len(df), dtype="int64")
match_counts[complete] = (
    deltas[complete]
    .groupby(delta_cols, sort=False)["delta_days_1"]
    .transform("size")  # transform keeps the input row order
    .to_numpy()
)

# Subtract 1 so that a row does not count itself.
df["same_delta_count"] = match_counts - 1

# Out-of-fold predictions for the train rows followed by the averaged test
# predictions; the combined length has to equal len(df) for this assignment.
df["xgb"] = np.concatenate([oof_preds_xgb, pred_xgb])

def calculate_rolling_average(series, window_size=500):
    """Mean of up to ``window_size`` values that precede each position.

    The series is shifted by one so the current value is excluded from its own
    window. With ``min_periods=1`` a shorter window is still averaged; the
    first position has no preceding value and comes out as NaN.

    Args:
        series: Values to average, in row order.
        window_size: Maximum number of preceding values per window.

    Returns:
        Series of rolling means aligned with ``series``.
    """
    previous = series.shift()
    windows = previous.rolling(window=window_size, min_periods=1)
    return windows.mean()

# Rolling mean of the preceding rows' XGBoost predictions (row order of df).
df["xgb_500_avg"] = calculate_rolling_average(df["xgb"])

# Only the rolling feature is kept; the raw prediction column is discarded.
df = df.drop(columns="xgb").reset_index(drop=True)

# Write the engineered frame to disk and load it straight back.
df.to_csv("df_csv.csv", index=False)
df = pd.read_csv("df_csv.csv")

# Split the reloaded frame back into train and test rows. The row order is
# still train_1 followed by test_1, so the boundary is len(train_1) rather than
# the hard-coded 232559 - 61500. The split was previously written twice in a
# row; once is enough.
n_train = len(train_1)
train_2 = df.iloc[:n_train]
test_2 = df.iloc[n_train:]

# Feature matrices exclude the label and the row identifier.
x_train = train_2.drop(columns=['TARGET', 'SK_ID_CURR'])
y_train = train_2['TARGET']
x_test = test_2.drop(columns=["TARGET", 'SK_ID_CURR'])

# The CV loop below indexes with integer position arrays, so use plain arrays.
x_train = x_train.values
y_train = y_train.values
x_test = x_test.values


# Keyword arguments unpacked into LGBMClassifier in the CV loop below. This
# rebinds searched_params, replacing the XGBoost settings defined earlier.
# NOTE(review): the variable name and the long decimals suggest these came from
# an automated hyperparameter search; the search itself is not in this file.
searched_params = {
    "objective": "binary",
    "metric": "auc",
    "num_leaves": 20,
    'num_iterations': 2245,
    'learning_rate': 0.01091379083032654,
    "boosting": "goss",
    'reg_alpha': 3.3284056076797484, 'reg_lambda': 95.72759363032722,
    "max_bin": 851,
    'feature_fraction': 0.66,
    'min_data_in_leaf': 58,
    "is_unbalance": True
}

# LightGBM: stratified 5-fold CV with the same fold settings as the XGBoost
# run. Each fold's model is kept, and its predictions on the held-out rows are
# written into oof_preds_lgbm.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv = list(folds.split(x_train, y_train))
oof_preds_lgbm = np.zeros(len(x_train))
models = []

for i, (idx_train, idx_valid) in enumerate(cv):
    clf = LGBMClassifier(random_state=0, verbose=-1, **searched_params)
    clf.fit(x_train[idx_train], y_train[idx_train])
    models.append(clf)

    # Positive-class probability for the rows this model did not train on.
    oof_preds_lgbm[idx_valid] = clf.predict_proba(x_train[idx_valid])[:, 1]
    fold_auc = roc_auc_score(y_train[idx_valid], oof_preds_lgbm[idx_valid])
    print('Fold %2d AUC : %.6f' % (i + 1, fold_auc))

score = roc_auc_score(y_train, oof_preds_lgbm)
print("Full AUC score {:.6f}".format(score))


# Test prediction = mean of the fold models' positive-class probabilities.
pred_list = [model.predict_proba(x_test)[:, 1] for model in models]
pred_lgbm = np.mean(pred_list, axis=0)

# Submission = copy of the sample file with TARGET set to the averaged
# LightGBM predictions; assign() returns a new frame, so sample_sub is untouched.
submission = sample_sub.assign(TARGET=pred_lgbm)

submission.to_csv("submission21.csv", index=False)