In [1]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, Imputer, OneHotEncoder, PolynomialFeatures, OrdinalEncoder
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from time import time
from sys import argv


In [2]:
def in_ipynb():
    """Report whether this code is executing inside a Jupyter notebook kernel.

    Returns:
        bool: True when ``get_ipython()`` is available and the active shell is
        ipykernel's ``ZMQInteractiveShell``; False when ``get_ipython`` is not
        defined or the shell is of any other type.
    """
    notebook_shell = "<class 'ipykernel.zmqshell.ZMQInteractiveShell'>"
    try:
        shell = get_ipython()
    except NameError:
        # The name get_ipython is not defined in this interpreter.
        return False
    return str(type(shell)) == notebook_shell

# Resolve the training-data path. A notebook has no command line, so a default
# location is used there; as a script the path is the first CLI argument.
if in_ipynb():
    args = [None, "../data/processed/app_train_processed.csv"]
else:
    # sys.argv is a list, not a callable: the previous `argv()` raised
    # TypeError every time this file was run as a script.
    args = argv

# Fail with a readable message instead of an IndexError when no path is given.
if len(args) < 2:
    raise SystemExit("usage: <script> <path to processed training csv>")

training_data_loc = args[1]

# import data
app_train = pd.read_csv(training_data_loc)

In [3]:
# Show the number of rows, then preview the first five of them.
print(app_train.shape[0])
app_train.head(5)

219567


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,AGE,YEARS_EMPLOYED_anom,YEARS_EMPLOYED,CREDIT_INCOME_PERCENT,ANNUITY_INCOME_PERCENT,CREDIT_TERM,YEARS_EMPLOYED_PERCENT
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,1.0,25,0,1.7452,2.007889,0.121978,0.060749,0.069808
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,45,0,3.2548,4.79075,0.132217,0.027598,0.072329
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,52,0,0.6164,2.0,0.1,0.05,0.011854
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,,,,52,0,8.326,2.316167,0.2199,0.094941,0.160115
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,54,0,8.3233,4.222222,0.179963,0.042623,0.154135


In [4]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
df_train = app_train.copy(deep=True)

In [None]:
# Drop the ID column. Guarded so that re-running this cell does not raise a
# KeyError once the column has already been removed from df_train.
if "SK_ID_CURR" in df_train.columns:
    df_train = df_train.drop("SK_ID_CURR", axis=1)

# define x and y
y = df_train["TARGET"]
X = df_train.drop("TARGET", axis=1)

# Split the columns by dtype: object/bool columns are treated as categorical,
# everything else as numerical.
is_categorical = (X.dtypes == object) | (X.dtypes == bool)
num_features = X.dtypes[~is_categorical].index.tolist()
cat_features = X.dtypes[is_categorical].index.tolist()

# Fill missing categorical values with each column's most frequent value, then
# one-hot encode the categorical columns.
# NOTE(review): the imputer is fit on every row before the train/test split,
# so held-out rows influence the fill values — confirm this is acceptable.
si_X_gs = SimpleImputer(strategy='most_frequent')
if cat_features:  # fit_transform rejects an empty column selection
    X[cat_features] = si_X_gs.fit_transform(X[cat_features])
X = pd.get_dummies(X)

# # Identify cols with nulls for imputation
# cols_w_nulls = X.isnull().sum()[X.isnull().sum() > 0].index.tolist()
# cat_cols_w_nulls = [col for col in cat_features if col in cols_w_nulls]
# num_cols_w_nulls = [col for col in num_features if col in cols_w_nulls]

# Columns created by the one-hot encoding. Built in X's own column order: the
# previous set difference had an arbitrary order that could change between
# kernel sessions, which changed the column order of the transformed matrix.
numeric_names = set(num_features)
ohe_cols = [col for col in X.columns if col not in numeric_names]

# Numerical columns: median imputation followed by scaling to [0, 1].
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())])

# preprocessor: transform the numerical columns, pass the dummies through as-is
preprocessor = ColumnTransformer(
    transformers=[
        ("numericals", numeric_transformer, num_features),
        ("ohe_passthrough", "passthrough", ohe_cols),
    ])

In [None]:
# Hold out 30% of the rows for testing, stratified on y so both parts keep the
# same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Logistic Regression
lr_model = Pipeline([("preprocessor", preprocessor),
                     ("model", LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42))])

# Random Forest (seeded, like the logistic regression, so reruns are repeatable)
rf_model = Pipeline([("preprocessor", preprocessor),
                     ("model", RandomForestClassifier(class_weight="balanced", n_estimators=100, n_jobs=-1,
                                                      random_state=42))])

# XGBoost
# scale_pos_weight balances the classes when set to (#negatives / #positives).
# The previous value, 1 - y.mean(), is the negative *fraction* (< 1) and
# therefore down-weighted the positive class instead of balancing it.
# Assumes y is coded 0/1 so that y.mean() is the positive rate — TODO confirm.
pos_rate = y.mean()
xgb_pos_weight = (1 - pos_rate) / pos_rate

xgb_model = Pipeline([("preprocessor", preprocessor),
                      ("model", XGBClassifier(scale_pos_weight=xgb_pos_weight, n_jobs=-1))])

In [None]:
# Time a most-frequent imputation of every categorical column on a row sample.
# The sample is drawn from df_train, not X: X has been through pd.get_dummies,
# so the original categorical columns are gone and X_sample[col] raised KeyError.
# Casting to object keeps bool columns acceptable to SimpleImputer.
sample_size = min(100000, len(df_train))  # sample() fails if asked for more rows than exist
X_sample = df_train[cat_features].astype(object).sample(sample_size, random_state=42)

# X_sample = X_sample.append(pd.Series([np.nan for c in X_sample.columns], index=X_sample.columns), ignore_index=True)

times = []
for col in cat_features:
    si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    start = time()
    si.fit_transform(X_sample[col].values.reshape(-1, 1))
    end = time()
    times.append(end - start)

# Column 0 holds the feature name, column 1 the elapsed time in minutes.
df_times = pd.concat([pd.Series(cat_features), pd.Series(times) / 60], axis=1)
df_times.sort_values(1, ascending=False)

In [None]:
start = time()

# Search the regularisation strength C of the logistic regression with 5-fold CV.
# NOTE(review): `iid` was deprecated and later removed from GridSearchCV; it is
# kept because this file's imports appear to target an older scikit-learn
# release — confirm the installed version before dropping it.
gs = GridSearchCV(lr_model, {"model__C": [1, 1.3, 1.5]}, n_jobs=-1, cv=5, scoring="accuracy", iid=False)
gs.fit(X_train, y_train)

print(gs.best_params_)
print(gs.best_score_)

# Apply the winning parameters to the pipeline and show the resulting model.
# (The previous `lr_model.get_params("model")` passed the string as `deep` and
# threw the result away, so nothing was ever displayed.)
lr_model.set_params(**gs.best_params_)
print(lr_model.named_steps["model"])

lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# A bare expression in the middle of a cell is not displayed, so the accuracy
# was computed and silently discarded; print it explicitly.
print(accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

end = time()
elapsed = end - start
print(elapsed)