In [None]:
import pandas as pd

In [None]:
data_train = pd.read_csv('../../data/credit_scoring_train.csv', index_col='client_id')
data_test = pd.read_csv('../../data/credit_scoring_test.csv', index_col='client_id')

In [None]:
data_train.shape

In [None]:
data_train.head()

In [None]:
data_train.info()

In [None]:
data_train.dropna().shape

In [7]:
data_test.dropna().shape

(60116, 9)

In [8]:
data_train['Income'].fillna(data_train['Income'].median(), inplace=True)
data_train['NumDependents'].fillna(data_train['NumDependents'].median(), inplace=True)

In [9]:
data_test['Income'].fillna(data_train['Income'].median(), inplace=True)
data_test['NumDependents'].fillna(data_test['NumDependents'].median(), inplace=True)

In [10]:
X_train = data_train.drop('Delinquent90', axis=1).values
y_train = data_train['Delinquent90'].values
X_test = data_test.values

In [11]:
import numpy as np
np.bincount(y_train)

array([69987,  5013], dtype=int64)

In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [15]:
X_train_part, X_ho_part, y_train_part, y_ho_part = train_test_split(X_train, y_train, test_size = 0.3, random_state=17)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [17]:
from sklearn.metrics import roc_auc_score

In [18]:
logit = LogisticRegression(random_state=17,n_jobs=-1).fit(X_train_part, y_train_part)

In [19]:
first_logit_pred = logit.predict_proba(X_ho_part)[:,1]

In [20]:
roc_auc_score(y_ho_part, first_logit_pred)

0.69003170811692538

In [21]:
scaler = StandardScaler().fit(X_train_part, y_train_part)

In [22]:
X_train_part_scaled = scaler.transform(X_train_part)
X_train_ho_scaled = scaler.transform(X_ho_part)

In [23]:
logit2 = LogisticRegression(random_state=17, n_jobs=-1)

In [24]:
logit2.fit(X_train_part_scaled, y_train_part)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
second_logit_pred = logit2.predict_proba(X_ho_part)[:,1]

  np.exp(prob, prob)


In [26]:
roc_auc_score(y_ho_part, first_logit_pred)

0.69003170811692538

In [27]:
gbm = GradientBoostingClassifier(random_state=17).fit(X_train_part, y_train_part)

In [28]:
first_gbm_pred = gbm.predict_proba(X_ho_part)[:,1]

In [29]:
roc_auc_score(y_ho_part,first_gbm_pred)

0.84142218296116522

In [30]:
from sklearn.model_selection import RandomizedSearchCV
param = {'max_depth': range(2,7), 'min_samples_leaf': range(10,14)}

In [31]:
gbm_grid = RandomizedSearchCV(estimator=gbm, param_distributions=param, n_iter=20, scoring='roc_auc',n_jobs=-1, verbose=True)

In [32]:
%%time
gbm_grid.fit(X_train_part, y_train_part)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  5.1min finished


Wall time: 5min 19s


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=17,
              subsample=1.0, verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'max_depth': range(2, 7), 'min_samples_leaf': range(10, 14)},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='roc_auc', verbose=True)

In [48]:
gbm_grid.best_score_, gbm_grid.best_params_

(0.84615311413426397, {'max_depth': 4, 'min_samples_leaf': 12})

In [36]:
from sklearn.preprocessing import PolynomialFeatures

In [39]:
logit_pipe = Pipeline([('scaler', StandardScaler()), 
                        ('poly', PolynomialFeatures(degree=2)),
                        ('logit', LogisticRegression(random_state=17, n_jobs=-1))])

In [43]:
logit_params = {'logit__C': np.logspace(-2,2,5)}
logit_grid = GridSearchCV(logit_pipe, logit_params, n_jobs = -1)

In [44]:
logit_grid.fit(X_train_part, y_train_part)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('logit', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'logit__C': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [46]:
logit_grid.best_score_, logit_grid.best_params_

(0.93403809523809522, {'logit__C': 10.0})

In [None]:
???
logit_pipe = Pipeline([('scaler', StandardScaler()), 
                        ('poly', PolynomialFeatures(degree=2)),
                        ('logit', LogisticRegression(random_state=17, n_jobs=-1))])

logit_params = {'logit__C': np.logspace(-2,2,5)}
logit_grid = GridSearchCV(logit_pipe2, logit_params, n_jobs = -1, verbose=True).fit(X_train_part, y_train_part)

In [45]:
roc_auc_score(y_ho_part, logit_grid.predict_proba(X_ho_part)[:,1])

0.75812755057277348

In [50]:
gbm_grid.best_estimator__.feature_importance_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator__'

In [49]:
a = gbm_grid.predict_proba(X_ho_part)[:10]
a.shape

(10, 2)

In [13]:
a

NameError: name 'a' is not defined

In [None]:
poly= PolynomialFeatures(degree=2)
poly