In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, roc_auc_score

In [24]:
# Load the raw training and test application tables.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/application_train.csv", "../data/application_test.csv")
)

In [11]:
# Columns from position 2 onward are used as features; 'TARGET' is the label.
df_x = df_train.iloc[:, 2:]
df_y = df_train['TARGET']
# Fixed seed so the hold-out split (and every score derived from it) is
# reproducible across kernel restarts.
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(
    df_x, df_y, random_state=42
)

In [49]:
# Keep only the columns whose share of missing values in the training split is below 0.3,
# and apply the same column selection to the hold-out split.
FeatureByMissing = df_x_train.columns[df_x_train.isnull().mean() < 0.3]
df_x_train, df_x_test = df_x_train[FeatureByMissing], df_x_test[FeatureByMissing]

In [50]:
df_x_train = pd.get_dummies(df_x_train) 
# One-hot encode with pandas' built-in encoder first, then fill in the missing values afterwards.
# Drawback: for a multi-category variable, several of its dummy variables may end up equal to 1.

In [51]:
# Remember the column labels of the encoded training frame.
XTrainColumn = df_x_train.columns

In [54]:
# Switch from the DataFrame to its underlying array for the scikit-learn steps that follow.
x_train = df_x_train.values

In [None]:
# Outliers (placeholder: no outlier handling is implemented)
pass

In [62]:
# Missing values: fit a mean imputer on the training array, then apply it to that same array.
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
x_train = imputer.fit(x_train).transform(x_train)

In [64]:
# Standardization: fit the scaler on the training array, then rescale that same array.
ss = StandardScaler().fit(x_train)
x_train = ss.transform(x_train)

In [34]:
class DataHandle:
    """Fit-once preprocessing shared by a training frame and later frames.

    ``fit`` learns everything from the training data:

    * which columns to keep (missing-value ratio below ``missing_rate``),
    * the one-hot encoded column layout (``pd.get_dummies``),
    * a mean imputer, a standard scaler and an L1 logistic-regression based
      feature selector.

    ``transform`` then applies exactly those fitted steps to another frame.

    Attributes set by ``fit``:
        feature_by_missing: column labels kept by the missing-ratio filter.
        x_before: filtered, one-hot encoded training DataFrame.
        x_after: fully preprocessed training array.
        y_before, y_after: the labels passed to ``fit`` (stored unchanged).
        imputer, ss, sfm: the fitted scikit-learn transformers.
    """

    def __init__(self, missing_rate=0.3):
        """Store the missing-ratio threshold and start in the unfitted state."""
        self.missing_rate = missing_rate
        self.clear()

    def clear(self):
        """Forget every fitted transformer and cached dataset."""
        self.imputer = None
        self.ss = None
        self.sfm = None
        self.x_before = None
        self.x_after = None
        # Labels are reset too, so an unfitted handler never exposes stale
        # targets from a previous fit.
        self.y_before = None
        self.y_after = None
        self.feature_by_missing = None

    def fit(self, x, y):
        """Learn the column filter, encoding layout and transformers from ``x``/``y``.

        Args:
            x: training features as a pandas DataFrame.
            y: training labels; passed to the feature selector and stored as is.

        Raises:
            TypeError: if ``x`` is not a pandas DataFrame.
        """
        self.clear()
        if not isinstance(x, pd.DataFrame):
            raise TypeError(
                "x must be a pandas DataFrame, got {}".format(type(x).__name__)
            )
        self.y_before = y
        self.y_after = y
        # Keep the columns whose fraction of missing values is below the threshold.
        missing_ratio = x.isnull().sum() / x.shape[0]
        self.feature_by_missing = x.columns[missing_ratio < self.missing_rate]
        self.x_before = pd.get_dummies(x[self.feature_by_missing])
        self.x_after = self.data_preprocess(self.x_before.values, y)

    def data_preprocess(self, x, y=None):
        """Impute, standardize and feature-select the array ``x``.

        Each transformer is fitted on the first call after ``clear`` and only
        applied on later calls.

        Args:
            x: 2-D NumPy array of features.
            y: labels; required while the feature selector is still unfitted.

        Returns:
            The transformed array restricted to the selected features.

        Raises:
            TypeError: if ``x`` is not a NumPy array.
            ValueError: if the feature selector is unfitted and ``y`` is None.
        """
        # Explicit check instead of ``assert`` so it is not stripped under ``python -O``.
        if not isinstance(x, np.ndarray):
            raise TypeError(
                "x must be a numpy.ndarray, got {}".format(type(x).__name__)
            )
        # Fail before any transformer is fitted; otherwise the imputer and the
        # scaler would be fitted on this data and only then the selector crash.
        if self.sfm is None and y is None:
            raise ValueError(
                "y is required to fit the feature selector; call fit(x, y) first"
            )

        # Outliers: no handling implemented.

        # Missing values.
        # NOTE(review): sklearn.preprocessing.Imputer no longer exists in newer
        # scikit-learn releases (sklearn.impute.SimpleImputer replaces it); it is
        # kept here because it is what this file imports.
        if self.imputer is None:
            self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
            self.imputer.fit(x)
        x = self.imputer.transform(x)

        # Standardization.
        if self.ss is None:
            self.ss = StandardScaler()
            self.ss.fit(x)
        x = self.ss.transform(x)

        # Feature selection.
        if self.sfm is None:
            # The solver is named explicitly because the L1 penalty is not
            # supported by every LogisticRegression solver.
            self.sfm = SelectFromModel(
                LogisticRegression(penalty="l1", C=0.01, solver="liblinear")
            )
            self.sfm.fit(x, y)
        x = self.sfm.transform(x)

        return x

    def transform(self, x_test):
        """Apply the fitted preprocessing to ``x_test``.

        Args:
            x_test: DataFrame containing at least the columns kept by ``fit``.

        Returns:
            Tuple ``(x_after, y_after, x_test)``: the preprocessed training
            array, the training labels and the preprocessed ``x_test`` array.

        Raises:
            RuntimeError: if called before ``fit``.
            TypeError: if ``x_test`` is not a pandas DataFrame.
        """
        if self.x_before is None or self.feature_by_missing is None:
            raise RuntimeError("DataHandle.transform() called before fit()")
        if not isinstance(x_test, pd.DataFrame):
            raise TypeError(
                "x_test must be a pandas DataFrame, got {}".format(type(x_test).__name__)
            )
        x_test = pd.get_dummies(x_test[self.feature_by_missing])
        # Unify train and test features: use the training column layout, adding
        # dummy columns absent from the test data as 0 and dropping extras.
        # Reindexing only the test frame avoids the copy of the whole training
        # frame that DataFrame.align would create.
        x_test = x_test.reindex(columns=self.x_before.columns, fill_value=0)
        x_test = self.data_preprocess(x_test.values)
        return self.x_after, self.y_after, x_test


def get_auc_score(y_true, y_predict_proba):
    """Return the area under the ROC curve, with label 1 as the positive class.

    Args:
        y_true: true labels.
        y_predict_proba: scores for the positive class, one per label.

    Returns:
        The area computed by ``auc`` over the curve returned by ``roc_curve``.
    """
    false_pos_rate, true_pos_rate = roc_curve(y_true, y_predict_proba, pos_label=1)[:2]
    return auc(false_pos_rate, true_pos_rate)

In [12]:
dh = DataHandle()
# Look the raw rows of each split up in df_x by index. The exploratory cells
# earlier in the notebook overwrite df_x_train / df_x_test with column-filtered
# copies (and one-hot encode the training part only); fitting on those makes
# transform() fail with a KeyError, because the encoded training columns do not
# exist in the un-encoded hold-out frame.
dh.fit(df_x.loc[df_x_train.index], df_y_train)
x_train, y_train, x_test = dh.transform(df_x.loc[df_x_test.index])

In [17]:
# Fit a logistic regression with default settings on the preprocessed training data.
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# Hold-out AUC: score the probability column at index 1 against the held-out labels.
get_auc_score(df_y_test, lr.predict_proba(x_test)[:, 1])

0.7396479831646039

In [30]:
# Read the training and test application tables again (same paths as the first load cell).
df_train = pd.read_csv("../data/application_train.csv")
df_test = pd.read_csv("../data/application_test.csv")

In [39]:
# Full training table: features from column position 2 onward, labels as a NumPy array.
x_train, y_train = df_train.iloc[:, 2:], df_train['TARGET'].values

# Test table: the first column is set aside as the ID, the remaining columns are the features.
test_id, x_test = df_test.iloc[:, 0], df_test.iloc[:, 1:]

In [40]:
# Refit the preprocessing on the full training table prepared in the previous
# cell. Reusing the handler fitted on the hold-out split would discard that
# table and train the submission model on the split's training part only.
dh = DataHandle()
dh.fit(x_train, y_train)
x_train, y_train, x_test = dh.transform(x_test)

lr = LogisticRegression()
lr.fit(x_train, y_train)
y_predict_prob = lr.predict_proba(x_test)

In [48]:
# Build the submission table column by column so the integer IDs never pass
# through a float array (np.column_stack would upcast them next to the
# probabilities). 'TARGET' is the probability column at index 1.
result = pd.DataFrame(
    {
        'SK_ID_CURR': np.asarray(test_id).astype('int'),
        'TARGET': y_predict_prob[:, 1],
    },
    columns=['SK_ID_CURR', 'TARGET'],
)
result.to_csv('submission.csv', header=True, index=False)