In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

### 匯入資料

In [2]:
dir_data = "./data/"
f_app_train = os.path.join(dir_data,"application_train.csv")
f_app_test = os.path.join(dir_data,"application_test.csv")

app_train = pd.read_csv(f_app_train)
app_test = pd.read_csv(f_app_test)

### Label Encoding

In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le_count = 0

for col in app_train:
    if app_train[col].dtype == "object":
        if len(list(app_train[col].unique())) <=2:
            le.fit(app_train[col])
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])
            
            le_count += 1
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

### 異常值處理

In [4]:
app_train['DAYS_EMPLOYED'].describe()

count    307511.000000
mean      63815.045904
std      141275.766519
min      -17912.000000
25%       -2760.000000
50%       -1213.000000
75%        -289.000000
max      365243.000000
Name: DAYS_EMPLOYED, dtype: float64

In [5]:
app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED']==365243
app_train['DAYS_EMPLOYED'].replace({365243:np.nan}, inplace = True)
app_test['DAYS_EMPLOYED_ANOM'] = app_test['DAYS_EMPLOYED']==365243
app_test['DAYS_EMPLOYED'].replace({365243:np.nan}, inplace = True)

In [6]:
app_train['DAYS_BIRTH'].describe()

count    307511.000000
mean     -16036.995067
std        4363.988632
min      -25229.000000
25%      -19682.000000
50%      -15750.000000
75%      -12413.000000
max       -7489.000000
Name: DAYS_BIRTH, dtype: float64

In [7]:
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])

#### 讓app_train,app_test的欄位數一致

In [8]:
train_labels = app_train['TARGET']

app_train,app_test = app_train.align(app_test, join = 'inner', axis=1)

#### 空缺值填補,標準化

In [10]:
from sklearn.preprocessing import MinMaxScaler,Imputer

train = app_train
features = list(train.columns)

test = app_test

imputer = Imputer(strategy='median')

scaler = MinMaxScaler(feature_range=(0,1))

imputer.fit(train)

train = imputer.transform(train)
test = imputer.transform(test)

scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('training data shape= ',train.shape)
print('testing data shape= ',test.shape)

training data shape=  (307511, 240)
testing data shape=  (48744, 240)


#### fit model: logistic regression

In [13]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(C = 0.0001, solver='liblinear')

log_reg.fit(train, train_labels)

LogisticRegression(C=0.0001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

### 測試模型

In [19]:
log_reg_pred = log_reg.predict_proba(test)[:,1]

In [20]:
log_reg_pred

array([0.06505115, 0.12640086, 0.08123883, ..., 0.05702381, 0.07413523,
       0.08983229])

### 儲存結果

In [21]:
submit = app_test[['SK_ID_CURR']]
submit['TARGET'] = log_reg_pred
submit.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.065051
1,100005,0.126401
2,100013,0.081239
3,100028,0.061509
4,100038,0.128308


In [22]:
submit.to_csv(os.path.join(dir_data,'submit.csv'), index=False)