# Task1.1

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# import data

In [2]:
data = pd.read_csv('data/data_all.csv', encoding='gbk')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4754 entries, 0 to 4753
Data columns (total 85 columns):
low_volume_percent                            4754 non-null float64
middle_volume_percent                         4754 non-null float64
take_amount_in_later_12_month_highest         4754 non-null int64
trans_amount_increase_rate_lately             4754 non-null float64
trans_activity_month                          4754 non-null float64
trans_activity_day                            4754 non-null float64
transd_mcc                                    4754 non-null float64
trans_days_interval_filter                    4754 non-null float64
trans_days_interval                           4754 non-null float64
regional_mobility                             4754 non-null float64
repayment_capability                          4754 non-null int64
is_high_user                                  4754 non-null int64
number_of_trans_from_2011                     4754 non-null float64
first_transacti

In [4]:
#split the feature and label of data
train = data.drop(['status'], axis=1)
label = data['status']

In [5]:
#split the training set and test set
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.3, random_state=2018)

In [6]:
print(X_train.shape, y_train.shape)

(3327, 84) (3327,)


# Construct Model
## metrics:accuracy, auc
## 1.Logistic Regression

In [7]:
#Default LR
LR_def = LogisticRegression()
LR_def.fit(X_train, y_train)
y_proba_LR_def = LR_def.predict_proba(X_test)[:,1]
y_pred_LR_def = np.where(y_proba_LR_def > 0.5, 1, 0)



In [8]:
accuracy_LR_def = accuracy_score(y_test, y_pred_LR_def)
auc_LR_def = roc_auc_score(y_test, y_pred_LR_def)

In [9]:
print(accuracy_LR_def, auc_LR_def)

0.7484232655921513 0.5


In [10]:
#Tune LR
LR = LogisticRegression()
#parameters to search
param_LR = {'C':[0.001,0.01,0.1,1,10,100,1000],
         'penalty':['l1','l2']}

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
#Grid Search for best parameters
LR_tuned = GridSearchCV(estimator=LR, param_grid=param_LR, cv=10)
LR_tuned.fit(X_train, y_train)









GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=2018, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
#best parameters
LR_tuned.best_params_

{'C': 10, 'penalty': 'l1'}

In [14]:
y_proba_LR_tuned = LR_tuned.predict_proba(X_test)[:,1]
y_pred_LR_tuned = np.where(y_proba_LR_tuned > 0.5, 1, 0)

In [15]:
accuracy_LR_tuned = accuracy_score(y_test, y_pred_LR_tuned)
auc_LR_tuned = roc_auc_score(y_test, y_pred_LR_tuned)

In [16]:
print(accuracy_LR_tuned, auc_LR_tuned)

0.7834618079887876 0.6297364193087331


# 2.SVM

In [30]:
#Default SVM
svc_def = LinearSVC()
svc_def.fit(X_train, y_train)
y_pred_svc_def = svc_def.predict(X_test)



In [31]:
accuracy_svc_def = accuracy_score(y_test, y_pred_svc_def)
auc_svc_def = roc_auc_score(y_test, y_pred_svc_def)

In [32]:
print(accuracy_svc_def, auc_svc_def)

0.7484232655921513 0.5


In [40]:
#Tune SVM
svc = LinearSVC()
param_svc = {'C':[0.001,0.01,0.1,1,10,100,1000]}

In [41]:
svc_tuned = GridSearchCV(estimator=svc, param_grid=param_svc, cv=10)

In [42]:
svc_tuned.fit(X_train, y_train)





GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
svc_tuned.best_params_

{'C': 0.01}

In [44]:
y_pred_svc_tuned = svc_tuned.predict(X_test)

In [45]:
accuracy_svc_tuned = accuracy_score(y_test, y_pred_svc_tuned)
auc_svc_tuned = roc_auc_score(y_test, y_pred_svc_tuned)

In [46]:
print(accuracy_svc_tuned, auc_svc_tuned)

0.7484232655921513 0.5


# 3.Decision Tree

In [48]:
#Default DecisionTree
DT_def = DecisionTreeClassifier()
DT_def.fit(X_train, y_train)
y_proba_DT_def = DT_def.predict_proba(X_test)[:,1]
y_pred_DT_def = np.where(y_proba_DT_def>0.5, 1, 0)

In [49]:
accuracy_DT_def = accuracy_score(y_test, y_pred_DT_def)
auc_DT_def = roc_auc_score(y_test, y_pred_DT_def)

In [50]:
print(accuracy_DT_def, auc_DT_def)

0.6706377014716188 0.5756275233951988


In [51]:
#Tune DecisionTree
DT = DecisionTreeClassifier()
param_DT = {'max_features':['auto','sqrt','log2'],
          'min_samples_leaf':range(1,100,1),
          'max_depth':range(1,20,1)}

In [52]:
DT_tuned = GridSearchCV(estimator=DT, param_grid=param_DT, cv=10)

In [53]:
DT_tuned.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_leaf': range(1, 100), 'max_depth': range(1, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [54]:
DT_tuned.best_params_

{'max_depth': 11, 'max_features': 'auto', 'min_samples_leaf': 62}

In [55]:
y_proba_DT_tuned = DT_tuned.predict_proba(X_test)[:,1]
y_pred_DT_tuned = np.where(y_proba_DT_tuned > 0.5, 1, 0)

In [57]:
accuracy_DT_tuned = accuracy_score(y_test, y_pred_DT_tuned)
auc_DT_tuned = roc_auc_score(y_test, y_pred_DT_tuned)

In [58]:
print(accuracy_DT_tuned, auc_DT_tuned)

0.7610371408549405 0.5537320167339572
