# Предсказание

In [7]:
import sklearn
import pandas as pd
import numpy as np
from sklearn import preprocessing
import xgboost as xgb
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
# Show floats in rendered tables with a thousands separator and three decimals.
pd.options.display.float_format ='{:,.3f}'.format

In [5]:
def prepare_data_v1(data):
    """Encode categorical columns and normalise a few raw columns.

    Steps performed on a new frame (the input is not modified):
      * one-hot encode 'Home Ownership' (prefix 'ho') and 'Purpose' (prefix 'p');
      * map 'Years in current job' labels to numbers ('< 1 year' -> 0.5,
        '10+ years' -> 10, 'N years' -> N);
      * map 'Term' to 1 for 'Long Term' and 0 for 'Short Term';
      * turn the value 99999999 in 'Current Loan Amount' into NaN;
      * divide 'Credit Score' values above 1000 by 10.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        Frame with dummy columns appended and the four columns above rewritten.
        Labels that are missing or not present in a mapping become NaN.
    """
    years_in_job_map = {'< 1 year':0.5, '10+ years':10, '2 years':2, '3 years':3, '5 years':5, '1 year':1, '4 years':4, '6 years':6, '7 years':7, '8 years':8, '9 years':9}
    term_map = {'Long Term':1, 'Short Term':0}

    prepared_data = pd.get_dummies(data, prefix=['ho', 'p'], columns=['Home Ownership', 'Purpose'], drop_first=False)

    # .map is used instead of .replace(mapping, None): passing None positionally
    # as the replacement value is version-sensitive in pandas, and .map yields a
    # numeric column directly (unknown labels become NaN rather than staying text).
    prepared_data['Years in current job'] = data['Years in current job'].map(years_in_job_map)
    prepared_data['Term'] = data['Term'].map(term_map)

    # 99999999 is treated as "no value" here and imputed later.
    prepared_data['Current Loan Amount'] = data['Current Loan Amount'].replace(99999999, np.nan)

    # Scores above 1000 are scaled down by 10 (presumably entered with an extra
    # digit - confirm against the data source). NaN compares False and is kept.
    credit_score = data['Credit Score']
    prepared_data['Credit Score'] = credit_score.mask(credit_score > 1000, credit_score / 10)
    return prepared_data

def fill_na_v1(data):
    """Impute missing values and return the result as a new frame.

    Imputation rules:
      * 'Annual Income', 'Current Loan Amount', 'Credit Score' and
        'Months since last delinquent' -> mean of the respective column;
      * 'Years in current job' -> 0;
      * 'Bankruptcies' -> 0 where 'Number of Credit Problems' equals 0,
        otherwise 1 (a missing problem count also yields 1).

    Means are computed over whatever rows are passed in, so calling this on a
    combined train+test frame uses statistics from both parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the gaps filled; ``data`` itself is left untouched.
    """
    # Work on a copy: a plain alias would write the filled columns back into
    # the caller's frame.
    prepared_data = data.copy()

    mean_filled_columns = (
        'Annual Income',
        'Current Loan Amount',
        'Credit Score',
        'Months since last delinquent',
    )
    for column in mean_filled_columns:
        prepared_data[column] = data[column].fillna(data[column].mean())

    prepared_data['Years in current job'] = data['Years in current job'].fillna(0)

    # Fallback flag: 0 when there are no credit problems, 1 otherwise.
    # NaN != 0 evaluates to True, so a missing count maps to 1.
    bankruptcy_fallback = (data['Number of Credit Problems'] != 0).astype(int)
    prepared_data['Bankruptcies'] = data['Bankruptcies'].fillna(bankruptcy_fallback)
    return prepared_data

def pipeline_v1(data):
    """Run the v1 preprocessing chain: prepare_data_v1 followed by fill_na_v1."""
    return fill_na_v1(prepare_data_v1(data))

In [3]:
# Read the train and test CSV files into separate frames.
train_data_df, test_data_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

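Объединим тестовые данные с тренировочными, чтобы предобработка (кодирование категориальных признаков и заполнение пропусков) была одинаковой для обеих выборок. Обратите внимание: средние значения для заполнения пропусков при этом считаются по объединённой выборке.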

In [4]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. Train rows come first, test rows after them. Row labels are
# kept as they are (no ignore_index), matching what append produced.
data_df = pd.concat([train_data_df, test_data_df])
data_df

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0.0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1.0
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0.0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0.0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,Home Mortgage,1020053.0,10+ years,0.0,14.0,29.1,559152.0,1.0,68.0,1.0,debt consolidation,Short Term,99999999.0,162735.0,15046.0,745.0,
2496,Home Mortgage,,2 years,0.0,15.0,17.0,1737780.0,0.0,77.0,0.0,debt consolidation,Short Term,468512.0,1439269.0,32996.0,,
2497,Home Mortgage,1171806.0,2 years,0.0,48.0,12.8,1706430.0,0.0,,0.0,debt consolidation,Short Term,430496.0,676438.0,36912.0,695.0,
2498,Rent,723520.0,10+ years,0.0,14.0,28.8,945780.0,0.0,,0.0,debt consolidation,Short Term,257774.0,391248.0,13506.0,744.0,


In [8]:
# Run the v1 preprocessing on the combined frame.
data_prepared_df = pipeline_v1(data_df)
# Summary statistics for the first 14 columns of the prepared frame.
data_prepared_df.iloc[:,:14].describe()

Unnamed: 0,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1366524.51,5.667,0.031,11.144,18.32,886507.897,0.168,34.565,0.114,0.258,309988.802,291474.089,18303.676,719.95
std,769233.192,3.693,0.304,4.895,7.095,13899818.885,0.515,14.703,0.349,0.438,171411.57,333997.864,11831.648,24.843
min,106533.0,0.0,0.0,1.0,3.9,0.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0
25%,932852.5,2.0,0.0,8.0,13.5,278811.5,0.0,34.565,0.0,0.0,180548.5,113225.75,10059.75,715.0
50%,1366524.51,6.0,0.0,10.0,17.0,478181.0,0.0,34.565,0.0,0.0,309988.802,209019.0,16200.0,719.95
75%,1498297.25,10.0,0.0,14.0,22.0,794359.5,0.0,34.565,0.0,1.0,397342.0,361950.0,23888.25,738.0
max,14975610.0,10.0,15.0,48.0,57.7,1304726170.0,15.0,118.0,5.0,1.0,789096.0,6506797.0,136679.0,751.0


In [26]:
target = 'Credit Default'

# The combined frame is stacked train-first, so the first len(train_data_df)
# rows are the training part. Deriving the boundary from the loaded train frame
# avoids a hard-coded row count, and the raw train_data_df is no longer
# overwritten, so earlier cells can be re-run safely.
n_train = len(train_data_df)
features_df = data_prepared_df.drop(columns=target)

X_train = np.array(features_df.iloc[:n_train])
y_train = np.array(data_prepared_df[target].iloc[:n_train].astype(int))
X_test = np.array(features_df.iloc[n_train:])

In [27]:
# Fixed seed so the random under-sampling and the resulting predictions are
# the same on every run (random_state=None gave different results each time).
RANDOM_STATE = 42

# NOTE(review): scikit-learn has deprecated algorithm='SAMME.R' for AdaBoost-style
# estimators in recent releases - confirm it is still accepted by the installed
# imbalanced-learn / scikit-learn versions.
model = RUSBoostClassifier(n_estimators=40, learning_rate=0.6, algorithm='SAMME.R', sampling_strategy='auto', replacement=False, random_state=RANDOM_STATE)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [40]:
# Wrap the predictions in a single-column frame and write them to predict.csv
# without the row index.
# NOTE(review): the column gets the default name 0, which becomes the CSV
# header - confirm this matches the expected submission format.
predict_df = pd.DataFrame(y_pred)
predict_df.to_csv('predict.csv', index=False)
# Show how many rows were predicted for each class.
predict_df.value_counts()

0    1270
1    1230
dtype: int64