In [3]:
#!wget https://github.com/dsbattle/hackathon-mkb/raw/master/train_dataset_hackathon_mkb.csv
#!wget https://github.com/dsbattle/hackathon-mkb/raw/master/test_dataset_hackathon_mkb.csv

In [4]:
#pip install category_encoders

In [1]:
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from category_encoders import CountEncoder
from datetime import datetime
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
from collections import deque
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import multiprocessing
import xgboost as xgb
from copy import deepcopy

  import pandas.util.testing as tm


In [2]:
# Both competition files share one layout: ';'-separated and cp1251-encoded.
csv_options = dict(sep=';', encoding='cp1251')
initial_train = pd.read_csv('./train_dataset_hackathon_mkb.csv', **csv_options)
initial_test = pd.read_csv('./test_dataset_hackathon_mkb.csv', **csv_options)

In [3]:
columns = initial_train.columns.to_list()

# Numeric column families, recognised purely by substrings of the column name.
financial_columns = [name for name in columns if name.startswith(('F1', 'F2'))]
discrete_markers = ('NUMBER', 'COUNT', 'THIRDOROTH', 'DISQ')
discrete_columns = [name for name in columns if any(marker in name for marker in discrete_markers)]
sum_columns = [name for name in columns if 'SUM' in name]
numeric_columns = financial_columns + discrete_columns + sum_columns

# Columns removed by preprocess() before any feature engineering.
useless_columns = [
    'id_contract',
    'EGRPOINCLUDED',
    'TAXREG_REGDATE',
    'TAXREGPAY_REGDATE',
    'BIRTHDATE',
    'AGE',
]
date_columns = ['SIGN_DATE', 'DATEFIRSTREG']
target_col = ['TARGET']

# Categorical columns, kept in two lists ("few" / "many") that are concatenated below.
categorical_few_columns = [
    'CITIZENSHIP_NAME',
    'OKFS_GROUP',
    'OKOGU_GROUP',
    'OKOPF_GROUP',
    'OKTMO_FED',
    'SEX_NAME',
    'OKATO_FED',
    'WORKERSRANGE',
]
categorical_many_columns = ['OKATO_REGIONCODE', 'OKTMO_CODE', 'OKVED_CODE']
categorical_columns = categorical_few_columns + categorical_many_columns

# From here on `columns` only lists the columns that survive preprocessing.
columns = [name for name in columns if name not in useless_columns]

train_length = len(initial_train)
test_length = len(initial_test)

In [4]:
def preprocess(df, date_format='%d%b%Y:%H:%M:%S'):
    """Clean a raw contracts frame and derive the date-based features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as loaded from the CSV. It must contain every column listed
        in ``useless_columns``, ``date_columns`` and ``numeric_columns``.
    date_format : str, optional
        ``strptime``-style layout of the raw date strings in ``date_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) in which

        * the columns in ``useless_columns`` are dropped,
        * the columns in ``date_columns`` are parsed to datetimes
          (missing values become ``NaT``),
        * ``day_of_the_year`` holds the day-of-year of ``SIGN_DATE``,
        * ``days_registred`` holds the whole days between ``DATEFIRSTREG``
          and ``SIGN_DATE`` (missing when either date is missing),
        * missing values in ``numeric_columns`` are replaced by 0.
    """
    result = df.drop(columns=useless_columns)

    # One vectorised parse per column instead of a Python-level strptime per row.
    for col in date_columns:
        result[col] = pd.to_datetime(result[col], format=date_format)

    # The .dt accessors propagate missing dates as NaN instead of raising.
    result['day_of_the_year'] = result['SIGN_DATE'].dt.dayofyear
    result['days_registred'] = (result['SIGN_DATE'] - result['DATEFIRSTREG']).dt.days

    result[numeric_columns] = result[numeric_columns].fillna(0)

    return result

In [5]:
# Run the identical preprocessing on the train and the test frame.
train, prep_test = map(preprocess, (initial_train, initial_test))

In [6]:
# X deliberately keeps the TARGET column: CustomTransformer.fit reads it from the
# frame, and CustomTransformer.transform drops it before the model sees the data.
X = train.copy()
y = train.loc[:, 'TARGET']

In [7]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Add per-client history features and target-encode the categorical columns.

    The transformer is stateful across ``transform`` calls:

    * ``fit`` resets the per-client history and fits the target encoder.
    * The FIRST ``transform`` call after ``fit`` (``flag == 0``) writes the rows
      it processes into the history stored on the instance.
    * Every LATER ``transform`` call starts from a deep copy of that history,
      so it continues the counts without modifying the stored state.

    Rows are processed in frame order; the history features of a row depend on
    the rows of the same client that were processed before it.
    """

    def __init__(self):
        # defaultdict(int) instead of a lambda factory keeps the estimator picklable.
        self.occur = defaultdict(int)            # id_client -> number of rows seen so far
        self.sum_last_3_days = defaultdict(int)  # id_client -> running sum of window sizes
        self.last_dates = defaultdict(deque)     # id_client -> SIGN_DATEs inside the window
        self.target_encoder = None
        self.days_registred_mean = None
        self.without_dupl = None
        self.flag = None                         # None: not fitted, 0: fitted, 1: transform was called

    def fit(self, train, y=None):
        """Reset the history and fit the target encoder.

        Parameters
        ----------
        train : pandas.DataFrame
            Preprocessed training frame. It must contain ``id_client``,
            ``days_registred`` and the columns in ``categorical_columns``.
            The target is read from its ``TARGET`` column when present.
        y : array-like, optional
            Used as the target only when ``train`` has no ``TARGET`` column.

        Returns
        -------
        CustomTransformer
            ``self``.

        Raises
        ------
        KeyError
            If ``train`` has no ``TARGET`` column and ``y`` is not given.
        """
        self.occur = defaultdict(int)
        self.sum_last_3_days = defaultdict(int)
        self.last_dates = defaultdict(deque)
        self.flag = 0
        train = train.copy()

        if 'TARGET' not in train.columns:
            if y is None:
                raise KeyError("'TARGET' column is missing and no y was passed")
            train['TARGET'] = np.asarray(y)

        # Per-client median of the target, broadcast back to every row of the client.
        # (The column is called "mean_TARGET" but holds a median.)
        # Selecting TARGET before aggregating avoids aggregating the non-numeric
        # columns, and transform() keeps the original index, so the result stays
        # aligned with `train` even when its index is not 0..n-1.
        train['mean_TARGET'] = train.groupby('id_client')['TARGET'].transform('median')

        # Rows that differ only in the contract-specific columns are collapsed to the last one.
        ignored_for_dedup = ['SIGN_DATE', 'TARGET', 'days_registred', 'day_of_the_year', 'index']
        dedup_subset = [col for col in train.columns.to_list() if col not in ignored_for_dedup]
        without_dupl = train.drop_duplicates(subset=dedup_subset, keep='last')

        target_enc_columns = categorical_columns
        target_encoder = TargetEncoder(cols=target_enc_columns, handle_missing='value', handle_unknown='value',
                                       min_samples_leaf=31, smoothing=15)
        # The encoder target is the per-client median rounded up to the next integer.
        target_encoder.fit(X=without_dupl[target_enc_columns], y=np.ceil(without_dupl['mean_TARGET']))

        self.target_encoder = target_encoder
        self.days_registred_mean = train['days_registred'].mean()
        self.without_dupl = without_dupl
        return self

    def transform(self, test, y=None):
        """Build the model features for ``test``.

        Parameters
        ----------
        test : pandas.DataFrame
            Preprocessed frame containing ``id_client``, ``days_registred``,
            the columns in ``date_columns`` and in ``categorical_columns``.
            ``TARGET`` / ``mean_TARGET`` columns are dropped if present.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            New frame (the input is not modified) in which ``id_client``, the
            date columns and the raw categorical columns are removed and these
            columns are added:

            * ``n_before`` - rows of the same client processed before this one,
            * ``last_3_days`` - rows of the same client whose ``SIGN_DATE`` is
              0 to 2 days before this row's ``SIGN_DATE`` (this row included),
            * ``mean_last_3_days`` - running mean of ``last_3_days`` per client,
            * ``targ_enc_<col>`` - target encoding of each categorical column.
        """
        # drop() returns a new frame, so the caller's frame is never modified.
        test = test.drop(columns=['mean_TARGET', 'TARGET'], errors='ignore')

        if self.flag == 0:
            # First call after fit: update the stored history in place.
            occur = self.occur
            sum_last_3_days = self.sum_last_3_days
            last_dates = self.last_dates
        else:
            # Later calls: continue from the stored history without changing it.
            occur = deepcopy(self.occur)
            sum_last_3_days = deepcopy(self.sum_last_3_days)
            last_dates = deepcopy(self.last_dates)
        target_encoder = self.target_encoder
        self.flag = 1

        n_before = []
        last_3_days = []
        mean_last_3_days = []
        # Iterate over the two needed columns only; building a row Series per
        # record (test.iloc[i, :]) is far slower and yields the same values.
        for client_id, date in zip(test['id_client'], test['SIGN_DATE']):
            seen = occur[client_id]
            n_before.append(seen)
            occur[client_id] = seen + 1

            window = last_dates[client_id]
            window.append(date)
            # Evict dates more than 2 days older than, or later than, the current one.
            # The loop always terminates: the date appended above has a difference of 0.
            while (date - window[0]).days > 2 or (date - window[0]).days < 0:
                window.popleft()
            last_3_days.append(len(window))

            sum_last_3_days[client_id] += len(window)
            mean_last_3_days.append(sum_last_3_days[client_id] / (seen + 1))

        test['n_before'] = n_before
        test['last_3_days'] = last_3_days
        test['mean_last_3_days'] = mean_last_3_days

        test['days_registred'] = test['days_registred'].fillna(self.days_registred_mean)
        test = test.drop(columns=date_columns)

        target_enc_columns = categorical_columns
        encoded = target_encoder.transform(X=test[target_enc_columns])
        test = test.assign(**{'targ_enc_' + col: encoded[col] for col in encoded.columns.to_list()})
        test = test.drop(columns=target_enc_columns)
        test = test.drop(columns=['id_client'])

        return test

In [8]:
#cust = CustomTransformer()

#cust.fit(X)
#new = cust.transform(X)

In [9]:
#second = cust.transform(X)
#third = cust.transform(X)

In [10]:
# max(third['n_before'])

In [11]:
#s = (third == second).all() == False
#s[s].index.to_list()

In [8]:
# Positional hold-out split: the first 12500 rows train the model, the rest validate it.
split_at = 12500
X_train, X_val = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_val = y.iloc[:split_at], y.iloc[split_at:]

In [9]:
# Class weights handed to the classifier; both classes currently weigh the same.
w = {0: 1, 1: 1}

# Alternative models, kept commented out for reference:
#logit = GradientBoostingClassifier(n_estimators=50, learning_rate=0.2, max_depth=3, min_samples_leaf=2, random_state=0, tol=1e-4)
#logit = LogisticRegression(penalty='l2', solver='liblinear', C=0.2, class_weight=w, max_iter=10000)
#logit = xgb.XGBClassifier(objective="binary:logistic", random_state=42, n_estimators=200, learning_rate=0.2, max_depth=3)

# NOTE: the name `logit` is historical; the estimator in use is a random forest.
logit = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=5,
    class_weight=w,
    random_state=0,
    n_jobs=multiprocessing.cpu_count(),
)

In [10]:
# Hold-out evaluation: fit on the training slice, score ROC AUC on the validation slice.
pipe = Pipeline(steps=[
    ('custom', CustomTransformer()),
    ('scaler', StandardScaler()),
    ('logit', logit),
])
pipe.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1.
y_score = pipe.predict_proba(X_val)[:, 1]

roc_auc_score(y_true=y_val, y_score=y_score)

0.9577973010231075

In [None]:
# 0.9572867905125969  -- Random Forest (presumably an earlier validation ROC AUC, kept for reference)

In [11]:
# Final model: a fresh pipeline fitted on the complete training data.
pipe = Pipeline(steps=[
    ('custom', CustomTransformer()),
    ('scaler', StandardScaler()),
    ('logit', logit),
])

pipe.fit(X, y)

Pipeline(steps=[('custom', CustomTransformer()), ('scaler', StandardScaler()),
                ('logit',
                 RandomForestClassifier(class_weight={0: 1, 1: 1},
                                        min_samples_leaf=5, n_estimators=200,
                                        n_jobs=2, random_state=0))])

In [12]:
# Score only the clients that occur exactly once in the training data.
# NOTE: these rows are a subset of X, the data `pipe` was fitted on,
# so this is not an out-of-sample estimate.
occ = dict(initial_train['id_client'].value_counts())
appears_once = X['id_client'].map(occ).eq(1)
validate = X[appears_once]

X_2 = validate.drop(columns=['TARGET'])
y_2 = validate['TARGET']
y_score = pipe.predict_proba(X_2)[:, 1]

roc_auc_score(y_true=y_2, y_score=y_score)

0.8463324155957194

In [13]:
# id_contract is dropped by preprocess(), so take it from the raw test frame.
test_id_contract = initial_test['id_contract']
test = prep_test

# Predicted probability of class 1 next to the contract id it belongs to.
submission = pd.DataFrame({
    'TARGET': pipe.predict_proba(test)[:, 1],
    'id_contract': test_id_contract,
})

In [14]:
# Preview the first rows in submission column order (Random Forest predictions).
submission.head()[['id_contract', 'TARGET']]

Unnamed: 0,id_contract,TARGET
0,17892,0.026769
1,17893,0.141849
2,17894,0.152393
3,17895,0.260421
4,17896,0.797953


In [15]:
# Write the submission file: ';'-separated, id first, without the row index.
submission.to_csv('submit_file_1.csv', columns=['id_contract', 'TARGET'], sep=';', index=False)