In [1]:
import warnings
from tqdm import tqdm
from typing import List, Tuple

import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin

from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion
from typing import List, Optional



from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
warnings.simplefilter("ignore")
%matplotlib inline

from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [2]:
train = pd.read_csv('../lesson_03/data/assignment_2_train.csv')
test = pd.read_csv('../lesson_03/data/assignment_2_test.csv')

In [3]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]

In [4]:
def feature_selector(X):
    numerical_features = X.select_dtypes(include=[np.number])
    numerical_features = numerical_features.columns.tolist()
    
    cat_features = X.select_dtypes(include=[np.object])
    cat_features = cat_features.columns.tolist()
    
    dummies_features = pd.get_dummies(X[cat_features])
    X = pd.concat([X, dummies_features], axis=1)
    dummies_features = dummies_features.columns.tolist()
    X = X.drop(cat_features, axis=1)
    return X

In [5]:
def time_feature(df, origin='2017-12-01'):
    df['TransactionDT'] = pd.to_datetime(df['TransactionDT'], unit='s', origin=origin)
    
    df["year"] = df["TransactionDT"].dt.year
    df["month_of_year"] = df["TransactionDT"].dt.month
    df["day_of_week"] = df["TransactionDT"].dt.weekday
    df["hour_of_day"] = df["TransactionDT"].dt.hour
    df["day_of_year"] = df["TransactionDT"].dt.day
    return df

In [6]:
def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """
    Построение агрегаций для числовых признаков.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Выборка для построения агрегаций.

    groupby_id: str
        Название ключа, по которому нужно произвести группировку.

    aggs: dict
        Словарь с названием признака и списка функций.
        Ключ словаря - название признака, который используется для
        вычисления агрегаций, значение словаря - список с названием
        функций для вычисления агрегаций.

    prefix: str, optional, default = None
        Префикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    suffix: str, optional, default = None
        Суффикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    stats: pandas.core.frame.DataFrame
        Выборка с рассчитанными агрегациями.

    """
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    data_grouped = data.groupby(groupby_id)
    stats = data_grouped.agg(aggs)
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".upper() for feature, stat in stats]
    stats = stats.reset_index()

    return stats

In [7]:
X = train.drop(['isFraud'], axis=1)
y = train['isFraud']
print("data.shape = {} rows, {} cols".format(*X.shape))
X_leader_board = test.drop(['isFraud'], axis=1)
y_leader_board = test['isFraud']
print("test.shape = {} rows, {} cols".format(*test.shape))

data.shape = 180000 rows, 393 cols
test.shape = 100001 rows, 394 cols


In [8]:
X = feature_selector(X)
X_leader_board = feature_selector(X_leader_board)
drop_features = list(set(X) - set(X_leader_board))
X = X.drop(drop_features, axis=1)

In [9]:
result = []

In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Columns: 526 entries, TransactionID to M9_T
dtypes: float64(376), int64(3), uint8(147)
memory usage: 545.7 MB


In [11]:
x_train, x_valid, y_train, y_valid = train_test_split(X, 
                                                    y, test_size=0.3, random_state=0)

Задание 0: выбрать любую модель машнного обучения и зафиксировать любой тип валидации. Обучить базовую модель и зафиксировать базовое качество модели. В каждом следующем задании нужно будет обучить выбранную модель и оценивать ее качество на зафиксированной схеме валидации. После каждого задания, требуется сделать вывод о достигаемом качестве модели, по сравнению с качестом из предыдущего шага.

In [12]:
cb_params = {
    "n_estimators": 10000,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "learning_rate":0.01,
    "max_bin": 40,
    "verbose": 200,
    "max_depth": 7,
    "l2_leaf_reg": 200,
    "early_stopping_rounds": 50,
    "thread_count": -1,
    "random_seed": 42
}


In [13]:
%%time

model = cat.CatBoostClassifier(**cb_params)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)])

0:	test: 0.6562053	test1: 0.6451519	best: 0.6451519 (0)	total: 104ms	remaining: 17m 19s
200:	test: 0.8635245	test1: 0.8544291	best: 0.8544291 (200)	total: 10.5s	remaining: 8m 29s
400:	test: 0.8742339	test1: 0.8665657	best: 0.8665657 (400)	total: 21.1s	remaining: 8m 25s
600:	test: 0.8832549	test1: 0.8764604	best: 0.8764604 (600)	total: 32.1s	remaining: 8m 22s
800:	test: 0.8901276	test1: 0.8839167	best: 0.8839167 (800)	total: 43s	remaining: 8m 13s
1000:	test: 0.8939753	test1: 0.8876712	best: 0.8876712 (1000)	total: 53.6s	remaining: 8m 2s
1200:	test: 0.8967489	test1: 0.8901686	best: 0.8901686 (1200)	total: 1m 4s	remaining: 7m 48s
1400:	test: 0.8996663	test1: 0.8923434	best: 0.8923434 (1400)	total: 1m 14s	remaining: 7m 36s
1600:	test: 0.9019382	test1: 0.8940234	best: 0.8940234 (1600)	total: 1m 24s	remaining: 7m 24s
1800:	test: 0.9032742	test1: 0.8949742	best: 0.8949742 (1800)	total: 1m 34s	remaining: 7m 11s
2000:	test: 0.9041266	test1: 0.8956675	best: 0.8956675 (2000)	total: 1m 44s	remaini

<catboost.core.CatBoostClassifier at 0x7f0627357c40>

In [14]:
train_score = roc_auc_score(y_train, model.predict(x_train))
valid_score = roc_auc_score(y_valid, model.predict(x_valid))
leader_board = roc_auc_score(y_leader_board, model.predict(X_leader_board))

print(f"Train-score: {round(train_score, 3)}, Validation-score: {round(valid_score, 3)}, Leader_board-score: {round(leader_board, 3)}")

result.append({"method":"cat_boost_basic" ,"Train-score" : round(train_score, 3), "Validation-score" : round(valid_score, 3),
               "Leader_board-score" : round(leader_board, 3)    })

Train-score: 0.729, Validation-score: 0.694, Leader_board-score: 0.649


### Задание 1: признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.

In [15]:
time_feature(X)
time_feature(X_leader_board)

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,M7_T,M8_F,M8_T,M9_F,M9_T,year,month_of_year,day_of_week,hour_of_day,day_of_year
0,3287000,2018-02-24 19:43:58,226.000,12473,555.0,150.0,226.0,299.0,87.0,116.0,...,0,1,0,0,1,2018,2,5,19,24
1,3287001,2018-02-24 19:44:14,3072.000,15651,417.0,150.0,226.0,330.0,87.0,,...,0,0,0,0,0,2018,2,5,19,24
2,3287002,2018-02-24 19:44:41,319.950,13844,583.0,150.0,226.0,126.0,87.0,9.0,...,0,1,0,0,1,2018,2,5,19,24
3,3287003,2018-02-24 19:45:11,171.000,11556,309.0,150.0,226.0,181.0,87.0,3.0,...,0,0,1,0,1,2018,2,5,19,24
4,3287004,2018-02-24 19:45:12,107.950,10985,555.0,150.0,226.0,231.0,87.0,0.0,...,1,0,1,0,1,2018,2,5,19,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,3386996,2018-03-27 19:12:08,368.990,13964,496.0,150.0,224.0,299.0,87.0,,...,0,1,0,0,1,2018,3,1,19,27
99997,3386997,2018-03-27 19:12:13,445.330,10616,583.0,150.0,226.0,472.0,87.0,,...,0,0,0,0,0,2018,3,1,19,27
99998,3386998,2018-03-27 19:12:24,15.226,9803,583.0,150.0,226.0,,,,...,0,0,0,0,0,2018,3,1,19,27
99999,3386999,2018-03-27 19:12:29,34.742,16062,500.0,185.0,137.0,284.0,60.0,,...,0,0,0,0,0,2018,3,1,19,27


In [16]:
x_train, x_valid, y_train, y_valid = train_test_split(X, 
                                                    y, test_size=0.3, random_state=0)

In [17]:
%%time

model = cat.CatBoostClassifier(**cb_params)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)])

0:	test: 0.6860462	test1: 0.6655538	best: 0.6655538 (0)	total: 48.7ms	remaining: 8m 6s
200:	test: 0.8610950	test1: 0.8512871	best: 0.8513444 (198)	total: 9.6s	remaining: 7m 48s
400:	test: 0.8734436	test1: 0.8651453	best: 0.8651453 (400)	total: 20s	remaining: 7m 59s
600:	test: 0.8833901	test1: 0.8766997	best: 0.8766997 (600)	total: 31s	remaining: 8m 4s
800:	test: 0.8899149	test1: 0.8834177	best: 0.8834177 (800)	total: 42s	remaining: 8m 2s
1000:	test: 0.8939240	test1: 0.8874060	best: 0.8874060 (1000)	total: 52.2s	remaining: 7m 48s
1200:	test: 0.8971215	test1: 0.8903279	best: 0.8903279 (1200)	total: 1m 2s	remaining: 7m 34s
1400:	test: 0.8999983	test1: 0.8925527	best: 0.8925527 (1400)	total: 1m 11s	remaining: 7m 21s
1600:	test: 0.9022965	test1: 0.8942037	best: 0.8942038 (1599)	total: 1m 21s	remaining: 7m 8s
1800:	test: 0.9028077	test1: 0.8945693	best: 0.8945694 (1799)	total: 1m 30s	remaining: 6m 52s
2000:	test: 0.9038867	test1: 0.8952456	best: 0.8952456 (2000)	total: 1m 39s	remaining: 6m 3

<catboost.core.CatBoostClassifier at 0x7f06276171f0>

In [18]:
train_score = roc_auc_score(y_train, model.predict(x_train))
valid_score = roc_auc_score(y_valid, model.predict(x_valid))
leader_board = roc_auc_score(y_leader_board, model.predict(X_leader_board))

print(f"Train-score: {round(train_score, 3)}, Validation-score: {round(valid_score, 3)}, Leader_board-score: {round(leader_board, 3)}")

result.append({"method":"cat_boost_add_datetime_features" ,"Train-score" : round(train_score, 3), "Validation-score" : round(valid_score, 3),
               "Leader_board-score" : round(leader_board, 3)    })

Train-score: 0.705, Validation-score: 0.674, Leader_board-score: 0.644


In [19]:
models_results = pd.DataFrame(result)
pd.pivot_table(models_results, columns = 'method')

method,cat_boost_add_datetime_features,cat_boost_basic
Leader_board-score,0.644,0.649
Train-score,0.705,0.729
Validation-score,0.674,0.694


Вывод: Добавив признаки на основе времени точность модели немного улучшилась

### Задание 2: сделать конкатенацию признаков
* card1 + card2;
* card1 + card2 + card_3 + card_5;
* card1 + card2 + card_3 + card_5 + addr1 + addr2

Рассматривать их как категориальных признаки.

In [20]:
card1_2 = X[["card1", "card2"]].astype('str')
X["card1_2"] = card1_2["card1"] + " | " + card1_2["card2"].astype('str')
card1_2_lb = X_leader_board[["card1", "card2"]].astype('str')
X_leader_board['card1_2'] = card1_2_lb["card1"] + " | " + card1_2_lb["card2"].astype('str')


In [21]:
card1_2_3_5 = X[["card1", "card2", "card3", "card5"]].astype('str')
X["card1_2_3_5"] = card1_2_3_5["card1"] + " | " + card1_2_3_5["card2"] \
    + " | " + card1_2_3_5["card3"] + " | " + card1_2_3_5["card3"].astype('str')

card1_2_3_5_lb = X_leader_board[["card1", "card2", "card3", "card5"]].astype('str')
X_leader_board["card1_2_3_5"] = card1_2_3_5_lb["card1"] + " | " + card1_2_3_5_lb["card2"] \
    + " | " + card1_2_3_5_lb["card3"] + " | " + card1_2_3_5_lb["card3"].astype('str')


    

In [22]:
card1_2_3_5_adr1_2 = X[["card1", "card2", "card3", "card5", "addr1", "addr2"]].astype('str')
X["card1_2_3_5_adr1_2"] = card1_2_3_5_adr1_2["card1"] + " | " + card1_2_3_5_adr1_2["card2"] \
    + " | " + card1_2_3_5_adr1_2["card3"] + " | " + card1_2_3_5_adr1_2["card5"] \
    + " | " + card1_2_3_5_adr1_2["addr1"] + " | " + card1_2_3_5_adr1_2["addr2"].astype('str')

card1_2_3_5_adr1_2_lb = X_leader_board[["card1", "card2", "card3", "card5", "addr1", "addr2"]].astype('str')
X_leader_board["card1_2_3_5_adr1_2"] = card1_2_3_5_adr1_2_lb["card1"] + " | " + card1_2_3_5_adr1_2_lb["card2"] \
    + " | " + card1_2_3_5_adr1_2_lb["card3"] + " | " + card1_2_3_5_adr1_2_lb["card5"] \
    + " | " + card1_2_3_5_adr1_2_lb["addr1"] + " | " + card1_2_3_5_adr1_2_lb["addr2"].astype('str')




In [23]:
cat_features = ('card1_2','card1_2_3_5', 'card1_2_3_5_adr1_2' )

In [24]:
x_train, x_valid, y_train, y_valid = train_test_split(X, 
                                                    y, test_size=0.3, random_state=0)

In [27]:
%%time

model = cat.CatBoostClassifier(**cb_params)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)], cat_features=cat_features)

0:	test: 0.7032132	test1: 0.6845214	best: 0.6845214 (0)	total: 97.8ms	remaining: 16m 17s
200:	test: 0.9219875	test1: 0.8876085	best: 0.8876085 (200)	total: 19.5s	remaining: 15m 52s
400:	test: 0.9342014	test1: 0.8986551	best: 0.8986551 (400)	total: 38.3s	remaining: 15m 17s
600:	test: 0.9370027	test1: 0.9029452	best: 0.9029452 (600)	total: 55.8s	remaining: 14m 32s
800:	test: 0.9417926	test1: 0.9103853	best: 0.9103857 (799)	total: 1m 14s	remaining: 14m 19s
1000:	test: 0.9455595	test1: 0.9156196	best: 0.9156196 (1000)	total: 1m 34s	remaining: 14m 13s
1200:	test: 0.9485047	test1: 0.9190515	best: 0.9190515 (1200)	total: 1m 54s	remaining: 13m 59s
1400:	test: 0.9498937	test1: 0.9206993	best: 0.9206997 (1399)	total: 2m 13s	remaining: 13m 37s
1600:	test: 0.9514025	test1: 0.9224524	best: 0.9224530 (1599)	total: 2m 31s	remaining: 13m 13s
1800:	test: 0.9525374	test1: 0.9235580	best: 0.9235580 (1800)	total: 2m 48s	remaining: 12m 45s
2000:	test: 0.9527434	test1: 0.9237978	best: 0.9237979 (1999)	total

<catboost.core.CatBoostClassifier at 0x7f062759de50>

In [28]:
train_score = roc_auc_score(y_train, model.predict(x_train))
valid_score = roc_auc_score(y_valid, model.predict(x_valid))
leader_board = roc_auc_score(y_leader_board, model.predict(X_leader_board))

print(f"Train-score: {round(train_score, 3)}, Validation-score: {round(valid_score, 3)}, Leader_board-score: {round(leader_board, 3)}")

result.append({"method":"cat_boost_add_datetime_features_and_card_addr_features_" ,"Train-score" : round(train_score, 3), "Validation-score" : round(valid_score, 3),
               "Leader_board-score" : round(leader_board, 3)    })

Train-score: 0.809, Validation-score: 0.77, Leader_board-score: 0.656


In [29]:
models_results = pd.DataFrame(result)
pd.pivot_table(models_results, columns = 'method')

method,cat_boost_add_datetime_features,cat_boost_add_datetime_features_and_card_addr_features_,cat_boost_basic
Leader_board-score,0.644,0.656,0.649
Train-score,0.705,0.809,0.729
Validation-score,0.674,0.77,0.694


### Задание 3: Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.

In [30]:
features_list = ('card1', 'card2', 'card3', 'card5', 'addr1', 'addr2')

def freq_encoder(df,features_list):
    
    for item in features_list:
        freq_encoder = df[item].value_counts(normalize=True)
        df[item+"_freq_enc"] = df[item].map(freq_encoder)
    
    return df.head(2)


In [31]:
freq_encoder(X, features_list)
freq_encoder(X_leader_board, features_list)


Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,day_of_year,card1_2,card1_2_3_5,card1_2_3_5_adr1_2,card1_freq_enc,card2_freq_enc,card3_freq_enc,card5_freq_enc,addr1_freq_enc,addr2_freq_enc
0,3287000,2018-02-24 19:43:58,226.0,12473,555.0,150.0,226.0,299.0,87.0,116.0,...,24,12473 | 555.0,12473 | 555.0 | 150.0 | 150.0,12473 | 555.0 | 150.0 | 226.0 | 299.0 | 87.0,0.00032,0.072942,0.881635,0.495638,0.083024,0.994213
1,3287001,2018-02-24 19:44:14,3072.0,15651,417.0,150.0,226.0,330.0,87.0,,...,24,15651 | 417.0,15651 | 417.0 | 150.0 | 150.0,15651 | 417.0 | 150.0 | 226.0 | 330.0 | 87.0,0.00269,0.004261,0.881635,0.495638,0.04889,0.994213


In [32]:
x_train, x_valid, y_train, y_valid = train_test_split(X, 
                                                    y, test_size=0.3, random_state=0)

In [33]:
%%time

model = cat.CatBoostClassifier(**cb_params)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)], cat_features=cat_features)

0:	test: 0.7182096	test1: 0.7101369	best: 0.7101369 (0)	total: 99.4ms	remaining: 16m 33s
200:	test: 0.9218346	test1: 0.8875267	best: 0.8875267 (200)	total: 19.5s	remaining: 15m 52s
400:	test: 0.9343777	test1: 0.8979231	best: 0.8979231 (400)	total: 38.2s	remaining: 15m 13s
600:	test: 0.9380932	test1: 0.9036814	best: 0.9036814 (600)	total: 56.1s	remaining: 14m 37s
800:	test: 0.9424819	test1: 0.9107755	best: 0.9107755 (800)	total: 1m 15s	remaining: 14m 29s
1000:	test: 0.9473227	test1: 0.9171536	best: 0.9171536 (1000)	total: 1m 36s	remaining: 14m 28s
1200:	test: 0.9495799	test1: 0.9199671	best: 0.9199671 (1200)	total: 1m 56s	remaining: 14m 13s
1400:	test: 0.9510131	test1: 0.9216637	best: 0.9216637 (1400)	total: 2m 15s	remaining: 13m 49s
1600:	test: 0.9532008	test1: 0.9242086	best: 0.9242086 (1600)	total: 2m 34s	remaining: 13m 29s
1800:	test: 0.9549902	test1: 0.9258934	best: 0.9258934 (1800)	total: 2m 52s	remaining: 13m 4s
2000:	test: 0.9552713	test1: 0.9260427	best: 0.9260429 (1999)	total:

<catboost.core.CatBoostClassifier at 0x7f06275d85b0>

In [34]:
train_score = roc_auc_score(y_train, model.predict(x_train))
valid_score = roc_auc_score(y_valid, model.predict(x_valid))
leader_board = roc_auc_score(y_leader_board, model.predict(X_leader_board))

print(f"Train-score: {round(train_score, 3)}, Validation-score: {round(valid_score, 3)}, Leader_board-score: {round(leader_board, 3)}")

result.append({"method":"cat_boost_all_before_and_freq_enc" ,"Train-score" : round(train_score, 3), "Validation-score" : round(valid_score, 3),
               "Leader_board-score" : round(leader_board, 3)    })

Train-score: 0.815, Validation-score: 0.774, Leader_board-score: 0.658


In [35]:
models_results = pd.DataFrame(result)
pd.pivot_table(models_results, columns = 'method')

method,cat_boost_add_datetime_features,cat_boost_add_datetime_features_and_card_addr_features_,cat_boost_all_before_and_freq_enc,cat_boost_basic
Leader_board-score,0.644,0.656,0.658,0.649
Train-score,0.705,0.809,0.815,0.729
Validation-score,0.674,0.77,0.774,0.694


### Задание 4: Создать признаки на основе отношения: TransactionAmt к вычисленной статистике. Статистика - среднее значение / стандартное отклонение TransactionAmt, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [36]:
features_list = ['card1', 'card2', 'card3', 'card5','addr1', 'addr2', 
            'card1_2', 'card1_2_3_5', 'card1_2_3_5_adr1_2']

In [37]:
def group_by_stat(df,features_list, feature):
    
    for item in features_list:
        df[item+"_mean-"+feature] = df.groupby(item)[feature].transform('mean')
        df[item+"_std_"+feature] = df.groupby(item)[feature].transform('std')
    
    return df.head(2)

In [38]:
group_by_stat(X,features_list, "TransactionAmt")
group_by_stat(X_leader_board,features_list, "TransactionAmt")

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,addr1_mean-TransactionAmt,addr1_std_TransactionAmt,addr2_mean-TransactionAmt,addr2_std_TransactionAmt,card1_2_mean-TransactionAmt,card1_2_std_TransactionAmt,card1_2_3_5_mean-TransactionAmt,card1_2_3_5_std_TransactionAmt,card1_2_3_5_adr1_2_mean-TransactionAmt,card1_2_3_5_adr1_2_std_TransactionAmt
0,3287000,2018-02-24 19:43:58,226.0,12473,555.0,150.0,226.0,299.0,87.0,116.0,...,189.374905,291.457052,154.076553,252.968071,280.96875,62.942106,280.96875,62.942106,280.96875,62.942106
1,3287001,2018-02-24 19:44:14,3072.0,15651,417.0,150.0,226.0,330.0,87.0,,...,131.025545,216.060033,154.076553,252.968071,108.769588,244.454676,108.769588,244.454676,107.802984,252.073024


In [39]:
x_train, x_valid, y_train, y_valid = train_test_split(X, 
                                                    y, test_size=0.3, random_state=0)

In [40]:
%%time

model = cat.CatBoostClassifier(**cb_params)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)], cat_features=cat_features)

0:	test: 0.6286323	test1: 0.6279711	best: 0.6279711 (0)	total: 114ms	remaining: 19m 2s
200:	test: 0.9221372	test1: 0.8858118	best: 0.8858118 (200)	total: 19.5s	remaining: 15m 48s
400:	test: 0.9346649	test1: 0.8982905	best: 0.8982905 (400)	total: 38.9s	remaining: 15m 30s
600:	test: 0.9373489	test1: 0.9017753	best: 0.9017753 (600)	total: 56.6s	remaining: 14m 45s
800:	test: 0.9422360	test1: 0.9096473	best: 0.9096473 (800)	total: 1m 16s	remaining: 14m 40s
1000:	test: 0.9470031	test1: 0.9163898	best: 0.9163898 (1000)	total: 1m 37s	remaining: 14m 37s
1200:	test: 0.9497752	test1: 0.9196656	best: 0.9196656 (1200)	total: 1m 57s	remaining: 14m 23s
1400:	test: 0.9516785	test1: 0.9216360	best: 0.9216360 (1400)	total: 2m 16s	remaining: 13m 59s
1600:	test: 0.9537241	test1: 0.9236337	best: 0.9236337 (1600)	total: 2m 34s	remaining: 13m 32s
1800:	test: 0.9562973	test1: 0.9256943	best: 0.9256943 (1800)	total: 2m 52s	remaining: 13m 4s
2000:	test: 0.9570505	test1: 0.9261939	best: 0.9261939 (2000)	total: 3

<catboost.core.CatBoostClassifier at 0x7f05baa62cd0>

In [41]:
train_score = roc_auc_score(y_train, model.predict(x_train))
valid_score = roc_auc_score(y_valid, model.predict(x_valid))
leader_board = roc_auc_score(y_leader_board, model.predict(X_leader_board))

print(f"Train-score: {round(train_score, 3)}, Validation-score: {round(valid_score, 3)}, Leader_board-score: {round(leader_board, 3)}")

result.append({"method":"cat_boost_all_before_and_stats" ,"Train-score" : round(train_score, 3), "Validation-score" : round(valid_score, 3),
               "Leader_board-score" : round(leader_board, 3)    })

Train-score: 0.794, Validation-score: 0.754, Leader_board-score: 0.65


In [42]:
models_results = pd.DataFrame(result)
models_results

Unnamed: 0,method,Train-score,Validation-score,Leader_board-score
0,cat_boost_basic,0.729,0.694,0.649
1,cat_boost_add_datetime_features,0.705,0.674,0.644
2,cat_boost_add_datetime_features_and_card_addr_...,0.809,0.77,0.656
3,cat_boost_all_before_and_freq_enc,0.815,0.774,0.658
4,cat_boost_all_before_and_stats,0.794,0.754,0.65


### Задание 5: Создать признаки на основе отношения: D15 к вычисленной статистике. Статистика - среднее значение / стандартное отклонение D15, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [43]:
group_by_stat(X,features_list, 'D15')
group_by_stat(X_leader_board,features_list, 'D15')

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,addr1_mean-D15,addr1_std_D15,addr2_mean-D15,addr2_std_D15,card1_2_mean-D15,card1_2_std_D15,card1_2_3_5_mean-D15,card1_2_3_5_std_D15,card1_2_3_5_adr1_2_mean-D15,card1_2_3_5_adr1_2_std_D15
0,3287000,2018-02-24 19:43:58,226.0,12473,555.0,150.0,226.0,299.0,87.0,116.0,...,160.912203,203.203252,171.017543,205.442135,3.96875,1.121185,3.96875,1.121185,3.96875,1.121185
1,3287001,2018-02-24 19:44:14,3072.0,15651,417.0,150.0,226.0,330.0,87.0,,...,180.925192,209.841683,171.017543,205.442135,185.60177,205.670007,185.60177,205.670007,181.763285,208.448282


In [44]:
x_train, x_valid, y_train, y_valid = train_test_split(X, 
                                                    y, test_size=0.3, random_state=0)

In [45]:
%%time

model = cat.CatBoostClassifier(**cb_params)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)], cat_features=cat_features)

0:	test: 0.6581651	test1: 0.6483895	best: 0.6483895 (0)	total: 119ms	remaining: 19m 47s
200:	test: 0.9273465	test1: 0.8887133	best: 0.8887133 (200)	total: 21.7s	remaining: 17m 37s
400:	test: 0.9363294	test1: 0.8994561	best: 0.8994561 (400)	total: 41.3s	remaining: 16m 28s
600:	test: 0.9403101	test1: 0.9048979	best: 0.9048979 (600)	total: 1m	remaining: 15m 46s
800:	test: 0.9448871	test1: 0.9126221	best: 0.9126221 (800)	total: 1m 21s	remaining: 15m 36s
1000:	test: 0.9487788	test1: 0.9178458	best: 0.9178458 (1000)	total: 1m 42s	remaining: 15m 23s
1200:	test: 0.9510762	test1: 0.9205168	best: 0.9205168 (1200)	total: 2m 2s	remaining: 14m 59s
1400:	test: 0.9528528	test1: 0.9223585	best: 0.9223585 (1400)	total: 2m 21s	remaining: 14m 30s
1600:	test: 0.9547091	test1: 0.9241155	best: 0.9241155 (1600)	total: 2m 39s	remaining: 13m 58s
1800:	test: 0.9570749	test1: 0.9263378	best: 0.9263378 (1800)	total: 2m 58s	remaining: 13m 32s
2000:	test: 0.9588973	test1: 0.9276371	best: 0.9276371 (2000)	total: 3m 

<catboost.core.CatBoostClassifier at 0x7f05d19c21f0>

In [46]:
train_score = roc_auc_score(y_train, model.predict(x_train))
valid_score = roc_auc_score(y_valid, model.predict(x_valid))
leader_board = roc_auc_score(y_leader_board, model.predict(X_leader_board))

print(f"Train-score: {round(train_score, 3)}, Validation-score: {round(valid_score, 3)}, Leader_board-score: {round(leader_board, 3)}")

result.append({"method":"cat_boost_all_before_and_stats_d15" ,"Train-score" : round(train_score, 3), "Validation-score" : round(valid_score, 3),
               "Leader_board-score" : round(leader_board, 3)    })

Train-score: 0.816, Validation-score: 0.775, Leader_board-score: 0.653


In [47]:
models_results = pd.DataFrame(result)
models_results

Unnamed: 0,method,Train-score,Validation-score,Leader_board-score
0,cat_boost_basic,0.729,0.694,0.649
1,cat_boost_add_datetime_features,0.705,0.674,0.644
2,cat_boost_add_datetime_features_and_card_addr_...,0.809,0.77,0.656
3,cat_boost_all_before_and_freq_enc,0.815,0.774,0.658
4,cat_boost_all_before_and_stats,0.794,0.754,0.65
5,cat_boost_all_before_and_stats_d15,0.816,0.775,0.653
