In [1]:
import warnings
from tqdm import tqdm
from typing import List, Tuple

import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin

from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion




from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
warnings.simplefilter("ignore")
%matplotlib inline

from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [2]:
train = pd.read_csv('../lesson_03/data/assignment_2_train.csv')
test = pd.read_csv('../lesson_03/data/assignment_2_test.csv')

In [3]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]

In [4]:
def feature_selector(X):
    numerical_features = X.select_dtypes(include=[np.number])
    numerical_features = numerical_features.columns.tolist()
    
    cat_features = X.select_dtypes(include=[np.object])
    cat_features = cat_features.columns.tolist()
    
    dummies_features = pd.get_dummies(X[cat_features])
    X = pd.concat([X, dummies_features], axis=1)
    dummies_features = dummies_features.columns.tolist()
    X = X.drop(cat_features, axis=1)
    return X

In [5]:
X = train.drop(['isFraud'], axis=1)
y = train['isFraud']
print("data.shape = {} rows, {} cols".format(*X.shape))
X_leader_board = test.drop(['isFraud'], axis=1)
y_leader_board = test['isFraud']
print("test.shape = {} rows, {} cols".format(*test.shape))

data.shape = 180000 rows, 393 cols
test.shape = 100001 rows, 394 cols


In [6]:
X = feature_selector(X)
X_leader_board = feature_selector(X_leader_board)
drop_features = list(set(X) - set(X_leader_board))
X = X.drop(drop_features, axis=1)

In [7]:
result = []

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Columns: 526 entries, TransactionID to M9_T
dtypes: float64(376), int64(3), uint8(147)
memory usage: 545.7 MB


In [9]:
x_train, x_valid, y_train, y_valid = train_test_split(X, 
                                                    y, test_size=0.3, random_state=0)

Задание 0: выбрать любую модель машнного обучения и зафиксировать любой тип валидации. Обучить базовую модель и зафиксировать базовое качество модели. В каждом следующем задании нужно будет обучить выбранную модель и оценивать ее качество на зафиксированной схеме валидации. После каждого задания, требуется сделать вывод о достигаемом качестве модели, по сравнению с качестом из предыдущего шага.

In [10]:
cb_params = {
    "n_estimators": 10000,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "learning_rate":0.4,
    "max_bin": 20,
    "verbose": 200,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": -1,
    "random_seed": 42
}


In [11]:
%%time

model = cat.CatBoostClassifier(**cb_params)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)])

0:	test: 0.6896168	test1: 0.6928831	best: 0.6928831 (0)	total: 110ms	remaining: 18m 22s
200:	test: 0.9245902	test1: 0.9102672	best: 0.9102672 (200)	total: 8.72s	remaining: 7m 5s
400:	test: 0.9401757	test1: 0.9194672	best: 0.9195065 (398)	total: 17.3s	remaining: 6m 53s
600:	test: 0.9497255	test1: 0.9264306	best: 0.9264306 (600)	total: 25.8s	remaining: 6m 42s
800:	test: 0.9560197	test1: 0.9296019	best: 0.9296019 (800)	total: 33.9s	remaining: 6m 29s
1000:	test: 0.9624965	test1: 0.9317158	best: 0.9317174 (999)	total: 42.4s	remaining: 6m 21s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9318084919
bestIteration = 1007

Shrink model to first 1008 iterations.
CPU times: user 7min 19s, sys: 27.6 s, total: 7min 47s
Wall time: 45.7 s


<catboost.core.CatBoostClassifier at 0x7f3be2357700>

In [12]:
train_score = roc_auc_score(y_train, model.predict(x_train))
valid_score = roc_auc_score(y_valid, model.predict(x_valid))
leader_board = roc_auc_score(y_leader_board, model.predict(X_leader_board))

print(f"Train-score: {round(train_score, 3)}, Validation-score: {round(valid_score, 3)}, Leader_board-score: {round(leader_board, 3)}")

result.append({"method":"cat_boost_basic" ,"Train-score" : round(train_score, 3), "Validation-score" : round(valid_score, 3),
               "Leader_board-score" : round(leader_board, 3)    })

Train-score: 0.809, Validation-score: 0.751, Leader_board-score: 0.655


### Задание 1: признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.

In [13]:
X['TransactionDT'] = pd.to_datetime(X['TransactionDT'], unit='s', origin='2017-12-01')
X_leader_board['TransactionDT'] = pd.to_datetime(X_leader_board['TransactionDT'], unit='s', origin='2017-12-01')

In [14]:
start_date = '2017-12-01'

X["year"] = X["TransactionDT"].dt.year
X["month_of_year"] = X["TransactionDT"].dt.month
X["day_of_week"] = X["TransactionDT"].dt.weekday
X["hour_of_day"] = X["TransactionDT"].dt.hour
X["day_of_year"] = X["TransactionDT"].dt.day

X.head(n=2)

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,M7_T,M8_F,M8_T,M9_F,M9_T,year,month_of_year,day_of_week,hour_of_day,day_of_year
0,2987000,2017-12-02 00:00:00,68.5,13926,,150.0,142.0,315.0,87.0,19.0,...,0,0,0,0,0,2017,12,5,0,2
1,2987001,2017-12-02 00:00:01,29.0,2755,404.0,150.0,102.0,325.0,87.0,,...,0,0,0,0,0,2017,12,5,0,2


In [15]:
start_date = '2017-12-01'

X_leader_board["year"] = X_leader_board["TransactionDT"].dt.year
X_leader_board["month_of_year"] = X_leader_board["TransactionDT"].dt.month
X_leader_board["day_of_week"] = X_leader_board["TransactionDT"].dt.weekday
X_leader_board["hour_of_day"] = X_leader_board["TransactionDT"].dt.hour
X_leader_board["day_of_year"] = X_leader_board["TransactionDT"].dt.day

X_leader_board.head(n=2)

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,M7_T,M8_F,M8_T,M9_F,M9_T,year,month_of_year,day_of_week,hour_of_day,day_of_year
0,3287000,2018-02-24 19:43:58,226.0,12473,555.0,150.0,226.0,299.0,87.0,116.0,...,0,1,0,0,1,2018,2,5,19,24
1,3287001,2018-02-24 19:44:14,3072.0,15651,417.0,150.0,226.0,330.0,87.0,,...,0,0,0,0,0,2018,2,5,19,24


In [16]:
X = X.drop('TransactionDT', axis=1)
X_leader_board = X_leader_board.drop('TransactionDT', axis=1)

In [17]:
x_train, x_valid, y_train, y_valid = train_test_split(X, 
                                                    y, test_size=0.3, random_state=0)

In [18]:
%%time

model = cat.CatBoostClassifier(**cb_params)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)])

0:	test: 0.7434256	test1: 0.7287194	best: 0.7287194 (0)	total: 54.7ms	remaining: 9m 6s
200:	test: 0.9226335	test1: 0.9089143	best: 0.9089143 (200)	total: 8.28s	remaining: 6m 43s
400:	test: 0.9403300	test1: 0.9207312	best: 0.9207312 (400)	total: 16.6s	remaining: 6m 37s
600:	test: 0.9507563	test1: 0.9271339	best: 0.9271339 (600)	total: 24.9s	remaining: 6m 29s
800:	test: 0.9573399	test1: 0.9310478	best: 0.9311295 (789)	total: 33.1s	remaining: 6m 20s
1000:	test: 0.9619992	test1: 0.9334918	best: 0.9335198 (996)	total: 41.3s	remaining: 6m 10s
1200:	test: 0.9659827	test1: 0.9348924	best: 0.9348955 (1194)	total: 49.5s	remaining: 6m 2s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9356886527
bestIteration = 1245

Shrink model to first 1246 iterations.
CPU times: user 8min 48s, sys: 34 s, total: 9min 22s
Wall time: 54.4 s


<catboost.core.CatBoostClassifier at 0x7f3be26ab250>

In [19]:
train_score = roc_auc_score(y_train, model.predict(x_train))
valid_score = roc_auc_score(y_valid, model.predict(x_valid))
leader_board = roc_auc_score(y_leader_board, model.predict(X_leader_board))

print(f"Train-score: {round(train_score, 3)}, Validation-score: {round(valid_score, 3)}, Leader_board-score: {round(leader_board, 3)}")

result.append({"method":"cat_boost_add_datetime_features" ,"Train-score" : round(train_score, 3), "Validation-score" : round(valid_score, 3),
               "Leader_board-score" : round(leader_board, 3)    })

Train-score: 0.822, Validation-score: 0.751, Leader_board-score: 0.656


In [20]:
models_results = pd.DataFrame(result)
pd.pivot_table(models_results, columns = 'method')

method,cat_boost_add_datetime_features,cat_boost_basic
Leader_board-score,0.656,0.655
Train-score,0.822,0.809
Validation-score,0.751,0.751


Вывод: Добавив признаки на основе времени точность модели немного улучшилась

### Задание 2: сделать конкатенацию признаков
* card1 + card2;
* card1 + card2 + card_3 + card_5;
* card1 + card2 + card_3 + card_5 + addr1 + addr2

Рассматривать их как категориальных признаки.

In [21]:
card1_2 = X[["card1", "card2"]].astype('str')
X["card1_2"] = card1_2["card1"] + " | " + card1_2["card2"]
card1_2_lb = X_leader_board[["card1", "card2"]].astype('str')
X_leader_board['card1_2'] = card1_2_lb["card1"] + " | " + card1_2_lb["card2"]


In [22]:
card1_2_3_5 = X[["card1", "card2", "card3", "card5"]].astype('str')
X["card1_2_3_5"] = card1_2_3_5["card1"] + " | " + card1_2_3_5["card2"] \
    + " | " + card1_2_3_5["card3"] + " | " + card1_2_3_5["card3"]

card1_2_3_5_lb = X_leader_board[["card1", "card2", "card3", "card5"]].astype('str')
X_leader_board["card1_2_3_5"] = card1_2_3_5_lb["card1"] + " | " + card1_2_3_5_lb["card2"] \
    + " | " + card1_2_3_5_lb["card3"] + " | " + card1_2_3_5_lb["card3"]


    

In [23]:
card1_2_3_5_adr1_2 = X[["card1", "card2", "card3", "card5", "addr1", "addr2"]].astype('str')
X["card1_2_3_5_adr1_2"] = card1_2_3_5_adr1_2["card1"] + " | " + card1_2_3_5_adr1_2["card2"] \
    + " | " + card1_2_3_5_adr1_2["card3"] + " | " + card1_2_3_5_adr1_2["card5"] \
    + " | " + card1_2_3_5_adr1_2["addr1"] + " | " + card1_2_3_5_adr1_2["addr2"]

card1_2_3_5_adr1_2_lb = X_leader_board[["card1", "card2", "card3", "card5", "addr1", "addr2"]].astype('str')
X_leader_board["card1_2_3_5_adr1_2"] = card1_2_3_5_adr1_2_lb["card1"] + " | " + card1_2_3_5_adr1_2_lb["card2"] \
    + " | " + card1_2_3_5_adr1_2_lb["card3"] + " | " + card1_2_3_5_adr1_2_lb["card5"] \
    + " | " + card1_2_3_5_adr1_2_lb["addr1"] + " | " + card1_2_3_5_adr1_2_lb["addr2"]




In [24]:
cat_features = ('card1_2','card1_2_3_5', 'card1_2_3_5_adr1_2' )

In [25]:
x_train, x_valid, y_train, y_valid = train_test_split(X, 
                                                    y, test_size=0.3, random_state=0)

In [26]:
%%time

model = cat.CatBoostClassifier(**cb_params)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)], cat_features=cat_features)

0:	test: 0.6916088	test1: 0.6825916	best: 0.6825916 (0)	total: 103ms	remaining: 17m 8s
200:	test: 0.9724229	test1: 0.9421246	best: 0.9421246 (200)	total: 16.2s	remaining: 13m 9s
400:	test: 0.9775725	test1: 0.9464941	best: 0.9467362 (390)	total: 32.5s	remaining: 12m 57s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9480723159
bestIteration = 494

Shrink model to first 495 iterations.
CPU times: user 6min 34s, sys: 52.6 s, total: 7min 27s
Wall time: 45.7 s


<catboost.core.CatBoostClassifier at 0x7f3be2385df0>

In [27]:
train_score = roc_auc_score(y_train, model.predict(x_train))
valid_score = roc_auc_score(y_valid, model.predict(x_valid))
leader_board = roc_auc_score(y_leader_board, model.predict(X_leader_board))

print(f"Train-score: {round(train_score, 3)}, Validation-score: {round(valid_score, 3)}, Leader_board-score: {round(leader_board, 3)}")

result.append({"method":"cat_boost_add_datetime_features_and_card_addr_features_" ,"Train-score" : round(train_score, 3), "Validation-score" : round(valid_score, 3),
               "Leader_board-score" : round(leader_board, 3)    })

Train-score: 0.855, Validation-score: 0.8, Leader_board-score: 0.663


In [28]:
models_results = pd.DataFrame(result)
models_results

Unnamed: 0,method,Train-score,Validation-score,Leader_board-score
0,cat_boost_basic,0.809,0.751,0.655
1,cat_boost_add_datetime_features,0.822,0.751,0.656
2,cat_boost_add_datetime_features_and_card_addr_...,0.855,0.8,0.663


Вывод : добавление признаков ,
* card1 + card2;
* card1 + card2 + card_3 + card_5;
* card1 + card2 + card_3 + card_5 + addr1 + addr2
* улучшило модель 


Задание 3: Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.

In [29]:
features_list = ('card1', 'card2', 'card3', 'card5', 'addr1', 'addr2')

def freq_encoder(df,features_list):
    
    for item in features_list:
        freq_encoder = df[item].value_counts(normalize=True)
        df[item+"_freq_enc"] = df[item].map(freq_encoder)
    
    return df.head(2)


In [30]:
freq_encoder(X, features_list)
freq_encoder(X_leader_board, features_list)


Unnamed: 0,TransactionID,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,day_of_year,card1_2,card1_2_3_5,card1_2_3_5_adr1_2,card1_freq_enc,card2_freq_enc,card3_freq_enc,card5_freq_enc,addr1_freq_enc,addr2_freq_enc
0,3287000,226.0,12473,555.0,150.0,226.0,299.0,87.0,116.0,,...,24,12473 | 555.0,12473 | 555.0 | 150.0 | 150.0,12473 | 555.0 | 150.0 | 226.0 | 299.0 | 87.0,0.00032,0.072942,0.881635,0.495638,0.083024,0.994213
1,3287001,3072.0,15651,417.0,150.0,226.0,330.0,87.0,,,...,24,15651 | 417.0,15651 | 417.0 | 150.0 | 150.0,15651 | 417.0 | 150.0 | 226.0 | 330.0 | 87.0,0.00269,0.004261,0.881635,0.495638,0.04889,0.994213


In [31]:
x_train, x_valid, y_train, y_valid = train_test_split(X, 
                                                    y, test_size=0.3, random_state=0)

In [32]:
%%time

model = cat.CatBoostClassifier(**cb_params)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)], cat_features=cat_features)

0:	test: 0.6125504	test1: 0.6060996	best: 0.6060996 (0)	total: 99.5ms	remaining: 16m 34s
200:	test: 0.9695010	test1: 0.9403217	best: 0.9403217 (200)	total: 16.6s	remaining: 13m 30s
400:	test: 0.9794021	test1: 0.9492791	best: 0.9493424 (390)	total: 31.9s	remaining: 12m 43s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9505020029
bestIteration = 549

Shrink model to first 550 iterations.
CPU times: user 7min 9s, sys: 51.2 s, total: 8min
Wall time: 49.1 s


<catboost.core.CatBoostClassifier at 0x7f3be2322670>

In [33]:
train_score = roc_auc_score(y_train, model.predict(x_train))
valid_score = roc_auc_score(y_valid, model.predict(x_valid))
leader_board = roc_auc_score(y_leader_board, model.predict(X_leader_board))

print(f"Train-score: {round(train_score, 3)}, Validation-score: {round(valid_score, 3)}, Leader_board-score: {round(leader_board, 3)}")

result.append({"method":"cat_boost_all_before_and_freq_enc" ,"Train-score" : round(train_score, 3), "Validation-score" : round(valid_score, 3),
               "Leader_board-score" : round(leader_board, 3)    })

Train-score: 0.862, Validation-score: 0.804, Leader_board-score: 0.661


In [34]:
models_results = pd.DataFrame(result)
models_results

Unnamed: 0,method,Train-score,Validation-score,Leader_board-score
0,cat_boost_basic,0.809,0.751,0.655
1,cat_boost_add_datetime_features,0.822,0.751,0.656
2,cat_boost_add_datetime_features_and_card_addr_...,0.855,0.8,0.663
3,cat_boost_all_before_and_freq_enc,0.862,0.804,0.661
