In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score

__Задание 0:__ _выбрать любую модель машнного обучения и зафиксировать любой тип валидации. Обучить базовую модель и зафиксировать базовое качество модели. В каждом следующем задании нужно будет обучить выбранную модель и оценивать ее качество на зафиксированной схеме валидации. После каждого задания, требуется сделать вывод о достигаемом качестве модели, по сравнению с качестом из предыдущего шага._

In [2]:
train = pd.read_csv("data/assignment_2_train.csv")
test = pd.read_csv("data/assignment_2_test.csv")

print("train.shape = {} rows, {} cols".format(*train.shape))
print("test.shape = {} rows, {} cols".format(*test.shape))

train.shape = 180000 rows, 394 cols
test.shape = 100001 rows, 394 cols


In [3]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
target = "isFraud"

categorical_features = train.select_dtypes(exclude=[np.number]).columns.tolist()
train[categorical_features] = train[categorical_features].astype('category')
test[categorical_features] = test[categorical_features].astype('category')

In [5]:
x_train, x_valid, y_train, y_valid = train_test_split(train.drop(columns=[target]),
                                                      train[target],
                                                      test_size=.3,
                                                      shuffle=True,
                                                      random_state=27)

In [6]:
x_test, y_test = test.drop(columns=target), test[target]

In [7]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.1,
    "n_jobs": -1,
    "seed": 27
}

In [8]:
def get_trained_model(x_train, y_train, x_valid, y_valid, categorical_features, params):

    dtrain = lgb.Dataset(data=x_train, label=y_train, categorical_feature=categorical_features)
    dvalid = lgb.Dataset(data=x_valid, label=y_valid, categorical_feature=categorical_features)

    model = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=1000,
        categorical_feature=categorical_features,
        valid_sets=[dtrain, dvalid],
        early_stopping_rounds=10,
        verbose_eval=20
    )
    
    return model

In [9]:
def get_predict(model, x_train, y_train, x_valid, y_valid, x_test, y_test):
    pred_train = model.predict(x_train)
    pred_valid = model.predict(x_valid)
    pred_test = model.predict(x_test)

    print(f"Train-score: {round(roc_auc_score(y_train, pred_train), 5)}\n"
          f"Valid-score: {round(roc_auc_score(y_valid, pred_valid), 5)}\n"
          f"Test-score: {round(roc_auc_score(y_test, pred_test), 5)}")
    
    return pred_train, pred_valid, pred_test

In [10]:
model = get_trained_model(x_train, y_train, x_valid, y_valid, categorical_features, params)

Training until validation scores don't improve for 10 rounds
[20]	training's auc: 0.919316	valid_1's auc: 0.897734
[40]	training's auc: 0.945101	valid_1's auc: 0.916738
[60]	training's auc: 0.962418	valid_1's auc: 0.924624
[80]	training's auc: 0.971311	valid_1's auc: 0.930257
[100]	training's auc: 0.97695	valid_1's auc: 0.934241
[120]	training's auc: 0.981615	valid_1's auc: 0.936605
[140]	training's auc: 0.985044	valid_1's auc: 0.938482
[160]	training's auc: 0.989038	valid_1's auc: 0.940047
[180]	training's auc: 0.991088	valid_1's auc: 0.941088
[200]	training's auc: 0.99241	valid_1's auc: 0.942041
[220]	training's auc: 0.993984	valid_1's auc: 0.94352
[240]	training's auc: 0.994723	valid_1's auc: 0.944105
[260]	training's auc: 0.995457	valid_1's auc: 0.944976
[280]	training's auc: 0.996146	valid_1's auc: 0.945854
[300]	training's auc: 0.996624	valid_1's auc: 0.946321
[320]	training's auc: 0.997097	valid_1's auc: 0.946737
Early stopping, best iteration is:
[323]	training's auc: 0.997149	

In [11]:
get_predict(model, x_train, y_train, x_valid, y_valid, x_test, y_test);

Train-score: 0.99715
Valid-score: 0.94678
Test-score: 0.848


__Задание 1:__ _признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день._

In [12]:
start_sec = pd.to_datetime("2017-12-01").timestamp()

for df in [x_train, x_valid, x_test]:

    df['TransactionDT'] = pd.to_datetime(df['TransactionDT'] + start_sec, unit='s')

    df['year'] = df['TransactionDT'].dt.year
    df['month'] = df['TransactionDT'].dt.month
    df['weekday'] = df['TransactionDT'].dt.weekday
    df['hour'] = df['TransactionDT'].dt.hour
    df['day'] = df['TransactionDT'].dt.day
    df.drop(columns=['TransactionDT'], inplace=True)

In [13]:
x_train.head()

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V335,V336,V337,V338,V339,year,month,weekday,hour,day
137327,3124327,117.0,W,2772,512.0,150.0,visa,226.0,debit,310.0,...,,,,,,2018,1,1,0,2
26253,3013253,50.0,H,3663,307.0,150.0,visa,226.0,debit,264.0,...,0.0,0.0,0.0,0.0,0.0,2017,12,4,18,8
178369,3165369,59.0,W,9128,555.0,150.0,visa,166.0,debit,476.0,...,,,,,,2018,1,6,23,14
93470,3080470,100.0,R,1668,399.0,150.0,american express,198.0,credit,330.0,...,0.0,0.0,0.0,0.0,0.0,2017,12,5,12,23
174901,3161901,29.0,W,8431,269.0,150.0,mastercard,224.0,debit,299.0,...,,,,,,2018,1,5,20,13


In [14]:
model = get_trained_model(x_train, y_train, x_valid, y_valid, categorical_features, params)

Training until validation scores don't improve for 10 rounds
[20]	training's auc: 0.919585	valid_1's auc: 0.897884
[40]	training's auc: 0.9456	valid_1's auc: 0.915494
[60]	training's auc: 0.961727	valid_1's auc: 0.924184
[80]	training's auc: 0.971646	valid_1's auc: 0.928262
[100]	training's auc: 0.977393	valid_1's auc: 0.931487
[120]	training's auc: 0.982079	valid_1's auc: 0.934645
[140]	training's auc: 0.985664	valid_1's auc: 0.936539
[160]	training's auc: 0.988684	valid_1's auc: 0.937814
[180]	training's auc: 0.991113	valid_1's auc: 0.93898
[200]	training's auc: 0.992276	valid_1's auc: 0.941023
[220]	training's auc: 0.993572	valid_1's auc: 0.942176
[240]	training's auc: 0.994746	valid_1's auc: 0.943583
[260]	training's auc: 0.995807	valid_1's auc: 0.94449
[280]	training's auc: 0.99645	valid_1's auc: 0.945109
[300]	training's auc: 0.99703	valid_1's auc: 0.946191
[320]	training's auc: 0.997327	valid_1's auc: 0.947002
Early stopping, best iteration is:
[326]	training's auc: 0.997748	val

In [15]:
get_predict(model, x_train, y_train, x_valid, y_valid, x_test, y_test);

Train-score: 0.99775
Valid-score: 0.94707
Test-score: 0.85115


Качество выросло на всех выборках за счет рассматривания выделенных элементов времени, как отдельных признаков.

__Задание 2:__ _сделать конкатенацию признаков_

* _card1 + card2;_
* _card1 + card2 + card_3 + card_5;_
* _card1 + card2 + card_3 + card_5 + addr1 + addr2_

_Рассматривать их как категориальных признаки._

In [16]:
add_features = ["card1|card2", "card1|card2|card3|card5", "card1|card2|card3|card5|addr1|addr2"]

for df in [x_train, x_valid, x_test]:
    
    df["card1|card2"] = df["card1"].astype('str') + \
                        df["card2"].astype('str')
    
    df["card1|card2|card3|card5"] = df["card1"].astype('str') + \
                                    df["card2"].astype('str') + \
                                    df["card3"].astype('str') + \
                                    df["card5"].astype('str')
    
    df["card1|card2|card3|card5|addr1|addr2"] = df["card1"].astype('str') + \
                                                df["card2"].astype('str') + \
                                                df["card3"].astype('str') + \
                                                df["card5"].astype('str') + \
                                                df["addr1"].astype('str') + \
                                                df["addr2"].astype('str')
    
    df[add_features] = df[add_features].astype("category")

In [17]:
for feature in add_features:
    categorical_features.append(feature)

categorical_features

['ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'card1|card2',
 'card1|card2|card3|card5',
 'card1|card2|card3|card5|addr1|addr2']

In [18]:
model = get_trained_model(x_train, y_train, x_valid, y_valid, categorical_features, params)

Training until validation scores don't improve for 10 rounds
[20]	training's auc: 0.961499	valid_1's auc: 0.919285
[40]	training's auc: 0.978065	valid_1's auc: 0.933737
[60]	training's auc: 0.985491	valid_1's auc: 0.93918
[80]	training's auc: 0.990455	valid_1's auc: 0.942965
[100]	training's auc: 0.993046	valid_1's auc: 0.945337
[120]	training's auc: 0.99476	valid_1's auc: 0.946604
[140]	training's auc: 0.996045	valid_1's auc: 0.94788
[160]	training's auc: 0.997206	valid_1's auc: 0.94839
[180]	training's auc: 0.998166	valid_1's auc: 0.948652
Early stopping, best iteration is:
[174]	training's auc: 0.997886	valid_1's auc: 0.948774


In [19]:
get_predict(model, x_train, y_train, x_valid, y_valid, x_test, y_test);

Train-score: 0.99789
Valid-score: 0.94877
Test-score: 0.8507


In [20]:
def features_nan(dfs_dict, features):
    
    for key, df in dfs_dict.items():

        print(f"{key}:")
        
        for feature in features:

            print(f"Nunique '{feature}': {df[feature].nunique()}")

            print(f"""Count NaN '{feature}': {df[feature].isna().sum()} """
                  f"""({df[feature].isna().sum()/df[feature].shape[0]:.2%})""")

        print("*" * len(feature) * 2)
        
    return True

In [21]:
dfs_dict={"x_train": x_train, "x_valid": x_valid, "x_test": x_test}
features_nan(dfs_dict, add_features);

x_train:
Nunique 'card1|card2': 8525
Count NaN 'card1|card2': 0 (0.00%)
Nunique 'card1|card2|card3|card5': 8642
Count NaN 'card1|card2|card3|card5': 0 (0.00%)
Nunique 'card1|card2|card3|card5|addr1|addr2': 21762
Count NaN 'card1|card2|card3|card5|addr1|addr2': 0 (0.00%)
**********************************************************************
x_valid:
Nunique 'card1|card2': 6127
Count NaN 'card1|card2': 0 (0.00%)
Nunique 'card1|card2|card3|card5': 6202
Count NaN 'card1|card2|card3|card5': 0 (0.00%)
Nunique 'card1|card2|card3|card5|addr1|addr2': 14115
Count NaN 'card1|card2|card3|card5|addr1|addr2': 0 (0.00%)
**********************************************************************
x_test:
Nunique 'card1|card2': 7149
Count NaN 'card1|card2': 0 (0.00%)
Nunique 'card1|card2|card3|card5': 7182
Count NaN 'card1|card2|card3|card5': 0 (0.00%)
Nunique 'card1|card2|card3|card5|addr1|addr2': 15981
Count NaN 'card1|card2|card3|card5|addr1|addr2': 0 (0.00%)
**********************************************

Модель обучилась быстрее, но ее качество упало на тестовой выборке, что связано с большим количеством уникальных категорий новых признаков и разного количества в разных выборках.

__Задание 3:__ _Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2._

In [22]:
freq_features = ["card1", "card2", "card3", "card5", "addr1", "addr2"]

for df in [x_train, x_valid, x_test]:
    
    for feature in freq_features:

        freq_encoder = df[feature].value_counts(normalize=True)
        df[f"{feature}_freq_enc"] = df[feature].map(freq_encoder)

In [23]:
x_train.head()

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,day,card1|card2,card1|card2|card3|card5,card1|card2|card3|card5|addr1|addr2,card1_freq_enc,card2_freq_enc,card3_freq_enc,card5_freq_enc,addr1_freq_enc,addr2_freq_enc
137327,3124327,117.0,W,2772,512.0,150.0,visa,226.0,debit,310.0,...,2,2772512.0,2772512.0150.0226.0,2772512.0150.0226.0310.087.0,0.003032,0.013744,0.879228,0.514204,0.015424,0.982315
26253,3013253,50.0,H,3663,307.0,150.0,visa,226.0,debit,264.0,...,8,3663307.0,3663307.0150.0226.0,3663307.0150.0226.0264.087.0,7.9e-05,0.000298,0.879228,0.514204,0.071602,0.982315
178369,3165369,59.0,W,9128,555.0,150.0,visa,166.0,debit,476.0,...,14,9128555.0,9128555.0150.0166.0,9128555.0150.0166.0476.087.0,4e-05,0.071361,0.879228,0.081429,0.017177,0.982315
93470,3080470,100.0,R,1668,399.0,150.0,american express,198.0,credit,330.0,...,23,1668399.0,1668399.0150.0198.0,1668399.0150.0198.0330.087.0,7.9e-05,0.023897,0.879228,0.002282,0.046219,0.982315
174901,3161901,29.0,W,8431,269.0,150.0,mastercard,224.0,debit,299.0,...,13,8431269.0,8431269.0150.0224.0,8431269.0150.0224.0299.087.0,0.000786,0.004187,0.879228,0.128569,0.096816,0.982315


In [24]:
model = get_trained_model(x_train, y_train, x_valid, y_valid, categorical_features, params)

Training until validation scores don't improve for 10 rounds
[20]	training's auc: 0.966155	valid_1's auc: 0.920846
[40]	training's auc: 0.983115	valid_1's auc: 0.933579
[60]	training's auc: 0.989833	valid_1's auc: 0.939242
[80]	training's auc: 0.993607	valid_1's auc: 0.942768
[100]	training's auc: 0.995718	valid_1's auc: 0.945113
[120]	training's auc: 0.996959	valid_1's auc: 0.946545
[140]	training's auc: 0.997814	valid_1's auc: 0.947437
[160]	training's auc: 0.998495	valid_1's auc: 0.948247
[180]	training's auc: 0.998984	valid_1's auc: 0.948856
[200]	training's auc: 0.999291	valid_1's auc: 0.949493
[220]	training's auc: 0.999499	valid_1's auc: 0.950183
[240]	training's auc: 0.999655	valid_1's auc: 0.950648
Early stopping, best iteration is:
[244]	training's auc: 0.999674	valid_1's auc: 0.950751


In [25]:
get_predict(model, x_train, y_train, x_valid, y_valid, x_test, y_test);

Train-score: 0.99967
Valid-score: 0.95075
Test-score: 0.83713


Качество на тренировочной и валидационной выборках выросло, но качество на тестовой выборке упало. Это вызвано тем, что есть категории новых признаков с одинаковой частотой.

__Задание 4:__ _Создать признаки на основе отношения: TransactionAmt к вычисленной статистике. Статистика - среднее значение / стандартное отклонение TransactionAmt, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2._

In [26]:
transaction_feature = [f"card{i}" for i in range(1, 7)] + [f"addr{i}" for i in range(1, 3)] + add_features
transaction_feature

['card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'card1|card2',
 'card1|card2|card3|card5',
 'card1|card2|card3|card5|addr1|addr2']

In [27]:
def add_stat_ratio(df_list, feat_name):
    
    result = []
    
    for df in df_list:
        for feat in transaction_feature:

            df = pd.merge(df, 
                          df[[feat_name, feat]]
                          .groupby(feat, sort=False)
                          .mean()
                          .rename(columns={feat_name: f"{feat_name}_{feat}_mean"}),
                          how="left", 
                          on=feat)

            df[f"{feat_name}_{feat}_mean"] = df[feat_name]/df[f"{feat_name}_{feat}_mean"]
        
        result.append(df)
    
    return result

In [28]:
x_train, x_valid, x_test = add_stat_ratio([x_train, x_valid, x_test], "TransactionAmt")

In [29]:
x_train.head()

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,TransactionAmt_card2_mean,TransactionAmt_card3_mean,TransactionAmt_card4_mean,TransactionAmt_card5_mean,TransactionAmt_card6_mean,TransactionAmt_addr1_mean,TransactionAmt_addr2_mean,TransactionAmt_card1|card2_mean,TransactionAmt_card1|card2|card3|card5_mean,TransactionAmt_card1|card2|card3|card5|addr1|addr2_mean
0,3124327,117.0,W,2772,512.0,150.0,visa,226.0,debit,310.0,...,1.050328,0.834582,0.922416,0.878007,1.060273,0.972685,0.831735,0.99396,0.99396,0.886875
1,3013253,50.0,H,3663,307.0,150.0,visa,226.0,debit,264.0,...,0.228905,0.356659,0.394195,0.375217,0.453108,0.366303,0.355442,0.112106,0.112106,0.112106
2,3165369,59.0,W,9128,555.0,150.0,visa,166.0,debit,476.0,...,0.497906,0.420858,0.46515,0.6358,0.534668,0.475892,0.419422,0.684328,0.684328,1.0
3,3080470,100.0,R,1668,399.0,150.0,american express,198.0,credit,330.0,...,0.538961,0.713318,0.54473,0.817051,0.587168,0.779473,0.710885,1.169591,1.169591,1.0
4,3161901,29.0,W,8431,269.0,150.0,mastercard,224.0,debit,299.0,...,0.261625,0.206862,0.229632,0.274236,0.262803,0.165525,0.206157,0.187235,0.187235,0.852941


In [30]:
model = get_trained_model(x_train, y_train, x_valid, y_valid, categorical_features, params)

Training until validation scores don't improve for 10 rounds
[20]	training's auc: 0.966414	valid_1's auc: 0.920948
[40]	training's auc: 0.983413	valid_1's auc: 0.932825
[60]	training's auc: 0.990759	valid_1's auc: 0.938373
[80]	training's auc: 0.994666	valid_1's auc: 0.941644
[100]	training's auc: 0.996372	valid_1's auc: 0.944177
[120]	training's auc: 0.997433	valid_1's auc: 0.945771
[140]	training's auc: 0.998213	valid_1's auc: 0.946959
[160]	training's auc: 0.998792	valid_1's auc: 0.947624
[180]	training's auc: 0.999174	valid_1's auc: 0.948317
[200]	training's auc: 0.999382	valid_1's auc: 0.948543
[220]	training's auc: 0.999644	valid_1's auc: 0.948738
[240]	training's auc: 0.999777	valid_1's auc: 0.9494
[260]	training's auc: 0.999855	valid_1's auc: 0.949578
Early stopping, best iteration is:
[254]	training's auc: 0.999841	valid_1's auc: 0.949653


In [31]:
get_predict(model, x_train, y_train, x_valid, y_valid, x_test, y_test);

Train-score: 0.99984
Valid-score: 0.94965
Test-score: 0.84215


In [32]:
x_train[["TransactionAmt"] + transaction_feature].nunique()

TransactionAmt                          7407
card1                                   8417
card2                                    499
card3                                     83
card4                                      4
card5                                     90
card6                                      4
addr1                                    242
addr2                                     49
card1|card2                             8525
card1|card2|card3|card5                 8642
card1|card2|card3|card5|addr1|addr2    21762
dtype: int64

Качество модели уменьшилось на валидационной выборке, а на тренировочной и тестовой выросло. Это вызвано тем, что новый признак сильно зависит от объема выборки и от количества уникальных значений категорий, по которым выполняется группировка. Новые признаки имеют хорошую разделяющую способность, но как показали предыдущие итерации - качество с данным признаком нестабильно.

__Задание 5:__ _Создать признаки на основе отношения: D15 к вычисленной статистике. Статистика - среднее значение / стандартное отклонение D15, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2._

In [33]:
x_train, x_valid, x_test = add_stat_ratio([x_train, x_valid, x_test], "D15")

In [34]:
x_train.head()

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,D15_card2_mean,D15_card3_mean,D15_card4_mean,D15_card5_mean,D15_card6_mean,D15_addr1_mean,D15_addr2_mean,D15_card1|card2_mean,D15_card1|card2|card3|card5_mean,D15_card1|card2|card3|card5|addr1|addr2_mean
0,3124327,117.0,W,2772,512.0,150.0,visa,226.0,debit,310.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3013253,50.0,H,3663,307.0,150.0,visa,226.0,debit,264.0,...,,,,,,,,,,
2,3165369,59.0,W,9128,555.0,150.0,visa,166.0,debit,476.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,3080470,100.0,R,1668,399.0,150.0,american express,198.0,credit,330.0,...,,,,,,,,,,
4,3161901,29.0,W,8431,269.0,150.0,mastercard,224.0,debit,299.0,...,0.019458,0.017707,0.021276,0.020977,0.018795,0.019488,0.017595,0.01921,0.01921,2.0


In [35]:
model = get_trained_model(x_train, y_train, x_valid, y_valid, categorical_features, params)

Training until validation scores don't improve for 10 rounds
[20]	training's auc: 0.965838	valid_1's auc: 0.920465
[40]	training's auc: 0.983532	valid_1's auc: 0.932448
[60]	training's auc: 0.990584	valid_1's auc: 0.938039
[80]	training's auc: 0.994423	valid_1's auc: 0.940909
[100]	training's auc: 0.996305	valid_1's auc: 0.943286
[120]	training's auc: 0.997349	valid_1's auc: 0.944495
[140]	training's auc: 0.99807	valid_1's auc: 0.945283
[160]	training's auc: 0.998627	valid_1's auc: 0.946277
[180]	training's auc: 0.999107	valid_1's auc: 0.946831
[200]	training's auc: 0.999416	valid_1's auc: 0.94745
[220]	training's auc: 0.999575	valid_1's auc: 0.94775
[240]	training's auc: 0.999719	valid_1's auc: 0.94815
Early stopping, best iteration is:
[232]	training's auc: 0.999643	valid_1's auc: 0.948257


In [36]:
get_predict(model, x_train, y_train, x_valid, y_valid, x_test, y_test);

Train-score: 0.99964
Valid-score: 0.94826
Test-score: 0.84096


In [37]:
x_train[["D15"] + transaction_feature].nunique()

D15                                      671
card1                                   8417
card2                                    499
card3                                     83
card4                                      4
card5                                     90
card6                                      4
addr1                                    242
addr2                                     49
card1|card2                             8525
card1|card2|card3|card5                 8642
card1|card2|card3|card5|addr1|addr2    21762
dtype: int64

In [38]:
dfs_dict={"x_train": x_train, "x_valid": x_valid, "x_test": x_test}

for key, df in dfs_dict.items():
    print(f"{key} - количество в D15 = nan: {df['D15'].isna().sum()} ({df['D15'].isna().sum()/df.shape[0]:.2%})")

x_train - количество в D15 = nan: 34045 (27.02%)
x_valid - количество в D15 = nan: 14774 (27.36%)
x_test - количество в D15 = nan: 9319 (9.32%)


Качество модели упало для всех выборок. Это может быть вызвано большим количеством пропусков значений признака D15 в тренировочной и валидационной выборках.

__Задание 6:__ _выделить дробную часть и целую часть признака TransactionAmt в два отдельных признака. После создать отдельных признак - логарифм от TransactionAmt_

In [39]:
for df in [x_train, x_valid, x_test]:
    df["TransactionAmtInt"] = df["TransactionAmt"].astype('int32')
    df["TransactionAmtFrac"] = df["TransactionAmt"] - df["TransactionAmtInt"]
    
    df["TransactionAmtLog"] = np.log(df["TransactionAmt"])

In [40]:
x_train.head()

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,D15_card5_mean,D15_card6_mean,D15_addr1_mean,D15_addr2_mean,D15_card1|card2_mean,D15_card1|card2|card3|card5_mean,D15_card1|card2|card3|card5|addr1|addr2_mean,TransactionAmtInt,TransactionAmtFrac,TransactionAmtLog
0,3124327,117.0,W,2772,512.0,150.0,visa,226.0,debit,310.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117,0.0,4.762174
1,3013253,50.0,H,3663,307.0,150.0,visa,226.0,debit,264.0,...,,,,,,,,50,0.0,3.912023
2,3165369,59.0,W,9128,555.0,150.0,visa,166.0,debit,476.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,59,0.0,4.077537
3,3080470,100.0,R,1668,399.0,150.0,american express,198.0,credit,330.0,...,,,,,,,,100,0.0,4.60517
4,3161901,29.0,W,8431,269.0,150.0,mastercard,224.0,debit,299.0,...,0.020977,0.018795,0.019488,0.017595,0.01921,0.01921,2.0,29,0.0,3.367296


In [41]:
model = get_trained_model(x_train, y_train, x_valid, y_valid, categorical_features, params)

Training until validation scores don't improve for 10 rounds
[20]	training's auc: 0.965838	valid_1's auc: 0.920465
[40]	training's auc: 0.98331	valid_1's auc: 0.933991
[60]	training's auc: 0.991276	valid_1's auc: 0.939328
[80]	training's auc: 0.994818	valid_1's auc: 0.941924
[100]	training's auc: 0.996815	valid_1's auc: 0.943323
[120]	training's auc: 0.997837	valid_1's auc: 0.944321
[140]	training's auc: 0.998369	valid_1's auc: 0.945903
[160]	training's auc: 0.998947	valid_1's auc: 0.946636
[180]	training's auc: 0.999327	valid_1's auc: 0.947558
[200]	training's auc: 0.999584	valid_1's auc: 0.948136
[220]	training's auc: 0.999699	valid_1's auc: 0.948805
[240]	training's auc: 0.999803	valid_1's auc: 0.949168
Early stopping, best iteration is:
[245]	training's auc: 0.999827	valid_1's auc: 0.94931


In [42]:
get_predict(model, x_train, y_train, x_valid, y_valid, x_test, y_test);

Train-score: 0.99983
Valid-score: 0.94931
Test-score: 0.83884


Качество выросло на тренировочной и валидационной выборках, но упало на тестовой. Это может говорить о более сложной связи между признаками, чем логарифм. Предыдущие итерации показали, что качество с данными признаками также неустойчиво, что говорит о меньшей важности этих признаков перед другими.

__Задание 7 (опция):__ _выполнить предварительную подготовку / очистку признаков P_emaildomain и R_emaildomain (что и как делать - остается на ваше усмотрение) и сделать Frequency Encoding для очищенных признаков._

In [43]:
dfs_dict={"x_train": x_train, "x_valid": x_valid, "x_test": x_test}
features_nan(dfs_dict, ["P_emaildomain", "R_emaildomain"]);

x_train:
Nunique 'P_emaildomain': 59
Count NaN 'P_emaildomain': 19916 (15.81%)
Nunique 'R_emaildomain': 60
Count NaN 'R_emaildomain': 83910 (66.60%)
**************************
x_valid:
Nunique 'P_emaildomain': 59
Count NaN 'P_emaildomain': 8524 (15.79%)
Nunique 'R_emaildomain': 59
Count NaN 'R_emaildomain': 35790 (66.28%)
**************************
x_test:
Nunique 'P_emaildomain': 59
Count NaN 'P_emaildomain': 15471 (15.47%)
Nunique 'R_emaildomain': 56
Count NaN 'R_emaildomain': 83405 (83.40%)
**************************


In [44]:
freq_features = ["P_emaildomain", "R_emaildomain"]

for df in [x_train, x_valid, x_test]:
    
    for feature in freq_features:

        freq_encoder = df[feature].astype("str").fillna("without_email").value_counts(normalize=True)
        df[f"{feature}_freq_enc"] = df[feature].astype("str").map(freq_encoder)

In [45]:
x_train.head()

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,D15_addr1_mean,D15_addr2_mean,D15_card1|card2_mean,D15_card1|card2|card3|card5_mean,D15_card1|card2|card3|card5|addr1|addr2_mean,TransactionAmtInt,TransactionAmtFrac,TransactionAmtLog,P_emaildomain_freq_enc,R_emaildomain_freq_enc
0,3124327,117.0,W,2772,512.0,150.0,visa,226.0,debit,310.0,...,0.0,0.0,0.0,0.0,0.0,117,0.0,4.762174,0.003619,0.665952
1,3013253,50.0,H,3663,307.0,150.0,visa,226.0,debit,264.0,...,,,,,,50,0.0,3.912023,0.161111,0.031063
2,3165369,59.0,W,9128,555.0,150.0,visa,166.0,debit,476.0,...,0.0,0.0,0.0,0.0,,59,0.0,4.077537,0.158063,0.665952
3,3080470,100.0,R,1668,399.0,150.0,american express,198.0,credit,330.0,...,,,,,,100,0.0,4.60517,0.083508,0.055921
4,3161901,29.0,W,8431,269.0,150.0,mastercard,224.0,debit,299.0,...,0.019488,0.017595,0.01921,0.01921,2.0,29,0.0,3.367296,0.373214,0.665952


In [46]:
model = get_trained_model(x_train, y_train, x_valid, y_valid, categorical_features, params)

Training until validation scores don't improve for 10 rounds
[20]	training's auc: 0.96586	valid_1's auc: 0.921272
[40]	training's auc: 0.983304	valid_1's auc: 0.933241
[60]	training's auc: 0.991109	valid_1's auc: 0.939045
[80]	training's auc: 0.994634	valid_1's auc: 0.941526
[100]	training's auc: 0.996511	valid_1's auc: 0.943565
[120]	training's auc: 0.997832	valid_1's auc: 0.945141
[140]	training's auc: 0.998495	valid_1's auc: 0.946738
[160]	training's auc: 0.998839	valid_1's auc: 0.948078
[180]	training's auc: 0.999164	valid_1's auc: 0.948592
[200]	training's auc: 0.999424	valid_1's auc: 0.948926
Early stopping, best iteration is:
[202]	training's auc: 0.999432	valid_1's auc: 0.949015


In [47]:
get_predict(model, x_train, y_train, x_valid, y_valid, x_test, y_test);

Train-score: 0.99943
Valid-score: 0.94901
Test-score: 0.84527


Качество на тренировочной и валидационной выборке упало, но на тестовой - выросло. Это говорит о том, что данные признаки имеют хорошую разделяющую способность. Переобучение модели уменьшилось. Также это говорит о важности предварительной обработки и очистки признака.

In [48]:
x_train["isFraud"] = y_train.values
x_valid["isFraud"] = y_valid.values
x_test["isFraud"] = y_test.values

In [49]:
x_train.to_csv("data/new_feature_train.csv")
x_valid.to_csv("data/new_feature_valid.csv")
x_test.to_csv("data/new_feature_test.csv")