In [1]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as pltfrom
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance


In [2]:
dataset_train = pd.read_parquet("dataset_train_small.parquet")
target_train = pd.read_parquet("target_train_small.parquet")

In [75]:
def num_unique_values(x):
    return len(set(x))

def num_values1(x):
    if 1 in x.value_counts():
        return x.value_counts()[1]
    return 0
def num_values2(x):
    if 2 in x.value_counts():
        return x.value_counts()[2]
    return 0
def num_values3(x):
    if 3 in x.value_counts():
        return x.value_counts()[3]
    return 0
def num_values4(x):
    if 4 in x.value_counts():
        return x.value_counts()[4]
    return 0
agg_features = {
        "transaction_number": "max",
        "amnt": ["min", "max", "mean", "median"],
        "currency": num_unique_values,
        'operation_type_group': [num_values1, num_values2, num_values3, num_values4],
        'income_flag' : [num_values1, num_values2],
        'days_before': ['min', 'mean', 'max']

}

In [5]:

def get_features(dataset, target):
    features = dataset.groupby("app_id", as_index=False).agg(agg_features)
    features.columns = ['_'.join(col).strip('_') for col in features.columns.values]
    features = features.join(target.set_index("app_id"), "app_id")
    return features.drop(columns=["app_id", "flag"]), features[["app_id", "flag"]]

In [53]:

def newget_features(dataset, target):
    features = dataset.groupby("app_id", as_index=False).agg(agg_features)
    features.columns = ['_'.join(col).strip('_') for col in features.columns.values]
    features = features.join(target.set_index("app_id"), "app_id")
    return features.drop(columns=["app_id", "flag"]), features[["app_id", "flag"]]

In [88]:

def newget_featureswithouflag(dataset, target):
    features = dataset.groupby("app_id", as_index=False).agg(agg_features)
    features.columns = ['_'.join(col).strip('_') for col in features.columns.values]
    features = features.join(target.set_index("app_id"), "app_id")
    return features.drop(columns=["app_id"]), features[["app_id"]]

In [32]:

def get_features_withoutflag(dataset, target):
    features = dataset.groupby("app_id", as_index=False).agg(agg_features)
    features.columns = ['_'.join(col).strip('_') for col in features.columns.values]
    features = features.join(target.set_index("app_id"), "app_id")
    return features.drop(columns=["app_id"]), features[["app_id"]]

In [6]:
X_train, y_train = get_features(dataset_train, target_train)

In [8]:
xcb_train, xcb_test, ycb_train, ycb_test = train_test_split(X_train, y_train, test_size = 0.2, random_state=5)

In [19]:
train_dataset = cb.Pool(xcb_train, ycb_train['flag'])
test_dataset = cb.Pool(xcb_test, ycb_test)

In [11]:
model = cb.CatBoostRegressor(loss_function="RMSE")

In [20]:
grid = {'iterations': [100, 150, 200],
        'learning_rate': [0.03, 0.1],
        'depth': [5, 7, 8, 10, 12],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
model.grid_search(grid, train_dataset)

0:	learn: 0.1621167	test: 0.1640376	best: 0.1640376 (0)	total: 70.1ms	remaining: 6.94s
1:	learn: 0.1619602	test: 0.1638750	best: 0.1638750 (1)	total: 77.3ms	remaining: 3.79s
2:	learn: 0.1618153	test: 0.1637241	best: 0.1637241 (2)	total: 83.6ms	remaining: 2.7s
3:	learn: 0.1616755	test: 0.1635790	best: 0.1635790 (3)	total: 90.7ms	remaining: 2.18s
4:	learn: 0.1615435	test: 0.1634414	best: 0.1634414 (4)	total: 97.6ms	remaining: 1.85s
5:	learn: 0.1614289	test: 0.1633219	best: 0.1633219 (5)	total: 106ms	remaining: 1.66s
6:	learn: 0.1613142	test: 0.1632021	best: 0.1632021 (6)	total: 115ms	remaining: 1.53s
7:	learn: 0.1612056	test: 0.1630884	best: 0.1630884 (7)	total: 122ms	remaining: 1.4s
8:	learn: 0.1611050	test: 0.1629841	best: 0.1629841 (8)	total: 131ms	remaining: 1.33s
9:	learn: 0.1610078	test: 0.1628823	best: 0.1628823 (9)	total: 139ms	remaining: 1.25s
10:	learn: 0.1609149	test: 0.1627850	best: 0.1627850 (10)	total: 146ms	remaining: 1.18s
11:	learn: 0.1608287	test: 0.1626948	best: 0.1626

{'params': {'depth': 5,
  'l2_leaf_reg': 3,
  'iterations': 200,
  'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,

In [23]:
pred = model.predict(xcb_test)
rmse = (np.sqrt(mean_squared_error(ycb_test['flag'], pred)))
r2 = r2_score(ycb_test['flag'], pred)
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))

Testing performance
RMSE: 0.16
R2: 0.01


In [30]:
dataset_test = pd.read_parquet("dataset_test.parquet")

target_test = pd.read_parquet("target_test_contest.parquet")

In [33]:
X_test, y_test = get_features_withoutflag(dataset_test, target_test)

In [34]:
pred = model.predict(X_test)
y_test['flag'] = pred
y_test.to_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/catboost.csv", index=False)

In [27]:
newtest = target_test
newtest['flag'] = pred

ValueError: Length of values (75470) does not match length of index (188674)

In [42]:
model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'iterations': 200,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'random_seed': 0,
 'depth': 5,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'RMSE',
 'learning_rate': 0.10000000149011612,
 'score_function': 'C

In [47]:
model2 = cb.CatBoostRegressor(loss_function="RMSE")

In [48]:
grid = {'iterations': [190, 180],
        'learning_rate': [0.03, 0.1],
        'depth': [4, 5, 6,],
        'l2_leaf_reg': [0.2, 1, 3]}
model2.grid_search(grid, train_dataset)

0:	learn: 0.1621171	test: 0.1640377	best: 0.1640377 (0)	total: 9.8ms	remaining: 1.85s
1:	learn: 0.1619627	test: 0.1638782	best: 0.1638782 (1)	total: 18.8ms	remaining: 1.76s
2:	learn: 0.1618293	test: 0.1637396	best: 0.1637396 (2)	total: 25.9ms	remaining: 1.61s
3:	learn: 0.1617082	test: 0.1636150	best: 0.1636150 (3)	total: 35ms	remaining: 1.63s
4:	learn: 0.1615754	test: 0.1634765	best: 0.1634765 (4)	total: 42.2ms	remaining: 1.56s
5:	learn: 0.1614542	test: 0.1633483	best: 0.1633483 (5)	total: 50.8ms	remaining: 1.56s
6:	learn: 0.1613451	test: 0.1632350	best: 0.1632350 (6)	total: 59.3ms	remaining: 1.55s
7:	learn: 0.1612479	test: 0.1631336	best: 0.1631336 (7)	total: 66.8ms	remaining: 1.52s
8:	learn: 0.1611440	test: 0.1630249	best: 0.1630249 (8)	total: 76.2ms	remaining: 1.53s
9:	learn: 0.1610449	test: 0.1629200	best: 0.1629200 (9)	total: 85.2ms	remaining: 1.53s
10:	learn: 0.1609557	test: 0.1628272	best: 0.1628272 (10)	total: 93.9ms	remaining: 1.53s
11:	learn: 0.1608687	test: 0.1627355	best: 0

{'params': {'depth': 5,
  'l2_leaf_reg': 3,
  'iterations': 190,
  'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,

In [80]:
model2.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'iterations': 190,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'random_seed': 0,
 'depth': 5,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'RMSE',
 'learning_rate': 0.10000000149011612,
 'score_function': 'C

In [50]:
pred = model2.predict(xcb_test)
rmse = (np.sqrt(mean_squared_error(ycb_test['flag'], pred)))
r2 = r2_score(ycb_test['flag'], pred)
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))

Testing performance
RMSE: 0.16
R2: 0.01


In [51]:
pred = model2.predict(X_test)
y_test['flag'] = pred
y_test.to_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/catboost.csv", index=False)

In [76]:
X_train, y_train = newget_features(dataset_train, target_train)

In [77]:
X_train.to_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/Credit score classification/ourfeatures/x_train.csv")
y_train.to_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/Credit score classification/ourfeatures/y_train.csv")

In [78]:
xcb_train, xcb_test, ycb_train, ycb_test = train_test_split(X_train, y_train, test_size = 0.2, random_state=5)


In [79]:
train_dataset = cb.Pool(xcb_train, ycb_train['flag'])


In [99]:
model3 = cb.CatBoostRegressor(loss_function="RMSE")

In [100]:
grid = {'iterations': [190],
        'learning_rate': [0.03, 0.1],
        'depth':  [5],
        'l2_leaf_reg': [3, 4]}
model3.grid_search(grid, train_dataset)

0:	learn: 0.1621162	test: 0.1640367	best: 0.1640367 (0)	total: 15.7ms	remaining: 2.89s
1:	learn: 0.1619637	test: 0.1638782	best: 0.1638782 (1)	total: 26.2ms	remaining: 2.4s
2:	learn: 0.1618161	test: 0.1637246	best: 0.1637246 (2)	total: 35.4ms	remaining: 2.15s
3:	learn: 0.1616829	test: 0.1635879	best: 0.1635879 (3)	total: 45.2ms	remaining: 2.04s
4:	learn: 0.1615598	test: 0.1634596	best: 0.1634596 (4)	total: 54.3ms	remaining: 1.95s
5:	learn: 0.1614485	test: 0.1633443	best: 0.1633443 (5)	total: 64.8ms	remaining: 1.93s
6:	learn: 0.1613330	test: 0.1632238	best: 0.1632238 (6)	total: 73.5ms	remaining: 1.87s
7:	learn: 0.1612319	test: 0.1631182	best: 0.1631182 (7)	total: 82.1ms	remaining: 1.82s
8:	learn: 0.1611246	test: 0.1630046	best: 0.1630046 (8)	total: 90.2ms	remaining: 1.76s
9:	learn: 0.1610332	test: 0.1629099	best: 0.1629099 (9)	total: 100ms	remaining: 1.75s
10:	learn: 0.1609456	test: 0.1628190	best: 0.1628190 (10)	total: 110ms	remaining: 1.74s
11:	learn: 0.1608545	test: 0.1627218	best: 0

{'params': {'depth': 4,
  'l2_leaf_reg': 4,
  'iterations': 195,
  'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,

In [101]:
pred = model3.predict(xcb_test)
rmse = (np.sqrt(mean_squared_error(ycb_test['flag'], pred)))
r2 = r2_score(ycb_test['flag'], pred)
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))

Testing performance
RMSE: 0.16
R2: 0.02


In [89]:
X_test, y_test = get_features_withoutflag(dataset_test, target_test)

In [90]:
X_test.to_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/Credit score classification/ourfeatures/x_test.csv")
y_test.to_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/Credit score classification/ourfeatures/y_test.csv")

In [93]:
X_test

Unnamed: 0,transaction_number_max,amnt_min,amnt_max,amnt_mean,amnt_median,currency_num_unique_values,operation_type_group_num_values1,operation_type_group_num_values2,operation_type_group_num_values3,operation_type_group_num_values4,income_flag_num_values1,income_flag_num_values2,days_before_min,days_before_mean,days_before_max,product
0,16,0.271738,0.562335,0.364273,0.348838,1,11,5,0,0,10,6,28,50.375000,89,1
1,11,0.290811,0.657194,0.424697,0.356660,1,11,0,0,0,10,1,2,36.363636,43,0
2,242,0.000000,0.646036,0.387859,0.390598,1,237,5,0,0,237,5,1,39.541322,91,1
3,33,0.000000,0.560044,0.344616,0.350662,1,27,6,0,0,27,6,5,55.151515,92,1
4,23,0.196203,0.582099,0.358440,0.339223,1,23,0,0,0,23,0,11,39.695652,78,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188669,8,0.423667,0.507530,0.450073,0.433150,1,5,3,0,0,2,6,8,44.625000,71,1
188670,52,0.000000,0.652660,0.389890,0.410727,1,52,0,0,0,50,2,2,43.326923,89,0
188671,112,0.225878,0.683797,0.404899,0.387677,2,108,4,0,0,108,4,1,50.035714,92,1
188672,6,0.345373,0.510295,0.478856,0.505177,1,6,0,0,0,6,0,29,59.333333,87,0


In [102]:
pred = model3.predict(X_test)
y_test['flag'] = pred
y_test.to_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/catboost.csv", index=False)

In [96]:
model3.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'iterations': 190,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'random_seed': 0,
 'depth': 5,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'RMSE',
 'learning_rate': 0.10000000149011612,
 'score_function': 'C

In [103]:
X_test.to_

Unnamed: 0,transaction_number_max,amnt_min,amnt_max,amnt_mean,amnt_median,currency_num_unique_values,operation_type_group_num_values1,operation_type_group_num_values2,operation_type_group_num_values3,operation_type_group_num_values4,income_flag_num_values1,income_flag_num_values2,days_before_min,days_before_mean,days_before_max,product
0,16,0.271738,0.562335,0.364273,0.348838,1,11,5,0,0,10,6,28,50.375000,89,1
1,11,0.290811,0.657194,0.424697,0.356660,1,11,0,0,0,10,1,2,36.363636,43,0
2,242,0.000000,0.646036,0.387859,0.390598,1,237,5,0,0,237,5,1,39.541322,91,1
3,33,0.000000,0.560044,0.344616,0.350662,1,27,6,0,0,27,6,5,55.151515,92,1
4,23,0.196203,0.582099,0.358440,0.339223,1,23,0,0,0,23,0,11,39.695652,78,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188669,8,0.423667,0.507530,0.450073,0.433150,1,5,3,0,0,2,6,8,44.625000,71,1
188670,52,0.000000,0.652660,0.389890,0.410727,1,52,0,0,0,50,2,2,43.326923,89,0
188671,112,0.225878,0.683797,0.404899,0.387677,2,108,4,0,0,108,4,1,50.035714,92,1
188672,6,0.345373,0.510295,0.478856,0.505177,1,6,0,0,0,6,0,29,59.333333,87,0


In [97]:
my_train_x = pd.read_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/Credit score classification/ourfeatures/x_train.csv")
my_train_y = pd.read_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/Credit score classification/ourfeatures/y_train.csv")

my_contest_x = pd.read_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/Credit score classification/ourfeatures/x_test.csv")
my_contest_y = pd.read_csv("/Users/vladbax6/Codding/code_works/Python/Works/Alpha_ml/Credit score classification/ourfeatures/y_test.csv")