In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv('parsed_final_data_season_3.csv.zip', compression='zip')
test_df = pd.read_csv('final_test_data_season_3.csv.zip', compression='zip')

  train_df = pd.read_csv('parsed_final_data_season_3.csv.zip', compression='zip')


In [3]:
train_df = train_df.drop('Unnamed: 0', axis=1)
test_df = test_df.drop('Unnamed: 0', axis=1)

In [4]:
target_column = 'click'
feature_columns = train_df.columns.difference([target_column, 'bid_id'])
categorical_features = ['region_id', 'city_id', 'ad_slot_visibility',
       'ad_slot_format', 'creative_id', 'advertiser_id', 'part_of_day',
       'weekday', 'weekend', 'os', 'browser',
       'is_mobile_device', 'is_male', 'is_long_term_interest', 'is_in_market',
       'is_info_cat', 'is_products_cat', 'is_service_cat',
       'is_entertainment_cat', 'is_girly_cat']

In [5]:
train_df = train_df.fillna(value=False)

In [6]:
class_counts = train_df[target_column].value_counts()
class_0_count = class_counts.get(0, 1)
class_1_count = class_counts.get(1, 1)
total_count = class_0_count + class_1_count
class_weights = [total_count / class_0_count, total_count / class_1_count]

In [7]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(train_df, test_size=0.5, random_state=42, stratify=train_df[target_column])
val_df, _ = train_test_split(val_df, test_size=0.9, random_state=42, stratify=val_df[target_column])

In [8]:
from copy import deepcopy
import random
import itertools

In [None]:
N_RUNS = 8
N_PAIR_FEATURES = 4

categorical_features = [
    'city_id', 'ad_slot_visibility', 'ad_slot_format',
    'creative_id', 'advertiser_id', 'part_of_day', 'weekday', 'weekend',
    'os', 'browser', 'is_in_market'
]
feature_columns = categorical_features + ["ad_slot_screen_share"]
for i in train_df.columns.difference([target_column] + feature_columns):
    train_df = train_df.drop(i, axis=1)
    test_df = test_df.drop(i, axis=1)
    val_df = val_df.drop(i, axis=1)
for run_idx in range(N_RUNS):
    base_cat_feats = deepcopy(categorical_features)
    
    possible_pairs = list(itertools.combinations(base_cat_feats, 2))
    if len(possible_pairs) > 0:
        pairs_for_union = random.sample(possible_pairs, min(N_PAIR_FEATURES, len(possible_pairs)))
    else:
        pairs_for_union = []
    
    new_feat_names = []
    
    for f1, f2 in pairs_for_union:
        new_name = f"{f1}__{f2}"
        new_feat_names.append(new_name)
        for df in [train_df, val_df, test_df]:
            df[new_name] = df[f1].astype(str) + "_" + df[f2].astype(str)
    
    all_feats = feature_columns + new_feat_names
    all_cat_feats = [f for f in all_feats if (f in categorical_features or f in new_feat_names)]
    
    train_pool = Pool(data=train_df[all_feats], label=train_df[target_column], cat_features=[all_feats.index(f) for f in all_cat_feats])
    val_pool = Pool(data=val_df[all_feats], label=val_df[target_column], cat_features=[all_feats.index(f) for f in all_cat_feats])
    test_pool = Pool(data=test_df[all_feats], label=test_df[target_column], cat_features=[all_feats.index(f) for f in all_cat_feats])
    
    model = CatBoostClassifier(
        iterations=100, learning_rate=0.01, depth=4, loss_function='Logloss', task_type='GPU',
        class_weights=class_weights, eval_metric='Logloss', verbose=0, l2_leaf_reg=3
    )
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=10, verbose=0)
    preds = model.predict_proba(test_pool)[:, 1]
    roc_auc = roc_auc_score(test_df[target_column], preds)
    feature_importances = model.get_feature_importance(train_pool)

    with open("mix_features_catboost_results.txt", "a") as myfile:
        myfile.write(f"\n Run {run_idx+1}/{N_RUNS}, ROC AUC: {roc_auc:.4f}, all features: {all_feats}, feature_importances: {feature_importances}\n")
    print(f"Run {run_idx+1}/{N_RUNS}, ROC AUC: {roc_auc:.4f}, Added features: {new_feat_names}, feature_importances: {feature_importances}")
    for df in [train_df, val_df, test_df]:
        for i in new_feat_names:
            df = df.drop(i, axis=1)


Run 1/8, ROC AUC: 0.6415, Added features: ['city_id__ad_slot_format', 'city_id__weekday', 'city_id__part_of_day', 'ad_slot_format__is_in_market'], feature_importances: [ 0.          8.1607835   3.24881787  6.92152143 14.0504878   2.90434152
 27.66738779  5.09073554 14.00756755  7.23690224  0.18878014  5.57299287
  2.72624631  0.          0.          2.22343542]
Run 2/8, ROC AUC: 0.6446, Added features: ['weekday__os', 'ad_slot_visibility__weekend', 'creative_id__weekend', 'ad_slot_format__creative_id'], feature_importances: [ 1.48572452  2.22262395  3.80313492  3.15702049 14.1683859   3.17084935
 24.08335537  2.47336265  8.23497403  6.66718266  1.63216639  5.73432589
  9.02872052  8.39952888  1.47689526  4.26174922]


IOStream.flush timed out


Run 3/8, ROC AUC: 0.6499, Added features: ['advertiser_id__os', 'os__browser', 'city_id__weekday', 'part_of_day__weekday'], feature_importances: [ 2.29147476  8.56915258  3.94081621  7.94490698 10.89911512  1.04621502
 23.56435168  2.95299143  9.34056695  9.27463561  1.57211849  5.05838949
  4.13051091  0.26213438  0.          9.15262038]
Run 4/8, ROC AUC: 0.6450, Added features: ['creative_id__os', 'ad_slot_visibility__creative_id', 'weekday__weekend', 'city_id__browser'], feature_importances: [ 1.54989402  6.86927241  3.84355801  4.5870013  13.8271054   2.7177935
 16.65266665  4.77781702 12.42311015  8.38089386  1.58664861  5.86255652
  1.19016189  4.50253259 11.22898807  0.        ]


[0;31mKernelOutOfMemory[0m: Kernel ran out of memory and has been restarted. If the restart fails, restart the kernel from the Kernel menu.
If the error persists, try choosing a different configuration or optimizing your code.