In [None]:
# Mount Google Drive into the Colab runtime; the CSV files read below live
# under '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!pip install hyperopt

In [None]:
import zlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool, cv
from scipy.sparse import hstack
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [None]:
# Load the full training set from Google Drive, using the `id` column as the index.
# For a quick local experiment, load a 10k-row sample instead:
# data_frame = pd.read_csv('train.csv', nrows=10000, engine='c', index_col='id')
data_frame = pd.read_csv('/content/drive/My Drive/Colab Notebooks/train.csv', engine='c', index_col='id')

In [None]:
pd.set_option('display.max_columns', 24)

In [None]:
data_frame.shape

In [None]:
data_frame.head()

# Replace date & time with weekday & hour

In [None]:
def get_date(date):
    """Turn a ``YYMMDD`` value into a ``20YY-MM-DD`` string.

    ``date`` is passed through ``str`` first, so strings and integers are both
    accepted. Only the first six characters are looked at.
    """
    digits = str(date)
    year, month, day = '20' + digits[:2], digits[2:4], digits[4:6]
    return '-'.join((year, month, day))

In [None]:
def add_weekday_hour(data_frame):
    """Split the ``YYMMDDHH`` values of the ``hour`` column into hour and weekday, in place.

    After the call:

    * ``hour`` holds everything after the sixth character of the original
      value (the ``HH`` part) as a string;
    * ``weekday`` is a new column holding the day of week (Monday is ``'0'``)
      as a string.

    The frame is modified in place and nothing is returned. The function must
    be applied only once per frame: on a second call ``hour`` no longer
    contains the date part and the date parsing raises.
    """
    stamp = data_frame['hour'].astype(str)
    # Parse the whole column in one vectorised call with an explicit format
    # instead of building a date string row by row.
    dates = pd.to_datetime('20' + stamp.str[:6], format='%Y%m%d')
    data_frame['hour'] = stamp.str[6:]
    data_frame['weekday'] = dates.dt.dayofweek.astype(str)

In [None]:
add_weekday_hour(data_frame)

In [None]:
data_frame.head()

In [None]:
# Row count corresponding to 67% of the training frame (truncated to an int).
# NOTE(review): `parser` is not referenced by any other cell visible in this
# notebook - presumably a leftover train/validation split point; confirm before
# relying on it.
parser = int(.67 * len(data_frame))
parser

# Creating subframes

In [None]:
def build_subframe(category, k=1, frame=None):
    """Return the rows that belong to the most frequent value(s) of ``category``.

    Parameters
    ----------
    category : str
        Name of the column whose values define the groups.
    k : int, default 1
        How many of the largest groups to extract.
    frame : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``data_frame``.

    Returns
    -------
    pandas.DataFrame or list of pandas.DataFrame
        For ``k == 1`` the single largest group. For ``k > 1`` a list with the
        ``k`` largest groups ordered by ASCENDING size, i.e. the largest group
        is the last element of the list.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of distinct
        values in ``category``.
    """
    if frame is None:
        frame = data_frame
    # Group sizes, indexed by the (sorted) distinct values of the column.
    sizes = frame.groupby(category).size()
    if not 1 <= k <= len(sizes):
        raise ValueError(
            'k must be between 1 and the number of distinct values in %r (%d), got %r'
            % (category, len(sizes), k)
        )
    counts = sizes.to_numpy()
    if k == 1:
        return frame.loc[frame[category] == sizes.index[counts.argmax()]]
    # A stable full sort gives a well-defined order for every k.
    top_positions = np.argsort(counts, kind='stable')[-k:]
    return [frame.loc[frame[category] == sizes.index[pos]] for pos in top_positions]

# Subframe with the same site_id

In [None]:
site_id_subframes = build_subframe('site_id', 2)

In [None]:
# build_subframe(..., 2) returns the two groups in ascending size order
# (second-largest first, largest last), so `site_id_subframe2` is the LARGEST group.
site_id_subframe, site_id_subframe2 = site_id_subframes

In [None]:
site_id_subframe.shape

In [None]:
site_id_subframe2.shape

# Subframe with the same app_id

In [None]:
app_id_subframe = build_subframe('app_id')

In [None]:
app_id_subframe.shape

# Subframe with the same site_category

In [None]:
site_category_subframe, site_category_subframe2 = build_subframe('site_category', 2)

In [None]:
site_category_subframe.shape

# Subframe with the same app_category

In [None]:
app_category_subframe = build_subframe('app_category')

In [None]:
app_category_subframe.shape

# Subframe with the same site_domain

In [None]:
site_domain_subframe, site_domain_subframe2 = build_subframe('site_domain', 2)

In [None]:
site_domain_subframe.shape

# Subframe with the same app_domain

In [None]:
app_domain_subframe = build_subframe('app_domain')

In [None]:
app_domain_subframe.shape

# Hash categorical features

In [None]:
def hash_rem(obj):
    """Map ``obj`` to a bucket id in ``[0, 1000000)``.

    ``str`` and ``bytes`` values are hashed with CRC-32 (strings are encoded as
    UTF-8 first). Python's built-in ``hash`` is salted per interpreter process
    for these types, so it would produce different bucket ids after every
    kernel restart. Any other object still goes through built-in ``hash``.
    """
    if isinstance(obj, str):
        obj = obj.encode('utf-8')
    if isinstance(obj, bytes):
        return zlib.crc32(obj) % 1000000
    return hash(obj) % 1000000

In [None]:
def hash_strings(df, first_col='site_id', last_col='device_model', hasher=None):
    """Return a copy of ``df`` with a contiguous run of columns replaced by hashes.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    first_col, last_col : str
        First and last column (inclusive, by position) of the run to hash.
    hasher : callable, optional
        Function applied to every cell of the selected columns. Defaults to
        the notebook's ``hash_rem``.

    Returns
    -------
    pandas.DataFrame
        The untouched columns in their original order, followed by the hashed
        columns (so the hashed columns move to the end of the frame).
    """
    if hasher is None:
        hasher = hash_rem
    start = df.columns.get_loc(first_col)
    stop = df.columns.get_loc(last_col) + 1
    selected = df.columns[start:stop]
    # Column-wise Series.map is element-wise and works on every pandas release.
    hashed = df.iloc[:, start:stop].apply(lambda column: column.map(hasher))
    return pd.concat([df.drop(columns=selected), hashed], axis=1)

# Applying model

In [None]:
import hyperopt
from numpy.random import RandomState

In [None]:
# Search space explored by hyperopt in find_best_model:
#   learning_rate - sampled uniformly between 1e-3 and 5e-1
#   l2_leaf_reg   - hp.qloguniform(0, 2, 1), i.e. exp(uniform(0, 2)) rounded to
#                   a multiple of 1 (per the hyperopt parameter-expression docs)
params_space = {
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
}

def find_best_model(X, y):
    """Tune CatBoost hyperparameters with hyperopt and return an UNFITTED classifier.

    Ten TPE trials are run over ``params_space``. Each trial cross-validates a
    200-iteration Logloss model and is scored by the minimum of the mean test
    Logloss. The best ``learning_rate`` / ``l2_leaf_reg`` pair is printed and
    used to build the returned ``CatBoostClassifier``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : array-like
        Target values.

    Returns
    -------
    CatBoostClassifier
        Configured with the best parameters found; ``fit`` has not been called.
    """
    # Every column whose dtype is not `object` is handed to CatBoost as categorical.
    # NOTE(review): apply_model() later fits the returned model without
    # cat_features - confirm that this difference is intended.
    categorical_features_indices = np.where(X.dtypes != object)[0]
    # The pool does not depend on the sampled parameters, so build it once.
    pool = Pool(X, y, cat_features=categorical_features_indices)

    def hyperopt_objective(params):
        model = CatBoostClassifier(
            # l2_leaf_reg has to be forwarded here, otherwise the search over
            # it has no influence on the objective.
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            iterations=200,
            verbose=False,
            loss_function='Logloss',
            od_type='Iter',
            od_wait=10,
        )
        cv_data = cv(pool, model.get_params())
        return np.min(cv_data['test-Logloss-mean'])

    trials = hyperopt.Trials()

    best = hyperopt.fmin(
        hyperopt_objective,
        space=params_space,
        algo=hyperopt.tpe.suggest,
        max_evals=10,
        trials=trials,
    )

    print(best)
    return CatBoostClassifier(
        l2_leaf_reg=int(best['l2_leaf_reg']),
        learning_rate=best['learning_rate'],
        iterations=200,
        verbose=False,
        loss_function='Logloss',
        od_type='Iter',
        od_wait=10,
    )

In [None]:
def use_best_model(X, y):
    """Return an unfitted CatBoost classifier with fixed hyperparameters.

    The signature mirrors ``find_best_model``; ``X`` and ``y`` are not used.
    """
    fixed_params = {
        'l2_leaf_reg': 2,
        'learning_rate': 0.1,
        'iterations': 200,
        'verbose': False,
        'loss_function': 'Logloss',
        'od_type': 'Iter',
        'od_wait': 10,
    }
    return CatBoostClassifier(**fixed_params)

In [None]:
def apply_model(df):
    """Hash the id-like columns of ``df``, tune a classifier on it and fit it.

    The ``click`` column is the target and every other column is a feature.
    The number of trees of the fitted model is printed, and the fitted model
    is returned.
    """
    hashed = hash_strings(df)
    target = hashed['click']
    features = hashed.drop(columns='click')
    classifier = find_best_model(features, target)
    # NOTE(review): fitted without cat_features, unlike the cross-validation
    # pool built inside find_best_model - confirm this is intended.
    classifier.fit(features, target)
    print(classifier.tree_count_)
    return classifier

In [None]:
# Train one tuned model per subframe; the fitted models are later blended by predict().
training_subframes = [
    site_id_subframe2, app_id_subframe,
    site_category_subframe2, app_category_subframe,
    site_domain_subframe2, app_domain_subframe,
]
models = []
for subframe in training_subframes:
    models.append(apply_model(subframe))

# Ensemble

In [None]:
def logistic_function(x):
    """Logistic curve ``1 / (1 + exp(4 - 8 * x))``; it maps 0.5 to 0.5.

    Works element-wise on NumPy arrays and pandas objects as well as scalars.
    """
    exponent = 4 - 8 * x
    return 1 / (1 + np.exp(exponent))
def inversed_logistic(x, eps=1e-15):
    """Inverse of ``1 / (1 + exp(4 - 8 * z))``: return the ``z`` for probability ``x``.

    Parameters
    ----------
    x : float or array-like
        Probabilities.
    eps : float, default 1e-15
        Inputs are clipped to ``[eps, 1 - eps]`` before inversion. Without the
        clipping a probability of exactly 0 or 1 maps to -inf / +inf, and a
        single such value dominates (or turns into NaN) any average taken
        over the results.

    Returns
    -------
    Same shape as ``x``; values strictly inside ``(0, 1)`` are unaffected by
    the clipping.
    """
    x = np.clip(x, eps, 1 - eps)
    return 1 / 2 - np.log(1 / x - 1) / 8

In [None]:
def predict(models, X):
    """Blend the positive-class probabilities of several models.

    Each model's ``predict_proba(X)[:, 1]`` is mapped through
    ``inversed_logistic``, the transformed scores are averaged across models,
    and the average is mapped back with ``logistic_function``.

    Parameters
    ----------
    models : sequence
        Fitted models exposing ``predict_proba``.
    X : pandas.DataFrame
        Feature frame passed unchanged to every model.

    Returns
    -------
    pandas.Series
        One blended probability per row of ``X``, indexed 0..n-1.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError('predict() needs at least one fitted model')
    scores = [inversed_logistic(model.predict_proba(X)[:, 1]) for model in models]
    # Average across models (axis 0 of the stacked scores) in one NumPy call.
    blended = logistic_function(np.mean(scores, axis=0))
    # The Series keeps the name 'sum' that this function has always returned.
    return pd.Series(blended, name='sum')

In [None]:
# Load the test set. `id` stays a regular column here (no index_col) because
# the following cells copy it into the results frame before dropping it.
# Local alternative:
# test_frame = pd.read_csv('test.csv', engine='c')
test_frame = pd.read_csv('/content/drive/My Drive/Colab Notebooks/test.csv', engine='c')

In [None]:
# Take an independent copy of the id column: a `click` column is added to this
# frame later, which must not write into a slice of test_frame.
results_frame = test_frame[['id']].copy()

In [None]:
test_frame.drop('id', axis=1, inplace=True)

In [None]:
add_weekday_hour(test_frame)

In [None]:
test_frame = hash_strings(test_frame)

In [None]:
test_frame.shape

In [None]:
results_frame['click'] = predict(models, test_frame)

In [None]:
results_frame.head()