In [1]:
import sklearn
import pandas as pd
import pandahouse as ph
import numpy as np

In [18]:

HOST = ''
DB = ''

# Read data from ClickHouse, or from a local CSV file when no host is configured
def readClickHouse(query = '', fileName = '1.csv', host = HOST, db = DB):
    """Load a DataFrame from ClickHouse, or from a CSV file when ``host`` is empty.

    Parameters
    ----------
    query : str
        SQL text handed to ``ph.read_clickhouse``; only used when ``host`` is non-empty.
    fileName : str
        CSV path. It is written after a ClickHouse read and read from otherwise.
    host : str
        ClickHouse host. An empty string selects the CSV branch.
    db : str
        Database name placed in the connection dict.

    Returns
    -------
    pandas.DataFrame
        The query result, or the CSV contents without the ``'Unnamed: 0'`` column.

    Raises
    ------
    IOError
        If ``host`` is empty and ``fileName`` cannot be opened. The message is printed
        first, then the original error is re-raised (previously the function fell
        through to ``return data`` and failed with ``UnboundLocalError``).
    """
    if host != '':
        data = ph.read_clickhouse(query, connection={'host': host, 'database': db})
        # Context manager guarantees the cache file is flushed and closed.
        with open(fileName, 'w') as cache:
            cache.write(data.to_csv())
        return data

    try:
        # pandas opens and closes the file itself; no separate probe handle is needed.
        data = pd.read_csv(fileName, sep = ',')
    except IOError:
        print('Не удалось открыть файл')
        raise

    # 'Unnamed: 0' is the index column that to_csv() wrote on the ClickHouse branch.
    return data.loc[:, data.columns != 'Unnamed: 0']

In [3]:
def _buildSessionQuery(offer_id, date):
    """Build the per-session aggregation query shared by the train and test sets.

    ``offer_id`` is converted with ``str()`` and ``date`` is spliced in verbatim as a
    SQL expression (e.g. ``'yesterday()'``), so only trusted values may be passed.

    NOTE(review): ``{db}`` is deliberately left as a literal placeholder; presumably
    pandahouse substitutes the connection's database name - confirm.
    """
    return '''
    select
    session_id,
    min(datetime) as start_time,
    max(datetime) as end_time,
    max(datetime) - min(datetime) as duration,
    count(*) as clicks_count,
    max(depth) as max_depth,
    countIf(order_id != '') as orders_count,
    countIf(page_type = 'basket') as basket_count,
    countIf(page_type = 'card') as card_count,
    countIf(page_type = 'category') as category_count,
    countIf(page_type = 'home') as home_count,
    countIf(page_type = 'none') as none_count,
    countIf(page_type = 'order') as order_count,
    countIf(page_type = 'other') as other_count,
    countIf(page_type = 'typ') as typ_count,
    clicks_count - sum(is_internal) as outside_count,
    sum(is_internal) as inside_count,
    anyLast(source) as last_source,
    uniq(source) as source_count
    from {db}.rtb
    where offer_id = ''' + str(offer_id) + ''' and date = ''' + date + '''
    group by session_id
'''

def getTrainQuery(offer_id = 3, date = 'yesterday()'):
    """Return the session-level feature query for the training set.

    Parameters
    ----------
    offer_id : int
        Offer to filter on.
    date : str
        SQL date expression; defaults to ``'yesterday()'``.
    """
    return _buildSessionQuery(offer_id, date)

def getTestQuery(offer_id = 3, date = 'today()'):
    """Return the session-level feature query for the test set.

    Same query as ``getTrainQuery``; only the default ``date`` (``'today()'``) differs.
    """
    return _buildSessionQuery(offer_id, date)

In [5]:
# Load the train/test session tables from CSV files (an empty host selects the file branch)
train = readClickHouse(query=getTrainQuery(offer_id=3, date='yesterday()'),
                       fileName='train_.csv',
                       host='')
test = readClickHouse(query=getTestQuery(offer_id=3, date='today()'),
                      fileName='test_.csv',
                      host='')

In [6]:
# Columns excluded from the feature matrices: timestamps, identifiers, the raw source
# string and the target itself ('typ_count').
DROP_COL = ['start_time', 'end_time', 'last_source', 'session_id', 'typ_count']

# Binarise the training target in place: any session with typ_count > 0 becomes 1.
has_typ = train['typ_count'] > 0
train.loc[has_typ, 'typ_count'] = 1

y_train = np.array(train['typ_count'])
X_train = np.array(train.drop(labels=DROP_COL, axis=1))
X_test = np.array(test.drop(labels=DROP_COL, axis=1))

In [8]:
# Summarise the training frame (dtypes and non-null counts per column).
# Note: this runs after 'typ_count' was binarised in place in the previous cell.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104657 entries, 0 to 104656
Data columns (total 19 columns):
session_id        104657 non-null object
start_time        104657 non-null object
end_time          104657 non-null object
duration          104657 non-null int64
clicks_count      104657 non-null int64
max_depth         104657 non-null int64
orders_count      104657 non-null int64
basket_count      104657 non-null int64
card_count        104657 non-null int64
category_count    104657 non-null int64
home_count        104657 non-null int64
none_count        104657 non-null int64
order_count       104657 non-null int64
other_count       104657 non-null int64
typ_count         104657 non-null int64
outside_count     104657 non-null int64
inside_count      104657 non-null int64
last_source       59728 non-null object
source_count      104657 non-null int64
dtypes: int64(15), object(4)
memory usage: 15.2+ MB


In [9]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

def cross_val_predict_proba(estimator, X_train, y_train, X_test, random_state=None, n_splits=5):
    """Produce out-of-fold and fold-averaged probabilities for stacking.

    The estimator is refitted once per shuffled KFold split. Column 1 of
    ``predict_proba`` is used throughout (assumed to be the positive class of a
    binary target).

    Parameters
    ----------
    estimator : object exposing ``fit`` and ``predict_proba``
        Refitted in place on every fold.
    X_train, y_train : array-like, indexable by integer index arrays
        Training features and labels.
    X_test : array-like
        Rows scored by each fold's model.
    random_state : int or None
        Forwarded to ``KFold``.
    n_splits : int
        Number of folds.

    Returns
    -------
    tuple of (ndarray, ndarray)
        Out-of-fold probabilities for ``X_train`` (float32, shaped like ``y_train``)
        and the mean over folds of the probabilities for ``X_test``.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    def column_one_proba(rows):
        # Second column of the probability matrix returned by the current fit.
        return estimator.predict_proba(rows)[:, 1]

    out_of_fold = np.zeros_like(y_train, dtype=np.float32)
    test_by_fold = np.zeros(shape=(len(X_test), n_splits), dtype=np.float32)

    for fold_no, (fit_rows, holdout_rows) in enumerate(splitter.split(y_train)):
        estimator.fit(X_train[fit_rows], y_train[fit_rows])
        out_of_fold[holdout_rows] = column_one_proba(X_train[holdout_rows])
        test_by_fold[:, fold_no] = column_one_proba(X_test)

    return out_of_fold, test_by_fold.mean(axis=1)

# TODO: tune the hyperparameters of the individual models

# Seed for the shuffled KFold inside cross_val_predict_proba. Without it the folds
# change on every run (and between estimators), so the stacked features are not
# reproducible even though every estimator below is seeded.
STACK_CV_SEED = 19746

# Base models (currently with default hyperparameters, each with a fixed seed)
estimators = [RandomForestClassifier(random_state=54232),
              ExtraTreesClassifier(random_state=23412),
              AdaBoostClassifier(random_state=24212),
              GradientBoostingClassifier(random_state=2732982)]

# For every base model: out-of-fold probabilities on the training set and
# fold-averaged probabilities on the test set
predicted = [cross_val_predict_proba(est, X_train, y_train, X_test,
                                     random_state=STACK_CV_SEED)
             for est in estimators]

# One column per base model: these are the meta-model's inputs
X_train_stack = np.stack([p[0] for p in predicted], axis=1)
X_test_stack = np.stack([p[1] for p in predicted], axis=1)

In [33]:
# df123 = pd.DataFrame(X_test_stack)
# open('x_test_stack.csv', 'w').write(df123.to_csv())
# df123 = pd.DataFrame(X_train_stack)
# open('x_train_stack.csv', 'w').write(df123.to_csv())

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# 4-fold shuffled CV with a fixed seed for the meta-model search
kfold = KFold(shuffle=True, n_splits=4, random_state=19746)

# Search space for the logistic-regression meta-model
params = {'class_weight': ['balanced', None],
          'penalty': ['l1', 'l2'],
          'C': [0.4, 0.5, 1., 2., 2.5, 3., 3.5, 4.]}

# The grid includes the 'l1' penalty, which the liblinear solver supports. The recorded
# run used liblinear implicitly (see the estimator repr); pin it explicitly so the
# search does not depend on the library's default solver.
grid = GridSearchCV(LogisticRegression(solver='liblinear'), params,
                    scoring='neg_log_loss', cv=kfold)


In [11]:
# Run the grid search for the meta-model on the stacked out-of-fold probabilities
grid.fit(X_train_stack, y_train)


GridSearchCV(cv=KFold(n_splits=4, random_state=19746, shuffle=True),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'class_weight': ['balanced', None], 'penalty': ['l1', 'l2'], 'C': [0.4, 0.5, 1.0, 2.0, 2.5, 3.0, 3.5, 4.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0)

In [12]:
# Three best parameter combinations, ordered by rank_test_score
cv_table = pd.DataFrame(grid.cv_results_)
cv_table.sort_values(by='rank_test_score').head(3)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_class_weight,param_penalty,params,rank_test_score,split0_test_score,...,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
26,11.361621,0.016202,-0.028467,-0.028407,3.5,,l1,"{'class_weight': None, 'penalty': 'l1', 'C': 3.5}",1,-0.028033,...,-0.028089,-0.028499,-0.029276,-0.028095,-0.02847,-0.028421,2.417879,0.008347,0.000496,0.000193
18,10.627884,0.010901,-0.028504,-0.028434,2.5,,l1,"{'class_weight': None, 'penalty': 'l1', 'C': 2.5}",2,-0.028041,...,-0.028106,-0.02855,-0.029444,-0.02819,-0.028426,-0.028379,2.940321,0.005246,0.000562,0.000166
30,8.484571,0.015911,-0.028575,-0.028502,4.0,,l1,"{'class_weight': None, 'penalty': 'l1', 'C': 4.0}",3,-0.028008,...,-0.028089,-0.028492,-0.02938,-0.028151,-0.028825,-0.028774,1.838608,0.008096,0.000563,0.000226


In [13]:
# Coefficients of the selected meta-model: one weight per column of X_train_stack,
# i.e. per base estimator, in the order of the `estimators` list
grid.best_estimator_.coef_

array([[ 1.52111135,  2.48564648, 89.76939399,  1.3127938 ]])

In [14]:
# Fit the selected meta-model on the full stacked training set.
# NOTE(review): the GridSearchCV repr shows refit=True, so best_estimator_ should
# already be fitted on all of X_train_stack; this call looks redundant - confirm
# before removing it.
grid.best_estimator_.fit(X_train_stack, y_train)

LogisticRegression(C=3.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Coefficients again, after the explicit refit above (compare with the earlier display)
grid.best_estimator_.coef_

array([[  1.41159104,   2.35318936, 103.72825921,   0.83373491]])

In [16]:
# Class labels for the test sessions from the stacked test probabilities.
# Note: this rebinds `predicted`, which previously held the per-estimator
# cross-validation outputs.
predicted = grid.best_estimator_.predict(X_test_stack)

In [17]:
test_session_id = test['session_id']
test_real_typ = test['typ_count']

# Confusion tallies, named count_<predicted>_<actual>
count_false_false = count_false_true = 0
count_true_false = count_true_true = 0

with open('submission.csv', 'w') as out:
    out.write('Session_id,Typ,RealTyp\n')
    for session, y, z in zip(test_session_id, predicted, test_real_typ):
        # y is the predicted label; z is the raw typ_count, so "actual positive" is z >= 1
        if y == 0:
            if z == 0:
                count_false_false += 1
            elif z >= 1:
                count_false_true += 1
        elif y == 1:
            if z == 0:
                count_true_false += 1
            elif z >= 1:
                count_true_true += 1
        row = (session, y, z)
        out.write('%s,%s,%s\n' % row)


# Values from the recorded run are kept next to each tally for reference
for tally in (count_false_false,   # 94104
              count_false_true,    # 868
              count_true_false,    # 178
              count_true_true):    # 337
    print(tally)


94104
868
178
337
