In [1]:
#import pandas as pd
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.model_selection import train_test_split, ParameterGrid,StratifiedKFold, KFold
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
import time

# Seed passed as random_state to the train_test_split calls in this notebook.
RANDOM_STATE = 2
# Handed to cross_val / catboost_GridSearchCV as their cat_features argument.
# The fit calls below have their cat_features argument commented out, so the
# value is currently not consumed by any model.
# NOTE(review): presumably the positional index of a categorical column — confirm.
cat_features = [1]

## CATBOOST

In [None]:
def get_data_train(path='task2.xlsb', train_size=0.8):
    """Load the labelled sheets of the workbook and split them into two parts.

    The sheets ``data1`` and ``data2`` are read in one pass over the workbook,
    outer-merged on the columns they share (no explicit key is given to
    ``pd.merge``), and every row containing a missing value is dropped before
    the split. Shapes, column names and the elapsed time are printed.

    Parameters
    ----------
    path : str, default 'task2.xlsb'
        Location of the ``.xlsb`` workbook.
    train_size : float, default 0.8
        Fraction of the rows placed in the training part.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames and ``target_result`` series, in the order produced by
        ``train_test_split``.
    """
    start = time.time()

    # Passing a list of sheet names makes read_excel parse the workbook once
    # and return a dict {sheet name: DataFrame}, instead of opening the same
    # binary file separately for each sheet.
    sheets = pd.read_excel(path, sheet_name=['data1', 'data2'], engine='pyxlsb')
    data = pd.merge(sheets['data1'], sheets['data2'], how='outer')
    print(data.shape)
    print(data.columns)

    # The outer merge leaves NaNs in rows present in only one sheet; those
    # rows are discarded here together with any other incomplete row.
    data = data.dropna()
    print(data.shape)

    columns = ['session_id', 'channel_id', 'weekday_session', 'hour_session',
               'session_price', 'flight_type', 'multifr_type', 'validating_airline_id',
               'adults_count', 'children_count', 'infants_count', 'service_class_id',
               'days_to_flight', 'departure_day', 'departure_weekday',
               'departure_hour', 'adapter_id', 'from_destination_id',
               'to_destination_id', 'meta_flight_type']

    X = data[columns]
    y = data['target_result']

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        shuffle=True,
                                                        random_state=RANDOM_STATE,
                                                        train_size=train_size
                                                        )

    print( "TIME IS " , time.time() - start)
    return X_train, X_test, y_train, y_test


    
# Build the train / hold-out split once; the search cell below reads these four names.
X_train, X_test, y_train, y_test = get_data_train()
    
        

In [None]:
def get_data_predict():
    """Read the ``test_data`` sheet of ``task2.xlsb`` and return its feature columns.

    The shape and column names of the raw sheet are printed, followed by the
    time the load took. No rows are filtered.

    Returns
    -------
    pandas.DataFrame
        The sheet restricted to the feature columns listed below.
    """
    started_at = time.time()

    sheet = pd.read_excel('task2.xlsb', sheet_name = 'test_data', engine='pyxlsb')
    print(sheet.shape)
    print(sheet.columns)

    # Feature columns, in the order they are selected from the sheet.
    feature_names = [
        'session_id', 'channel_id', 'weekday_session', 'hour_session',
        'session_price', 'flight_type', 'multifr_type', 'validating_airline_id',
        'adults_count', 'children_count', 'infants_count', 'service_class_id',
        'days_to_flight', 'departure_day', 'departure_weekday',
        'departure_hour', 'adapter_id', 'from_destination_id',
        'to_destination_id', 'meta_flight_type',
    ]
    features = sheet[feature_names]

    elapsed = time.time() - started_at
    print( "TIME IS " , elapsed)
    return features


    
# Unlabelled rows to score; consumed by the clf.predict cell at the end.
X_predict = get_data_predict()
    
        

In [None]:
def chunkIt(seq, num):
    """Split ``seq`` into ``num`` consecutive chunks of near-equal length.

    Chunk boundaries are computed with integer arithmetic
    (``k * len(seq) // num``). Accumulating a float step instead can fall just
    short of ``len(seq)`` and emit an extra chunk (8 items into 6 parts gave
    7 chunks), which breaks callers that pair chunk lists by position.

    Parameters
    ----------
    seq : sized, sliceable object (list, str, pandas Index, ...)
        The sequence to split.
    num : int
        Number of chunks; truncated with ``int()`` and must be at least 1.

    Returns
    -------
    list
        Exactly ``num`` slices of ``seq`` in order (some may be empty when
        ``len(seq) < num``), or ``[]`` when ``seq`` is empty.

    Raises
    ------
    ValueError
        If ``num`` is smaller than 1.
    """
    num = int(num)
    if num < 1:
        raise ValueError("num must be a positive integer")

    size = len(seq)
    if size == 0:
        # An empty input has always produced no chunks at all.
        return []

    return [seq[k * size // num:(k + 1) * size // num] for k in range(num)]

In [None]:
def cross_val(X, y, X_test, param, y_test, cat_features, n_splits=5):
    """Score one hyper-parameter combination over paired data chunks.

    ``X``/``y`` and ``X_test``/``y_test`` are each cut into ``n_splits``
    consecutive chunks with ``chunkIt``. For every chunk position a fresh
    ``CatBoostClassifier`` is fitted on that chunk of ``X``/``y`` with the
    matching chunk of ``X_test``/``y_test`` as ``eval_set``, and its accuracy
    on that same evaluation chunk is recorded.

    NOTE(review): the evaluation chunk both selects the best iteration
    (``use_best_model=True``) and is what accuracy is measured on, and it is
    taken from ``X_test`` rather than from folds of ``X`` — confirm that this
    scheme is what is intended.

    Parameters
    ----------
    X, y : pandas objects
        Training features and labels; accessed through ``.index`` and ``.loc``.
    X_test, y_test : pandas objects
        Data used as ``eval_set`` and for scoring; accessed the same way.
    param : dict
        Must provide ``'loss_function'``, ``'depth'`` and ``'l2_leaf_reg'``.
        ``'learning_rate'`` is forwarded when present; when absent ``None`` is
        passed and CatBoost picks its own value.
    cat_features : any
        Accepted for interface compatibility; not forwarded to ``fit``.
    n_splits : int, default 5
        Number of chunks each input is cut into.

    Returns
    -------
    float
        Mean accuracy over the chunks that were scored (0.0 if there were none).
    """
    X_train_ind = chunkIt(X.index, n_splits)
    X_test_ind = chunkIt(X_test.index, n_splits)
    y_train_ind = chunkIt(y.index, n_splits)
    y_test_ind = chunkIt(y_test.index, n_splits)

    acc = []
    # zip() pairs chunks by position and stops at the shortest list, so
    # chunk lists of different lengths cannot raise an IndexError.
    for X_tr_idx, X_va_idx, y_tr_idx, y_va_idx in zip(X_train_ind, X_test_ind,
                                                      y_train_ind, y_test_ind):
        X_train, X_valid = X.loc[X_tr_idx], X_test.loc[X_va_idx]
        y_train, y_valid = y.loc[y_tr_idx], y_test.loc[y_va_idx]

        clf = CatBoostClassifier(iterations=500,
                                 loss_function=param['loss_function'],
                                 depth=param['depth'],
                                 l2_leaf_reg=param['l2_leaf_reg'],
                                 # Honour the grid's learning rate; without this every
                                 # learning_rate candidate trains the same model.
                                 learning_rate=param.get('learning_rate'),
                                 eval_metric='Accuracy',
                                 leaf_estimation_iterations=10,
                                 use_best_model=True,
                                 logging_level='Silent'
        )

        clf.fit(X_train,
                y_train,
                # cat_features=cat_features,
                eval_set=(X_valid, y_valid)
        )

        y_pred = clf.predict(X_valid)
        acc.append(accuracy_score(y_valid, y_pred))

    # Average over the folds actually scored, not the requested n_splits.
    return sum(acc) / len(acc) if acc else 0.0
   

In [None]:
def catboost_GridSearchCV(X, y, X_test, params, y_test, cat_features, n_splits=10):
    """Exhaustively score a parameter grid with ``cross_val`` and return the best entry.

    Every combination produced by ``ParameterGrid(params)`` is scored; the
    first combination with the highest score wins. The best score and its
    parameters are printed.

    Parameters
    ----------
    X, y, X_test, y_test, cat_features
        Passed unchanged to ``cross_val``.
    params : dict or list of dict
        Grid specification accepted by ``ParameterGrid``.
    n_splits : int, default 10
        Number of chunks used by ``cross_val``. This value is now forwarded;
        a fixed 5 was passed before regardless of the argument.

    Returns
    -------
    dict
        The best-scoring combination, or ``[]`` if the grid produced no
        combinations.
    """
    best_acc = 0
    best_params = None

    for prms in tqdm(list(ParameterGrid(params)), ascii=True, desc='Params Tuning:'):

        acc = cross_val(X, y, X_test, prms, y_test, cat_features, n_splits=n_splits)

        # Accept the first candidate unconditionally so that a score of
        # exactly 0 still yields a usable parameter dict for the caller.
        if best_params is None or acc > best_acc:
            best_acc = acc
            best_params = prms

    if best_params is None:
        best_params = []

    print('Acc: '+str(best_acc))
    print('Params: '+str(best_params))

    return best_params

In [None]:
# Search space handed to catboost_GridSearchCV, which expands it with ParameterGrid.
# NOTE(review): the l2_leaf_reg candidates lie between 1e-20 and 1e-19, i.e.
# practically no L2 regularisation — confirm this range is intended.
params = dict(
    depth=[2, 3, 4, 8],
    loss_function=['Logloss', 'CrossEntropy'],
    l2_leaf_reg=np.logspace(-20, -19, 3),
    learning_rate=[0.05, 0.1],
)

# Best-scoring combination; it configures the classifier built in the next cell.
param = catboost_GridSearchCV(
    X_train, y_train, X_test, params, y_test, cat_features
)
    


In [None]:
# Refit one model configured with the parameter set returned by the grid search.
clf = CatBoostClassifier(iterations=500,
                         loss_function=param['loss_function'],
                         depth=param['depth'],
                         l2_leaf_reg=param['l2_leaf_reg'],
                         # Forward the learning rate carried by the selected parameter
                         # set; None (key absent) lets CatBoost choose its own value.
                         learning_rate=param.get('learning_rate'),
                         eval_metric='Accuracy',
                         leaf_estimation_iterations=10,
                         use_best_model=True
    )

# Split into new names instead of overwriting X_train / y_train: rebinding
# them made every re-run of this cell shrink the training data by another 20 %.
X_fit, X_valid, y_fit, y_valid = train_test_split(X_train,
                                                  y_train,
                                                  shuffle=True,
                                                  random_state=RANDOM_STATE,
                                                  train_size=0.8,
                                                  stratify=y_train
    )

# X_valid / y_valid drive use_best_model (selection of the best iteration).
clf.fit(X_fit,
        y_fit,
        #cat_features=cat_features,
        logging_level='Silent',
        eval_set=(X_valid, y_valid)
    )


In [None]:
# Score the unlabelled rows in X_predict with the fitted classifier.
result = clf.predict(X_predict)

In [None]:
# clf.predict returns a NumPy array, which has no to_excel method. Flatten
# the predictions (they may come back as an (n, 1) array) and wrap them in a
# named Series so they can be written to the workbook.
pd.Series(np.ravel(result), name='target_result').to_excel("solution.xlsx")