# Advanced Boosting Models

In [1]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
import math
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
import catboost
from datetime import date, datetime

In [15]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector(BaseEstimator, TransformerMixin):
   
    def __init__( self, feature_names ):
        self._feature_names = feature_names 
     
    def fit( self, X, y = None ):
        return self 
    
    def transform( self, X, y = None ):
        return X[ self._feature_names ] 

In [16]:
class IntTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        self.fix_rubles(X)
        self.log_(X)
        self.fill_na(X)
        self.fix_dates(X)
        return X.drop(['TP_CHANGES_NUM', 'ACT_DATE', 'BIRTHDAY', 'USER_ID'], axis=1).values
     
    def fill_na(self, X):
        col_list = ['REFILL_OCT_16', 'REFILL_NOV_16', 'OUTGOING_OCT_16', 'OUTGOING_NOV_16',
       'GPRS_OCT_16', 'GPRS_NOV_16', 'OBLIG_NUM', 'REVENUE_OCT_16', 'REVENUE_NOV_16', 'LOG_TP_CHANGES_NUM']
        
        X[col_list] = df.loc[:, col_list].fillna(0)
         
        
    def log_(self, X):
        X['LOG_TP_CHANGES_NUM'] = np.log(df.TP_CHANGES_NUM)
        X['LOG_TP_CHANGES_NUM'].replace(-math.inf, 0)
       
        
    def fix_rubles(self, X):
        X['REFILL_OCT_16'] = 0.0001 * X['REFILL_OCT_16']
        X['REFILL_NOV_16'] = 0.0001 *X['REFILL_NOV_16']
    
    def fix_dates(self, X):
        def calculate_age(born):    
            return (today - born)/np.timedelta64(1,'Y')

        def calculate_months(beg):
            return (today - beg)/np.timedelta64(1,'M')
        today = datetime(2016,11,1)
        X['AGE'] = X['BIRTHDAY'].apply(
            lambda x: calculate_age(pd.to_datetime(x)))
        X['DURATION_M'] = X['ACT_DATE'].apply(lambda x: int(calculate_months(pd.to_datetime(x))))

In [17]:
class CatTransformer(BaseEstimator, TransformerMixin):
  
    def __init__(self):
        self.preserve = []    
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        self.fill(X)
        self.add_match(X)
        return X.drop(['DEVICE_TYPE_BUS', 'ASSET_TYPE_LAST', 'START_PACK',
                       'OFFER_GROUP', 'PORTED_IN', 'PORTED_OUT'], axis=1).values

    def add_match(self, X):
        X['DEVICE_MATCH'] = np.where(X['ASSET_TYPE_LAST'] == X['DEVICE_TYPE_BUS'], 1, 0)
        
    def fill(self, X):
        X['USAGE_AREA'] = X.USAGE_AREA.fillna('Undefined')
        X['DEVICE_TYPE_BUS'] = np.where(X['DEVICE_TYPE_BUS'] == 'Smartphone', 1, 0)
        X['MLLS_STATE'] = df['MLLS_STATE'].fillna('Undefined')
        X['OBLIG_ON_START'] = X['OBLIG_ON_START'].replace(False, 0)
        X['OBLIG_ON_START'] = X['OBLIG_ON_START'].replace(True, 1)
        X['Combined'] = X[['START_PACK','OFFER_GROUP']].fillna('').sum(axis=1)
         

In [18]:
df = pd.read_csv('homework_05//train.csv',sep = ';')
pd.set_option('display.max_columns', 500)

X = df.drop(['ACTIVITY_DEC_16'], axis=1)
y = df.ACTIVITY_DEC_16
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2, random_state=42)

In [19]:
int_pipeline = Pipeline(
    steps=[
        ('int_selector', FeatureSelector(['REFILL_OCT_16', 'ACT_DATE', 'BIRTHDAY',
           'REFILL_NOV_16', 'OUTGOING_OCT_16', 'OUTGOING_NOV_16', 'GPRS_OCT_16',
           'GPRS_NOV_16', 'REVENUE_OCT_16', 'REVENUE_NOV_16', 'USER_ID', 'OBLIG_NUM', 'TP_CHANGES_NUM'])),
        ('int_transformer', IntTransformer())
    ]
)

categorial_pipeline = Pipeline(
    steps=[
        ('cat_selector', FeatureSelector(['USAGE_AREA', 'MLLS_STATE', 'ASSET_TYPE_LAST', 
                                          'DEVICE_TYPE_BUS', 'OBLIG_ON_START', 'START_PACK',
                                            'OFFER_GROUP','PORTED_IN', 'PORTED_OUT'])),
        ('cat_transformer', CatTransformer()),
        ('cat_encoder', OneHotEncoder())
    ]
)

rest_pipeline = Pipeline(
    steps=[
        ('no_proc_selector', FeatureSelector(['GENDER'])),
        ('no_proc_imputer', SimpleImputer(strategy='most_frequent')),
        ('no_proc_encoder', OneHotEncoder())
    ]
)

In [20]:
pipeline = FeatureUnion(transformer_list= [
    ('int', int_pipeline),
    ('cat', categorial_pipeline),
    ('no_proc', rest_pipeline)
])

# lightgbm

In [254]:
import lightgbm as lgb
lgb_params = [
    {
        'max_depth': [5, 7,  11],
        'learning_rate': [0.1, 0.15],
        'n_estimators': [100, 150, 200],
        
    }
]

lgbm = Pipeline(
    steps=[
        ('preprocessing', pipeline),
        ('model', GridSearchCV(lgb.LGBMClassifier(
            random_state=42,
            n_jobs=-1
        ),
        lgb_params, n_jobs=-1, refit=True, cv=3, 
        scoring='accuracy'))
    ]
)

In [255]:
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.9246833276275248


In [259]:
lgbm_cv = cross_val_score(lgbm['model'].best_estimator_, pipeline.fit_transform(X_train), y=y_train, cv=3)
lgbm_cv

array([0.92540763, 0.9302863 , 0.92064715])