In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import preprocessing
from sklearn import model_selection

In [3]:
#pip install lightgbm
import lightgbm 

In [4]:
# Toy training set: one row per (user, bank) observation with a binary target `y`.
train = pd.DataFrame(dict(
    user=[1, 2, 1, 2, 2],
    banque=['Societe Generale', 'Credit Lyonnais', 'Chinese National Bank', 'Chinese National Bank', 'QIWI'],
    country=['France', 'France', 'China', 'China', 'Russia'],
    y=[0, 0, 1, 1, 1],
))

# Toy test set: same feature values as `train`; only the target of the fourth row differs.
test = pd.DataFrame(dict(
    user=[1, 2, 1, 2, 2],
    banque=['Societe Generale', 'Credit Lyonnais', 'Chinese National Bank', 'Chinese National Bank', 'QIWI'],
    country=['France', 'France', 'China', 'China', 'Russia'],
    y=[0, 0, 1, 0, 1],
))

In [5]:
# Tag every row with its origin (False = training row, True = test row) so both
# frames can be concatenated into one and split apart again on this flag.
train = train.assign(test = False)
test  = test.assign(test = True)

In [6]:
# Stack train and test rows into one frame with a fresh 0..n-1 index.
df = pd.concat([train, test], ignore_index=True)

## Features computation:

['min', 'max', 'sum', 'mean', 'std']

## Aggregate for Time series

In [7]:
def shifted_rolling_agg(df, AGGS, fillna=None):
    """Append lagged rolling aggregates computed per group.

    For every ``(on, lag, by, hows)`` entry of ``AGGS`` the column ``on`` is
    grouped by ``by``, shifted by one row inside each group (so a row never
    contributes to its own feature) and aggregated with every function of
    ``hows`` over a rolling window of ``lag`` rows (``min_periods=1``).
    Rows are processed in the order in which they appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, never modified.
    AGGS : iterable of (on, lag, by, hows)
        ``on`` is the column to aggregate, ``lag`` the rolling window size,
        ``by`` the grouping column(s) and ``hows`` the aggregation name(s).
        ``by`` and ``hows`` may each be a single string or a list of strings.
    fillna : scalar, optional
        Replacement for missing aggregates (e.g. the first row of a group).
        Missing values are kept when ``None``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one extra ``float32`` column per aggregate, named
        ``{how}_of_last_{lag}_{on}_by_`` followed by the ``by`` columns joined
        with ``_and_``.
    """
    features = df.copy()

    for on, lag, by, hows in AGGS:

        # Accept a bare column / aggregation name as well as a list of names.
        by = [by] if isinstance(by, str) else list(by)
        hows = [hows] if isinstance(hows, str) else list(hows)

        def lagged_rolling(values, lag=lag, hows=hows):
            # shift(1) excludes the current row from its own window.
            return values.shift(1).rolling(window=lag, min_periods=1).agg(hows)

        # group_keys=False keeps the original row labels as the result index.
        # Without it, pandas >= 2.0 prepends the group keys to the index and
        # the join below can no longer align the aggregates with the rows.
        rolled = features.groupby(by, group_keys=False)[on].apply(lagged_rolling)

        rolled = rolled.rename(columns={
            how: f'{how}_of_last_{lag}_{on}_by_' + '_and_'.join(by)
            for how in hows
        })

        if fillna is not None:
            rolled = rolled.fillna(fillna)

        features = features.join(rolled.astype(np.float32))

    return features

# AGGS = [
#   # on, lag, by, how
#   ('y', 2, ['banque'], ['mean']),
#    ('y', 2, ['user', 'banque'], ['mean']),
#    ('y', 2, ['user'], ['mean']),
# ]

# df = shifted_rolling_agg(df, AGGS, fillna = -1)

In [8]:
df.head()

Unnamed: 0,user,banque,country,y,test
0,1,Societe Generale,France,0,False
1,2,Credit Lyonnais,France,0,False
2,1,Chinese National Bank,China,1,False
3,2,Chinese National Bank,China,1,False
4,2,QIWI,Russia,1,False


## Aggregate:

In [9]:
def agg(df, AGGS, fillna=None):
    """Append group-level aggregates of a column to every row.

    For every ``(on, by, hows)`` entry of ``AGGS`` the column ``on`` is
    aggregated per ``by`` group with each function of ``hows`` and the result
    is merged back onto the rows (left merge on ``by``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, never modified.
    AGGS : iterable of (on, by, hows)
        ``on`` is the column to aggregate, ``by`` the grouping column(s) and
        ``hows`` the aggregation name(s). ``by`` and ``hows`` may each be a
        single string or a list of strings.
    fillna : scalar, optional
        Replacement for missing values in the new aggregate columns only.
        It is applied after the merge so that rows whose group key is missing
        (and therefore match no group) are filled as well.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one extra ``float32`` column per aggregate, named
        ``agg_{how}_{on}_by_`` followed by the ``by`` columns joined with
        ``_and_``. Because of the merge, the result carries a new default
        integer index.
    """
    features = df.copy()

    for on, by, hows in AGGS:

        # Accept a bare column / aggregation name as well as a list of names;
        # a plain string would otherwise be joined character by character.
        by = [by] if isinstance(by, str) else list(by)
        hows = [hows] if isinstance(hows, str) else list(hows)

        names = {
            how: f'agg_{how}_{on}_by_' + '_and_'.join(by)
            for how in hows
        }

        grouped = (
            features.groupby(by)[on]
            .agg(hows)
            .rename(columns=names)
            .astype(np.float32)  # only the aggregates; the keys live in the index here
            .reset_index()
        )

        features = pd.merge(left=features, right=grouped, how='left', on=by)

        if fillna is not None:
            # Restrict the fill to the columns created above so that missing
            # values in the original columns are left untouched.
            features = features.fillna({name: fillna for name in names.values()})

    return features

#AGGS = [
#    # on, lag, by, how
#    ('y', ['banque'], ['mean']),
#    ('y', ['user', 'banque'], ['mean']),
#    ('y', ['user'], ['mean']),
#    ('y', ['country'], ['mean']),
#]

# df = agg(df, AGGS, fillna=-1)

In [10]:
df.head()

Unnamed: 0,user,banque,country,y,test
0,1,Societe Generale,France,0,False
1,2,Credit Lyonnais,France,0,False
2,1,Chinese National Bank,China,1,False
3,2,Chinese National Bank,China,1,False
4,2,QIWI,Russia,1,False


## Target encoding

In [11]:
def target_encoding(df, by, on, m):
    """Smoothed target encoding of the column ``by`` with respect to ``on``.

    Each value of ``df[by]`` is replaced by a weighted average of its group
    mean of ``on`` and the global mean of ``on``:
    ``(n * group_mean + m * global_mean) / (n + m)`` where ``n`` is the group
    size. ``m`` therefore controls how strongly small groups are pulled
    towards the global mean (``m = 0`` gives the plain group mean).

    Returns a Series aligned with ``df``.
    """
    prior = df[on].mean()

    # Per-group size and mean of the target.
    stats = df.groupby(by)[on].agg(['count', 'mean'])
    n = stats['count']
    group_mean = stats['mean']

    # Shrink each group mean towards the global mean.
    shrunk = (n * group_mean + m * prior) / (n + m)

    # Look up the shrunk mean of each row's group.
    return df[by].map(shrunk)

#AGGS = [
#    ('y', ['banque'], 0)
#]

#for on, by, m in AGGS:
    
#    name = f'target_encoding_on_{on}_by_{"_".join(by)}'
    
#    df[name] = target_encoding(df=df, by='banque', on='y', m=0.5)

In [32]:
df.head()

Unnamed: 0,user,banque,country,y,test
0,1,Societe Generale,France,0,False
1,2,Credit Lyonnais,France,0,False
2,1,Chinese National Bank,China,1,False
3,2,Chinese National Bank,China,1,False
4,2,QIWI,Russia,1,False


## Step since / Step until:

In [13]:
def steps_since(df, on, fillna=None, prefix=''):

    features = pd.DataFrame(index=df.index)

    for cat in df[on].astype('category').cat.categories:
        
        counts = df[on].eq(cat).cumsum()
        
        since = counts.groupby(counts).cumcount().astype('int64')
        
        since[counts.eq(0)] = pd.NA
        
        name = f'{prefix}{on}_steps_since_{cat}'
        
        features[name] = since
        
        if fillna is not None:
            
            features[name] = features[name].fillna(fillna)

    return features

# df = df.join(steps_since(df, on='banque', fillna=-1))

In [14]:
def grouped_steps_since(df, on, by, fillna=None):
    """Apply ``steps_since`` separately inside every ``by`` group.

    The per-group results are stacked and put back in the row order of ``df``
    by sorting on the index. Generated columns are prefixed with ``agg_``,
    the ``by`` names joined with ``_``, and a trailing ``_``.

    A category that never occurs inside a group has no column in that group's
    result, so stacking leaves missing values there; these are replaced by
    ``fillna`` when it is given.
    """
    per_group = [
        steps_since(df=chunk, on=on, fillna=fillna, prefix=f'agg_{"_".join(by)}_')
        for _, chunk in df.groupby(by)
    ]

    stacked = pd.concat(per_group).sort_index()

    if fillna is None:
        return stacked

    return stacked.fillna(fillna)

# df = df.join(grouped_steps_since(df=df, on='y', by=['banque'], fillna=-1))

## Label encoding

In [15]:
# Names of the columns to label-encode in the next cell; empty means no column is encoded.
categories = []

In [16]:
# One LabelEncoder per column listed in `categories`, keyed by column name.
label_encoder = {category: preprocessing.LabelEncoder() for category in categories}

# Fit each encoder on its column and overwrite the column with the integer codes.
for category in categories:
    df[category] = label_encoder[category].fit_transform(df[category])

## Train test split:

In [17]:
# Names of the numeric columns of `df`; only these are used to build X_train / X_test.
numerical_columns = df.select_dtypes(include=np.number).columns.tolist()
numerical_columns

['user', 'y']

In [18]:
# Rows flagged test == False are the training rows, rows flagged True the test rows.
is_test = df['test'].astype(bool)

X_train = df.loc[~is_test, numerical_columns].copy()

# Select the test rows (test == True), not the training rows again.
# The index is reset so that X_test lines up row by row with the original `test`
# frame (test rows were appended after the training rows, in order), which the
# submission cell relies on when it concatenates predictions with `test`.
X_test = df.loc[is_test, numerical_columns].reset_index(drop=True)

#### Columns to drop:

In [19]:
# Names of feature columns to remove from both X_train and X_test; empty means keep everything.
columns_to_drop = []

In [20]:
# Remove the unwanted feature columns from both design matrices.
X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)

#### Extract target

In [21]:
# Separate the target from the features; the test target is simply discarded.
y_train = X_train['y']
X_train = X_train.drop(columns='y')
X_test = X_test.drop(columns='y')

In [22]:
X_train.head()

Unnamed: 0,user
0,1
1,2
2,1
3,2
4,2


In [23]:
X_test.head()

Unnamed: 0,user
0,1
1,2
2,1
3,2
4,2


#### Metric:

In [24]:
from sklearn import metrics

def custom_metric(oof, metric, by=None):
    """Score predictions stored in ``oof`` with ``metric``.

    ``oof`` must expose the true labels as ``y`` and the predictions as
    ``y_pred``. ``metric`` is called as ``metric(y, y_pred)``.

    Without ``by`` the metric is computed once over all rows. With ``by`` it
    is computed separately for every ``by`` group, the per-group scores are
    printed, and their unweighted mean is returned.
    """
    if by is None:
        return metric(oof.y, oof.y_pred)

    def score_group(group):
        return metric(group.y, group.y_pred)

    per_group = oof.groupby(by).apply(score_group)
    print(per_group)
    return per_group.mean()

#### CV with prediction:

In [25]:
# LightGBM hyper-parameters of the binary classifier.
parameters = {
    'objective': 'binary',
    'metric': ['binary_logloss'],
    'num_leaves': 10,
    'max_depth': -1,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'random_state': 42,
    'two_round': True
}

# Seed the shuffle so that the folds, and therefore the scores, are reproducible.
cv = model_selection.KFold(5, shuffle=True, random_state=parameters['random_state'])

model = lightgbm.LGBMClassifier(**parameters)

# Out-of-fold predictions, one frame per fold (concatenated after the loop).
oof = []

# One column of test predictions per fold.
submission = pd.DataFrame(index=X_test.index)

# One column of feature importances per fold.
feature_importance = pd.DataFrame(index=X_train.columns)

for i, (fit, val) in enumerate(cv.split(X_train, y_train)): # groups

    print(f'\n Fold {i+1} \n')

    X_fit = X_train.iloc[fit].copy()
    X_val = X_train.iloc[val].copy()

    y_fit = y_train.iloc[fit].copy()
    y_val = y_train.iloc[val].copy()

    # Early stopping is passed as a callback: the `early_stopping_rounds`
    # keyword of `fit` was removed in LightGBM 4, while the callback is
    # accepted by older and newer releases alike.
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        callbacks=[lightgbm.early_stopping(stopping_rounds=30)],
    )

    # From here on y_fit / y_val hold the predicted labels, not the true ones.
    y_fit = pd.DataFrame(model.predict(X_fit), index=X_fit.index, columns=['y_pred'])

    y_val = pd.DataFrame(model.predict(X_val), index=X_val.index, columns=['y_pred'])

    # `predict` returns hard class labels, so averaging the fold columns below
    # gives the share of folds that voted for class 1.
    submission[f'Fold {i}'] = model.predict(X_test)

    # Select rows of `train` by the same labels the predictions are indexed
    # with, so the column-wise concat aligns row for row.
    oof_fold = pd.concat([train.loc[X_val.index], y_val], axis='columns')

    oof.append(oof_fold)

    feature_importance[f'Fold {i}'] = model.feature_importances_

    # Score for each fold on validation and train set:
    scores_validation = custom_metric(
        oof_fold,
        metric=metrics.accuracy_score,
        #by='banque',
    )

    scores_train = custom_metric(
        pd.concat([train.loc[X_fit.index], y_fit], axis='columns'),
        metric=metrics.accuracy_score,
        #by='banque'
    )

    print(f'Scores Train: {scores_train}, score valid: {scores_validation}')

# Concatenate out of fold prediction:
oof = pd.concat(oof).sort_index()

# Overall score
#scores_validation = custom_metric(
#    oof, 
#    metric=metrics.accuracy_score, 
    #by='banque'
#)

#print(f'\n Validation score: {scores_validation} \n')


# Submission
submission['y_pred'] = submission.mean(axis='columns')
#submission['y_pred'] = submission.mode(axis='columns')

submission = pd.concat([test, submission['y_pred']], axis='columns')


 Fold 1 

[1]	valid_0's binary_logloss: 0.693147
Training until validation scores don't improve for 30 rounds
[2]	valid_0's binary_logloss: 0.693147
[3]	valid_0's binary_logloss: 0.693147
[4]	valid_0's binary_logloss: 0.693147
[5]	valid_0's binary_logloss: 0.693147
[6]	valid_0's binary_logloss: 0.693147
[7]	valid_0's binary_logloss: 0.693147
[8]	valid_0's binary_logloss: 0.693147
[9]	valid_0's binary_logloss: 0.693147
[10]	valid_0's binary_logloss: 0.693147
[11]	valid_0's binary_logloss: 0.693147
[12]	valid_0's binary_logloss: 0.693147
[13]	valid_0's binary_logloss: 0.693147
[14]	valid_0's binary_logloss: 0.693147
[15]	valid_0's binary_logloss: 0.693147
[16]	valid_0's binary_logloss: 0.693147
[17]	valid_0's binary_logloss: 0.693147
[18]	valid_0's binary_logloss: 0.693147
[19]	valid_0's binary_logloss: 0.693147
[20]	valid_0's binary_logloss: 0.693147
[21]	valid_0's binary_logloss: 0.693147
[22]	valid_0's binary_logloss: 0.693147
[23]	valid_0's binary_logloss: 0.693147
[24]	valid_0's bi

## Feature importance:

In [26]:
# Average each feature's importance over the folds and show the ten largest.
(
    pd.DataFrame(feature_importance.mean(axis='columns'))
    .sort_values(by=0, ascending=False)
    .head(n=10)
)

Unnamed: 0,0
user,0.0


In [27]:
submission.head()

Unnamed: 0,user,banque,country,y,test,y_pred
0,1,Societe Generale,France,0,True,0.4
1,2,Credit Lyonnais,France,0,True,0.4
2,1,Chinese National Bank,China,1,True,0.4
3,2,Chinese National Bank,China,0,True,0.4
4,2,QIWI,Russia,1,True,0.4


In [28]:
# Preview of the submission with the prediction column renamed.
# The result is only displayed: it is not assigned, so `submission` itself
# keeps its 'y_pred' column. Assign the result if the rename must persist.
submission.rename(columns= {'y_pred': 'prediction'})

Unnamed: 0,user,banque,country,y,test,prediction
0,1,Societe Generale,France,0,True,0.4
1,2,Credit Lyonnais,France,0,True,0.4
2,1,Chinese National Bank,China,1,True,0.4
3,2,Chinese National Bank,China,0,True,0.4
4,2,QIWI,Russia,1,True,0.4


In [29]:
# submission[['y_pred']].to_csv(f'./submissions/submission_{scores_validation}.csv')

In [30]:
### Handle date.

### Export the LightGBM model to C.

### Export columns to CSV for embedding computation.

### Compute embeddings with the Kaggle pipeline.

### Installation OSIRIM + google Drive

### Treelite

#### Download:

```
!python3 -m pip install --user treelite treelite_runtime
```

On Mac OSX:

```
!brew install libomp
```

#### Build from source:

```
git clone https://github.com/dmlc/treelite.git
cd treelite

mkdir build
cd build
cmake ..

# Setuptools is needed to build from source

pip install -U pip setuptools

# Install treelite
cd python
python3 setup.py install --user
# Install treelite_runtime
cd ../runtime/python
python3 setup.py install --user

# Set the PYTHONPATH environment variable so that Python can locate the Treelite packages

export PYTHONPATH=/path/to/treelite/python:/path/to/treelite/runtime/python
python3          # enter interactive session
```



In [31]:
def export_model_treelite(model, extension='so', toolchain='gcc'):
    """Save the model's LightGBM booster and export it as a library through treelite.

    The booster is first written to ``model.txt`` in the working directory,
    loaded back as a treelite model, and exported to ``model.<extension>``
    with the given ``toolchain``.

    ``extension`` is the library suffix: ``so`` for Unix and ``dylib`` for macOS.
    """
    import treelite

    dump_path = 'model.txt'
    model.booster_.save_model(dump_path)

    loaded = treelite.Model.load(dump_path, model_format='lightgbm')
    loaded.export_lib(
        toolchain=toolchain,
        libpath=f'model.{extension}',
        params={'parallel_comp': 32},
        verbose=True,
    )
    
def load_model_treelite(extension='so'):
    """Return a ``treelite_runtime.Predictor`` for ``model.<extension>``.

    Meant to run on the target machine, which needs ``treelite_runtime``.
    """
    import treelite_runtime

    library_path = f'model.{extension}'
    return treelite_runtime.Predictor(library_path, verbose=True)

def predict_treelite(predictor, X):
    """Predict with a treelite predictor on the rows of ``X``.

    ``X.values`` is wrapped in a ``treelite_runtime.Batch`` and passed to
    ``predictor.predict`` with ``pred_margin=False``.
    ``X`` is presumably a pandas DataFrame of features (anything exposing a
    2-D array through ``.values``) -- confirm against the caller.

    Meant to run on the target machine, which needs ``treelite_runtime``.
    """
    import treelite_runtime

    dense_batch = treelite_runtime.Batch.from_npy2d(X.values)
    predictions = predictor.predict(dense_batch, pred_margin=False)
    return predictions

#export_model_treelite(model=model, extension='so')
# predictor = load_model_treelite()
# predict_treelite(predictor, X_train)

### Temporal validation: