In [1]:
#Script for faster calculation of the Gini coefficient in Python

import inspect

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
#The function used in most kernels
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Baseline (un-normalized) Gini coefficient of ``pred`` against ``actual``.

    The observations are ordered by descending prediction (ties broken by
    original position), the cumulative share of ``actual`` is accumulated in
    that order, and the result is centred by subtracting ``(n + 1) / 2``
    before dividing by ``n``.

    Parameters
    ----------
    actual : array-like of numbers, length n
        Observed target values.
    pred : array-like of numbers, length n
        Predicted scores used only to order the observations.
    cmpcol, sortcol : int, optional
        Unused; kept so existing callers that pass them keep working.

    Returns
    -------
    float
        The un-normalized Gini value.

    Raises
    ------
    AssertionError
        If ``actual`` and ``pred`` differ in length.
    """
    assert len(actual) == len(pred)
    n = len(actual)
    # Columns: 0 = actual, 1 = prediction, 2 = original position (tie-breaker).
    # The builtin ``float`` is used because the ``np.float`` alias no longer
    # exists in current NumPy; both mean float64. The local is also no longer
    # called ``all``, which shadowed the builtin.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)
    # lexsort treats the LAST key as primary: descending prediction first,
    # then ascending original position among equal predictions.
    table = table[np.lexsort((table[:, 2], -1 * table[:, 1]))]
    totalLosses = table[:, 0].sum()
    giniSum = table[:, 0].cumsum().sum() / totalLosses

    giniSum -= (n + 1) / 2.
    return giniSum / n
 
def gini_normalized(a, p):
    """Gini of predictions ``p`` divided by the Gini of a perfect ordering.

    ``gini(a, a)`` ranks the observations by their own actual values, so it
    serves as the denominator that scales ``gini(a, p)``.
    """
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini

In [3]:
# Benchmark inputs: 100000 random 0/1 labels and 100000 uniform scores in [0, 1).
a = np.random.randint(low=0, high=2, size=100000)
p = np.random.rand(100000)
# Show a small slice of each array as a sanity check.
print(a[10:15], p[10:15])

[0 1 1 1 0] [ 0.04320391  0.13821996  0.86650132  0.14792614  0.7026727 ]


In [4]:
%%time
# Time the baseline implementation on the random labels/scores created above.
gini_normalized(a,p)

CPU times: user 28 ms, sys: 8 ms, total: 36 ms
Wall time: 34.7 ms


0.0020964644437498465

In [5]:
#Remove redundant calls
def ginic(actual, pred):
    """Fast un-normalized Gini coefficient using a single sort.

    Observations are ordered by ascending prediction, so the value returned
    is the negative of the baseline ``gini(actual, pred)``; the sign cancels
    when two ``ginic`` values are divided, as ``gini_normalizedc`` does.

    Ties in ``pred`` are ordered as the exact mirror image of the baseline
    (which sorts by descending prediction, then ascending position), so both
    implementations agree even when predictions repeat.

    Parameters
    ----------
    actual : array-like of numbers, length n
        Observed target values (list, Series or ndarray).
    pred : array-like of numbers, length n
        Predicted scores used only to order the observations.

    Returns
    -------
    float
        The un-normalized Gini value (sign-flipped relative to ``gini``).

    Raises
    ------
    ValueError
        If ``actual`` and ``pred`` differ in length.
    """
    actual = np.asarray(actual) #In case, someone passes Series or list
    pred = np.asarray(pred)
    n = len(actual)
    if len(pred) != n:
        raise ValueError("actual and pred must have the same length")
    # Stable-sorting the REVERSED predictions gives ascending prediction with
    # ties in descending original position, which is the baseline's ordering
    # read backwards. The default (unstable) argsort left tie order arbitrary.
    order = np.argsort(pred[::-1], kind='stable')
    a_s = actual[::-1][order]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_s.sum() - (n + 1) / 2.0
    return giniSum / n
 
def gini_normalizedc(a, p):
    """Normalized Gini computed with the fast ``ginic`` routine.

    Parameters
    ----------
    a : array-like of numbers
        Observed target values.
    p : array-like of numbers
        Predicted scores. May be 1-D, or 2-D as produced by a classifier's
        probability output; in the 2-D case column 1 is used (or the only
        column when there is just one).

    Returns
    -------
    float
        ``ginic(a, p) / ginic(a, a)``.
    """
    # Coerce first: plain lists have no ``ndim`` and DataFrames do not
    # support ``[:, 1]`` slicing, so both used to fail here.
    p = np.asarray(p)
    if p.ndim == 2:#Required for sklearn wrapper
        #If proba array contains proba for both 0 and 1 classes, just pick class 1
        p = p[:, 1] if p.shape[1] > 1 else p[:, 0]
    return ginic(a, p) / ginic(a, a)

In [6]:
%%time
# Time the streamlined implementation on the same inputs for comparison.
gini_normalizedc(a,p)

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 13.8 ms


0.0020964644437498465

### Wrappers for different algorithms

In [7]:
#XGBoost
from sklearn import metrics
def gini_xgb(preds, dtrain):
    """Evaluation callback in the shape used with XGBoost.

    Reads the true labels from ``dtrain`` via ``get_label()`` and scores
    ``preds`` against them with ``gini_normalizedc``.

    Returns a one-element list holding the ``('gini', score)`` pair.
    """
    score = gini_normalizedc(dtrain.get_label(), preds)
    return [('gini', score)]

#LightGBM
def gini_lgb(actuals, preds):
    """Evaluation callback in the shape used with LightGBM.

    Returns the tuple ``('gini', score, True)`` where ``score`` is
    ``gini_normalizedc(actuals, preds)``.
    NOTE(review): the trailing ``True`` is presumably LightGBM's
    "higher is better" flag -- confirm against the LightGBM docs.
    """
    metric_name = 'gini'
    score = gini_normalizedc(actuals, preds)
    return metric_name, score, True

#SKlearn
# Scorer that feeds predicted probabilities to gini_normalizedc (higher is better).
# make_scorer's options must be passed by keyword on current scikit-learn, and
# ``needs_proba`` was superseded by ``response_method`` (added in 1.4), so use
# whichever spelling the installed version understands.
if 'response_method' in inspect.signature(metrics.make_scorer).parameters:
    gini_sklearn = metrics.make_scorer(gini_normalizedc, greater_is_better=True,
                                       response_method='predict_proba')
else:
    gini_sklearn = metrics.make_scorer(gini_normalizedc, greater_is_better=True,
                                       needs_proba=True)

### Cheers!

### Update:  sklearn example

In [8]:
# Load the training table; every column except 'id' and 'target' is a feature.
train = pd.read_csv("../input/train.csv")
feats = [col for col in train.columns if col != 'id' and col != 'target']

# Feature matrix and target vector used by the cross-validation example.
X = train.loc[:, feats]
y = train.loc[:, 'target']

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

#Initialize random forest
# One keyword per line so individual hyper-parameters are easy to tweak.
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=20,
    max_features=0.2,
    n_jobs=-1,
)

In [10]:
#Stratified validation strategy
# random_state is intentionally not passed: with the default shuffle=False it
# never influenced the (deterministic) folds, and recent scikit-learn raises
# ValueError when random_state is set without shuffle=True.
cv_1 = StratifiedKFold(n_splits=5).split(X, y)

#Check cross validation scores
cross_val_score(rfc, X, y, cv=cv_1, scoring=gini_sklearn, verbose=1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.4min finished


array([ 0.2462545 ,  0.24627452,  0.25112341,  0.24777486,  0.24711137])

### Cheers again!!!