In [1]:
import pandas as pd
# lift pandas' truncation limits so frames are displayed in full
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import locally_linear_embedding, TSNE, Isomap, MDS
from sklearn.decomposition import PCA
import seaborn as sns
import os
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import time
import lightgbm as lgb
from sklearn.metrics import log_loss, roc_auc_score, confusion_matrix, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
import gc
from sklearn.linear_model import LogisticRegression

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
# single seed reused for the RNG setup, the train/test split, the CV folds and the models
SEED = 42
# value passed as test_size to train_test_split (share of rows held out for testing)
TEST_SIZE = 0.25

In [4]:
############ RANDOMNESS

# seed function
def seed_everything(seed = 42):
    """Seed the random number sources used in this notebook.

    Writes ``seed`` (as a string) to the ``PYTHONHASHSEED`` environment
    variable, then seeds the standard-library ``random`` module and NumPy's
    global generator with the same value.

    Parameters
    ----------
    seed : int
        Seed value applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # same order as before: stdlib generator first, then NumPy
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# apply the notebook-wide seed once, up front
seed_everything(seed=SEED)

In [5]:
# Load the table; the first CSV column holds the stored row index.
# The index is reset to 0..n-1 *before* the target is extracted so that `data`
# and `y` carry the same row labels. Extracting `y` first and re-indexing
# `data` afterwards leaves `y` on the old CSV index, which silently breaks any
# label-based alignment between the two (boolean masks, pd.concat, joins).
data = pd.read_csv('./p2p.csv', index_col=0).reset_index(drop=True)
y = data['status']

In [6]:
# number of rows per value of the target
y.value_counts()

0    13413
1     1632
Name: status, dtype: int64

In [232]:
def reduction(data, reduction_type='LLE', add_features=True, n_neighbors=15, n_components=2, random_state=0, n_jobs=20):
    """Embed ``data`` into ``n_components`` dimensions with the chosen technique.

    The embedding is fitted on, and computed for, exactly the rows passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix, one row per sample.
    reduction_type : str
        'LLE', 'TSNE', 'PCA' or 'Isomap'; matched case-insensitively.
    add_features : bool
        If true, return ``data`` with the component columns appended;
        otherwise return only the component columns.
    n_neighbors : int
        Neighbourhood size, forwarded to LLE and Isomap only.
    n_components : int
        Number of output dimensions.
    random_state : int
        Seed forwarded to LLE, t-SNE and PCA (Isomap is not given one).
    n_jobs : int
        Parallelism forwarded to LLE, t-SNE and Isomap (PCA is not given one).

    Returns
    -------
    pandas.DataFrame
        Columns ``component1`` .. ``component<n_components>``, preceded by the
        original columns when ``add_features`` is true. Without
        ``add_features`` the result carries a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``reduction_type`` is not one of the supported names.
    """
    method = str(reduction_type).upper()

    if method == 'LLE':
        # the second return value is the reconstruction error, unused here
        X_r, _ = locally_linear_embedding(data, n_neighbors=n_neighbors,
                                          n_components=n_components,
                                          random_state=random_state,
                                          n_jobs=n_jobs)
    elif method == 'TSNE':
        X_r = TSNE(n_components=n_components, random_state=random_state,
                   n_jobs=n_jobs).fit_transform(data)
    elif method == 'PCA':
        # fit() followed by transform() is kept (rather than fit_transform)
        # so the numerical result is unchanged
        pca = PCA(n_components=n_components, random_state=random_state)
        X_r = pca.fit(data).transform(data)
    elif method == 'ISOMAP':
        X_r = Isomap(n_components=n_components, n_neighbors=n_neighbors,
                     n_jobs=n_jobs).fit_transform(data)
    else:
        raise ValueError("unrecognized dimensionality reduction method")

    columns = [f'component{i+1}' for i in range(n_components)]
    X_r = pd.DataFrame(X_r, columns=columns)

    if add_features:
        # pd.concat(axis=1) aligns on index labels; give the components the
        # input's index so rows stay paired even when `data` is not 0..n-1
        X_r.index = data.index
        return pd.concat([data, X_r], axis=1)
    return X_r

In [233]:
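# Uncomment exactly one of the lines below to apply that dimensionality-reduction technique.
# NOTE: `data` still contains the target column 'status' at this point, so the
# embedding would be computed from the label as well; drop it before calling
# reduction() to avoid target leakage.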

#data = reduction(data, reduction_type='LLE', add_features=True)
#data = reduction(data, reduction_type='TSNE', add_features=True)
#data = reduction(data, reduction_type='PCA', add_features=True)
#data = reduction(data, reduction_type='Isomap', add_features=True)

In [234]:
# hold out a TEST_SIZE share of the rows; SEED fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=TEST_SIZE,
    random_state=SEED,
)

In [235]:
# columns that must not be used as model inputs
excluded_feats = ['status', 'nace']
# every remaining column, in its original order
features = X_train.columns[~X_train.columns.isin(excluded_feats)].tolist()

In [236]:

# cores (not referenced by any cell visible in this section)
CORES = 20

# cross-validation settings
NUM_FOLD = 5
SHUFFLE = True

# data partitioning: stratified, shuffled folds with a fixed seed
folds = StratifiedKFold(n_splits=NUM_FOLD, shuffle=SHUFFLE, random_state=SEED)

############ PLACEHOLDERS

# containers for fitted models and feature importances
clfs = []
importances = pd.DataFrame()

# out-of-fold predictions: one zero-initialised slot per training row
preds_oof = pd.Series(np.zeros(len(X_train)))

In [237]:
############ CROSS-VALIDATION LOOP
cv_start = time.time()
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):

    # positional split of the training data into fit / validation parts
    trn_x = X_train[features].iloc[trn_idx]
    trn_y = y_train.iloc[trn_idx]
    val_x = X_train[features].iloc[val_idx]
    val_y = y_train.iloc[val_idx]
    test_x = X_test[features]

    ## add noise to train to reduce overfitting
    #trn_x += np.random.normal(0, 0.1, trn_x.shape)

    # report the size of both parts
    print('Data shape:', trn_x.shape, val_x.shape)
    #print('Data shape:', trn_y.shape, val_y.shape)

    # fit a logistic regression on this fold's training part
    clf = LogisticRegression(random_state=SEED)
    clf.fit(trn_x, trn_y)

    # store the positive-class probabilities of the held-out rows
    val_pred = clf.predict_proba(val_x)[:, 1]
    preds_oof.iloc[val_idx] = val_pred

    # fold-level performance
    fold_auc = roc_auc_score(val_y, val_pred)
    print('--------------------------------')
    print('FOLD%2d: AUC = %.6f' % (n_fold + 1, fold_auc))
    print('--------------------------------')
    print('')

    # release the per-fold frames
    del trn_x, trn_y, val_x, val_y
    gc.collect()


# overall out-of-fold performance and elapsed time
cv_perf = roc_auc_score(y_train, preds_oof)
print('--------------------------------')
print('- OOF AUC = %.6f' % cv_perf)
print('- CV TIME = {:.2f} min'.format((time.time() - cv_start) / 60))
print('--------------------------------')

Data shape: (9026, 25) (2257, 25)
--------------------------------
FOLD 1: AUC = 0.816648
--------------------------------

Data shape: (9026, 25) (2257, 25)
--------------------------------
FOLD 2: AUC = 0.830000
--------------------------------

Data shape: (9026, 25) (2257, 25)
--------------------------------
FOLD 3: AUC = 0.801403
--------------------------------

Data shape: (9027, 25) (2256, 25)
--------------------------------
FOLD 4: AUC = 0.825533
--------------------------------

Data shape: (9027, 25) (2256, 25)
--------------------------------
FOLD 5: AUC = 0.788516
--------------------------------

--------------------------------
- OOF AUC = 0.812440
- CV TIME = 0.01 min
--------------------------------


In [238]:
# final model: same estimator settings, fitted on the whole training split
final_model = LogisticRegression(random_state=SEED)
clf = final_model.fit(X_train[features], y_train)

In [239]:
# keep only column 1 of predict_proba: the probability of the positive class
test_proba = clf.predict_proba(X_test[features])
preds_test = test_proba[:, 1]

In [240]:
# AUC on the held-out test split
roc_auc_score(y_true=y_test, y_score=preds_test)

0.7996475090664394