In [2]:
import pandas as pd
import numpy as np

import featuretools as ft
import os
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=FutureWarning, module='woodwork.*')
warnings.filterwarnings('ignore', category=UserWarning, module='woodwork.*')


In [2]:
#!pip install --upgrade numpy pandas matplotlib seaborn woodwork featuretools scikit-learn pyarrow

In [3]:
#!pip install --upgrade dask distributed nodejs dask-labextension

In [4]:
def entityset_from_partition(path):
    
    """Create an EntitySet from a partition of data specified as a path.
      Returns a dictionary with the entityset and the number used for saving the feature matrix."""
    warnings.filterwarnings('ignore')
    warnings.filterwarnings('ignore', category=FutureWarning, module='woodwork.*')
    warnings.filterwarnings('ignore', category=UserWarning, module='woodwork.*')
    partition_num = int(path[20:])
    
    # Read in data
    app = pd.read_csv('%s/app.csv' % path)
    bureau = pd.read_csv('%s/bureau.csv' % path)
    bureau_balance = pd.read_csv('%s/bureau_balance.csv' % path)
    previous = pd.read_csv('%s/previous.csv' % path)
    credit = pd.read_csv('%s/credit.csv' % path)
    installments = pd.read_csv('%s/installments.csv' % path)
    cash = pd.read_csv('%s/cash.csv' % path)
    
    
    # Empty entityset
    es = ft.EntitySet(id = 'clients')
    
    es = es.add_dataframe(dataframe_name = 'app', dataframe = app, 
                                 index = 'SK_ID_CURR')
    es = es.add_dataframe(dataframe_name = 'bureau', dataframe = bureau, 
                                 index = 'SK_ID_BUREAU')
    
    es = es.add_dataframe(dataframe_name = 'previous', dataframe = previous, 
                                 index = 'SK_ID_PREV')
    
    # Entities that do not have a unique index
    es = es.add_dataframe(dataframe_name = 'bureau_balance', dataframe = bureau_balance, 
                                 make_index = True, index = 'bb_index'
                                 )
    
    es = es.add_dataframe(dataframe_name = 'cash', dataframe = cash, 
                                 make_index = True, index = 'cash_index'
                                 )
    
    es = es.add_dataframe(dataframe_name = 'installments', dataframe = installments,
                                 make_index = True, index = 'installments_index'
                                 )
    
    es = es.add_dataframe(dataframe_name = 'credit', dataframe = credit,
                                 make_index = True, index = 'credit_index'
                                 )
    
    es = es.add_relationship(parent_dataframe_name='app', 
                                 parent_column_name='SK_ID_CURR', 
                                 child_dataframe_name='bureau', 
                                 child_column_name='SK_ID_CURR')
    
    es = es.add_relationship(parent_dataframe_name='bureau', 
                                    parent_column_name='SK_ID_BUREAU', 
                                    child_dataframe_name='bureau_balance', 
                                    child_column_name='SK_ID_BUREAU')
    es = es.add_relationship(parent_dataframe_name='app', 
                                    parent_column_name='SK_ID_CURR', 
                                    child_dataframe_name='previous', 
                                    child_column_name='SK_ID_CURR')
    es = es.add_relationship(parent_dataframe_name='previous', 
                                    parent_column_name='SK_ID_PREV', 
                                    child_dataframe_name='cash', 
                                    child_column_name='SK_ID_PREV')
    es = es.add_relationship(parent_dataframe_name='previous', 
                                             parent_column_name='SK_ID_PREV', 
                                             child_dataframe_name='installments', 
                                             child_column_name='SK_ID_PREV')
    es = es.add_relationship(parent_dataframe_name='previous', 
                                       parent_column_name='SK_ID_PREV', 
                                       child_dataframe_name='credit', 
                                       child_column_name='SK_ID_PREV')

    return ({'es': es, 'num': partition_num})

In [5]:
def feature_matrix_from_entityset(es_dict, feature_defs, return_fm = False):
    import os

    """Run deep feature synthesis from an entityset and feature definitions. 
    Saves feature matrix based on partition.""" 
    
    # Extract the entityset
    es = es_dict['es']
    
    #Calculate the feature matrix and save
    feature_matrix = ft.calculate_feature_matrix(feature_defs,
                                                 entityset=es, 
                                                 n_jobs = 1, 
                                                 verbose = 0,
                                                 chunk_size = es['app'].shape[0])
    
    directory = './input/fm'
    if os.path.exists(directory):
        print("directory exists")
    else:
        os.makedirs(directory)

    feature_matrix.to_csv('./input/fm/p%d_fm.csv' % es_dict['num'], index = True)
    
    if return_fm:
        return feature_matrix

In [6]:
feature_defs = ft.load_features('./input/features.txt')
print(len(feature_defs))

2080


In [7]:
import dask.bag as db
from dask.distributed import Client

# Use all 8 cores
client = Client(processes = True)

In [8]:
paths = ['./input/partitions/p%d' %  i for i in range(1, 105)]


In [None]:
# Create a bag object
b = db.from_sequence(paths)

# Map entityset function
b = b.map(entityset_from_partition)

# Map feature matrix function
b = b.map(feature_matrix_from_entityset, feature_defs = feature_defs)
with warnings.catch_warnings():
    warnings.simplefilter('ignore', FutureWarning) 
    b.compute()

In [3]:
base = './input/fm/'
fm_paths = [base + p for p in os.listdir(base) if 'fm.csv' in p]
fms = [pd.read_csv(path) for path in fm_paths]
feature_matrix = pd.concat(fms, axis = 0)

In [4]:
feature_matrix

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,SUM(credit.previous.DAYS_FIRST_DUE),SUM(credit.previous.DAYS_LAST_DUE),SUM(credit.previous.DAYS_LAST_DUE_1ST_VERSION),SUM(credit.previous.DAYS_TERMINATION),SUM(credit.previous.HOUR_APPR_PROCESS_START),SUM(credit.previous.NFLAG_INSURED_ON_APPROVAL),SUM(credit.previous.RATE_DOWN_PAYMENT),SUM(credit.previous.RATE_INTEREST_PRIMARY),SUM(credit.previous.RATE_INTEREST_PRIVILEGED),SUM(credit.previous.SELLERPLACE_AREA)
0,108058,35685.0,876154.5,769500.0,157500.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,108059,28561.5,730017.0,652500.0,450000.0,0.0,0.0,5.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,108060,31707.0,1080000.0,1080000.0,225000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,108061,24723.0,505665.0,355500.0,121500.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,108062,16096.5,495000.0,495000.0,144000.0,0.0,0.0,1.0,0.0,0.0,...,-70658.0,-55762.0,0.0,-5341.0,784.0,0.0,0.0,0.0,0.0,-49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3453,428398,29245.5,539100.0,450000.0,135000.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3454,428399,37800.0,1288350.0,1125000.0,126000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3455,428400,31333.5,808650.0,675000.0,112500.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3456,428401,38808.0,728460.0,675000.0,112500.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1173.0,0.0,0.0,0.0,0.0,162702.0


In [5]:
feature_matrix.to_csv("./input/general_features.csv")