In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import featuretools as ft

In [2]:
#!pip install --upgrade numpy pandas matplotlib seaborn woodwork featuretools scikit-learn pyarrow

In [3]:
#!pip install --upgrade dask distributed nodejs dask-labextension

In [4]:
# Read every raw CSV from the working directory, in a fixed order that
# matches the names on the left-hand side of the assignment.
(
    app_train,
    app_test,
    bureau,
    bureau_balance,
    cash,
    credit,
    previous,
    installments,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './application_train.csv',
        './application_test.csv',
        './bureau.csv',
        './bureau_balance.csv',
        './POS_CASH_balance.csv',
        './credit_card_balance.csv',
        './previous_application.csv',
        './installments_payments.csv',
    )
)

In [5]:
# Give the test frame an all-NaN TARGET column so it has the same columns
# as the training frame before the two are stacked.
app_test['TARGET'] = np.nan

# Stack train on top of test with a fresh 0..n-1 index; sort=True orders
# the columns alphabetically.
frames_to_stack = [app_train, app_test]
app = pd.concat(frames_to_stack, ignore_index=True, sort=True)

In [6]:
import sys

def return_size(df):
    """Return the size of ``df`` as reported by ``sys.getsizeof``, in gigabytes.

    The byte count is divided by 1e9 and rounded to two decimal places.
    """
    bytes_per_gigabyte = 1e9
    size_in_bytes = sys.getsizeof(df)
    return round(size_in_bytes / bytes_per_gigabyte, 2)

def convert_types(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Each column is handled by the first rule that matches:

    1. name contains ``'SK_ID'``: missing values are filled with 0 and the
       column is cast to int32
    2. object dtype with fewer distinct values than rows: cast to ``category``
    3. distinct values are exactly ``{0, 1}``: cast to ``bool``
    4. float64: cast to float32
    5. int64: cast to int32

    Columns matching none of the rules are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    n_rows = df.shape[0]

    # Iterate through each column
    for c in df:
        column = df[c]

        # Identifier columns: fill missing ids with 0 so the cast to int32 works
        if 'SK_ID' in c:
            df[c] = column.fillna(0).astype(np.int32)

        # Objects with repeated values are stored as categories
        elif (column.dtype == 'object') and (column.nunique() < n_rows):
            df[c] = column.astype('category')

        # 0/1 flags become booleans; a column that also holds NaN does not
        # match because its set of unique values is larger than {0, 1}
        elif set(column.unique()) == {0, 1}:
            df[c] = column.astype(bool)

        # Float64 to float32
        elif column.dtype == np.float64:
            df[c] = column.astype(np.float32)

        # Int64 to int32. The width is spelled out because comparing against
        # the builtin ``int`` resolves to NumPy's platform default integer,
        # which is not 64-bit everywhere.
        elif column.dtype == np.int64:
            df[c] = column.astype(np.int32)

    return df

In [7]:
# Run the memory-reducing type conversion over every table and rebind each
# name to the returned frame.
app, bureau, bureau_balance, cash, credit, previous, installments = [
    convert_types(frame)
    for frame in (app, bureau, bureau_balance, cash, credit, previous, installments)
]

In [8]:
def replace_day_outliers(df):
    """Swap the value 365243 for NaN in every column whose name contains "DAYS".

    The matching columns are reassigned on ``df`` itself, and that same
    frame is returned.
    """
    day_columns = [name for name in df.columns if "DAYS" in name]

    for name in day_columns:
        cleaned = df[name].replace({365243: np.nan})
        df[name] = cleaned

    return df

# Clean the DAYS columns of every table and rebind each name to the result.
app, bureau, bureau_balance, credit, cash, previous, installments = [
    replace_day_outliers(frame)
    for frame in (app, bureau, bureau_balance, credit, cash, previous, installments)
]

In [9]:
# Remove the client id from the three tables that are later linked to
# `previous` through SK_ID_PREV. errors="ignore" makes the cell safe to
# re-run: once the column is gone a second drop is a no-op instead of a
# KeyError.
cash = cash.drop(columns=["SK_ID_CURR"], errors="ignore")
credit = credit.drop(columns=["SK_ID_CURR"], errors="ignore")
installments = installments.drop(columns=["SK_ID_CURR"], errors="ignore")

In [10]:
# Start from an empty EntitySet named 'clients'; dataframes and
# relationships are attached to it afterwards.
es = ft.EntitySet(id='clients')

In [11]:
# Dataframes that already carry a unique identifier column:
# (name in the EntitySet, frame, index column)
for frame_name, frame, index_column in (
    ('app', app, 'SK_ID_CURR'),
    ('bureau', bureau, 'SK_ID_BUREAU'),
    ('previous', previous, 'SK_ID_PREV'),
):
    es = es.add_dataframe(
        dataframe_name=frame_name,
        dataframe=frame,
        index=index_column,
    )

# Dataframes without a unique identifier: make_index=True asks featuretools
# to create the named index column.
for frame_name, frame, index_column in (
    ('bureau_balance', bureau_balance, 'bb_index'),
    ('cash', cash, 'cash_index'),
    ('installments', installments, 'installments_index'),
    ('credit', credit, 'credit_index'),
):
    es = es.add_dataframe(
        dataframe_name=frame_name,
        dataframe=frame,
        make_index=True,
        index=index_column,
    )

  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.is_categorical_dtype(series.dtype)
  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.is_categorical_dtype(series.dtype)
  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.is_categorical_dtype(series.dtype)
  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.is_categorical_dtype(series.dtype)
  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.is_categorical_dtype(series.dtype)
  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.is_categorical_dtype(series.dtype)
  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.is_categorical_dtype(series.dtype)
  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.is_categorical_dtype(series.dtype)
  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.is_categorical_dtype(series.dtype)
  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.is_categorical_dtype(series.dtype)
  if pdtypes.is_categorical_dtype(series.dtype):
  pdtypes.i

In [12]:
# One-to-many links as (parent, shared key column, child). In every pair the
# key column has the same name on both sides.
for parent_name, key_column, child_name in (
    ('app', 'SK_ID_CURR', 'bureau'),
    ('bureau', 'SK_ID_BUREAU', 'bureau_balance'),
    ('app', 'SK_ID_CURR', 'previous'),
    ('previous', 'SK_ID_PREV', 'cash'),
    ('previous', 'SK_ID_PREV', 'installments'),
    ('previous', 'SK_ID_PREV', 'credit'),
):
    es = es.add_relationship(
        parent_dataframe_name=parent_name,
        parent_column_name=key_column,
        child_dataframe_name=child_name,
        child_column_name=key_column,
    )

In [17]:
# Transform primitives requested from deep feature synthesis
trans_primitives = ["day", "year", "month", "weekday", "haversine", "num_words", "num_characters"]

# Aggregation primitives requested from deep feature synthesis
agg_primitives = ["sum", "std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]

# features_only=True requests the feature definitions rather than a computed
# feature matrix. TARGET is excluded from 'app' so no feature is built from
# the label.
feature_names = ft.dfs(
    entityset=es,
    target_dataframe_name='app',
    trans_primitives=trans_primitives,
    agg_primitives=agg_primitives,
    max_depth=2,
    ignore_columns={'app': ["TARGET"]},
    features_only=True,
    verbose=True,
)

Built 2080 features


In [18]:
# Persist the feature definitions. Nothing earlier in the notebook creates
# ./input, so make sure the directory exists before writing into it.
features_path = Path('./input/features.txt')
features_path.parent.mkdir(parents=True, exist_ok=True)
ft.save_features(feature_names, str(features_path))