In [1]:
import pandas as pd
import numpy as np
import featuretools as ft
import gc

# Column dtypes handed to pd.read_csv when the transactions file is loaded.
# String-like identifier columns stay plain objects, the three category_*
# columns are read as pandas categoricals, and everything else is a 64-bit
# numeric.
TRANSACTION_LOAD_DTYPES = dict(
    authorized_flag='object',  # compared against 'Y' and recoded after loading
    card_id='object',
    city_id='int64',
    category_1='category',
    installments='int64',
    category_3='category',
    merchant_category_id='int64',
    merchant_id='object',
    month_lag='int64',
    purchase_amount='float64',
    category_2='category',
    state_id='int64',
    subsector_id='int64',
)



# featuretools variable types for the customer (card) entity: each of the
# three feature_* columns is declared categorical.
CARD_TYPES = dict.fromkeys(
    ('feature_1', 'feature_2', 'feature_3'),
    ft.variable_types.Categorical,
)

# featuretools variable types for the transactions entity.
#
# Columns that are not listed here are left to featuretools' own type
# inference. Integer-coded identifiers therefore have to be declared
# Categorical explicitly; otherwise their int64 load dtype would presumably
# get them inferred as Numeric and DFS would aggregate the codes
# arithmetically (mean/sum of ids), which is meaningless.
TRANSACTION_TYPES = {
    'authorized_flag': ft.variable_types.Numeric,  # recoded to 0/1 after loading
    'category_1': ft.variable_types.Categorical,
    'category_2': ft.variable_types.Categorical,
    'category_3': ft.variable_types.Categorical,
    # city_id is loaded as int64 just like state_id / subsector_id /
    # merchant_category_id, so it is declared Categorical alongside them.
    'city_id': ft.variable_types.Categorical,
    'installments': ft.variable_types.Numeric,
    'merchant_category_id': ft.variable_types.Categorical,
    'month_lag': ft.variable_types.Numeric,
    'purchase_amount': ft.variable_types.Numeric,
    'state_id': ft.variable_types.Categorical,
    'subsector_id': ft.variable_types.Categorical
}

In [3]:
# Build a featuretools EntitySet (customer <- transactions) and run deep
# feature synthesis on the customer entity.
# Depends on the names set up in the imports/config cell (pd, np, ft, gc and
# the *_TYPES / TRANSACTION_LOAD_DTYPES mappings), so that cell must have been
# executed in the current kernel first.

# Customer table. "-01" is appended before parsing so every value becomes a
# full date -- presumably the raw values are 'YYYY-MM' strings; confirm
# against the CSV.
customer_df = pd.read_csv("../data/raw/train.csv").assign(
    first_active_month=lambda frame: pd.to_datetime(frame['first_active_month'] + "-01")
)

print("Reading in transactions")
transactions_df = pd.read_csv(
    "../data/raw/historical_transactions.csv",
    dtype=TRANSACTION_LOAD_DTYPES,
)
# Recode the flag column to integers: 1 where the value is 'Y', 0 otherwise.
transactions_df['authorized_flag'] = np.where(
    transactions_df['authorized_flag'].eq('Y'), 1, 0
)
# Turn the default row index into an 'index' column; it is used below as the
# transactions entity's index. Done in place so the frame is not duplicated.
transactions_df.reset_index(inplace=True)

print("Creating training entity set")
# NOTE(review): the transactions entity is registered without a time_index --
# confirm that is intended.
es = (
    ft.EntitySet()
    .entity_from_dataframe(
        entity_id='customer',
        dataframe=customer_df,
        index='card_id',
        time_index='first_active_month',
        variable_types=CARD_TYPES,
    )
    .entity_from_dataframe(
        entity_id='transactions',
        dataframe=transactions_df,
        index='index',
        variable_types=TRANSACTION_TYPES,
    )
)

# Drop the notebook-level references to the raw frames and run a collection
# pass now that the entity set has been built from them.
del customer_df
del transactions_df
gc.collect()

print("Defining relationships")
relationship = ft.Relationship(
    es['customer']['card_id'],      # parent variable: the customer index
    es['transactions']['card_id'],  # child variable: card of each transaction
)
es = es.add_relationship(relationship)

feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='customer',
)

Reading in transactions
Creating training entity set


NameError: name 'gc' is not defined