# Defining Entities and Establishing Relationships

In [1]:
# import packages
import pandas as pd

In [2]:
#url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter17/Datasets/bank-full.csv'

In [3]:
# load the data
bankData = pd.read_csv(url_path, sep=';')
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# The y target variable is not required for creating features, we will remove it
Y = bankData.pop('y')

In [5]:
# Create an ID for the Demographic entity
bankData['custID'] = bankData.index.values
bankData['custID'] = 'cust' + bankData['custID'].astype('str')

In [6]:
# Create an ID for Assets
bankData['AssetID'] = 0
bankData.loc[bankData.housing == 'yes', 'AssetID'] = 1

In [7]:
# create an ID for Loans
bankData['loanID'] = 0
bankData.loc[bankData.loan == 'yes', 'loanID'] = 1

In [8]:
# Create an ID for Financial Behavior
bankData['FinbehID'] = 0
bankData.loc[bankData.default == 'yes', 'FinbehID'] = 1

In [9]:
# display the data frame
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,custID,AssetID,loanID,FinbehID
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,cust0,1,0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,cust1,1,0,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,cust2,1,1,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,cust3,1,0,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,cust4,0,0,0


In [10]:
# import packages
import featuretools as ft
import numpy as np

In [11]:
# Initialize Entityset
Bankentities = ft.EntitySet(id='Bank')

In [12]:
# Map the data frame to the entity set to create the parent entity
Bankentities.entity_from_dataframe(entity_id='Demographic Data',
                                  dataframe=bankData,
                                  index='custID')

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 20]
  Relationships:
    No relationships

In [13]:
# Define the Assets entity and set the relationship
Bankentities.normalize_entity(base_entity_id='Demographic Data',
                              new_entity_id='Assets',
                              index = 'AssetID',
                              additional_variables = ['housing'])

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 19]
    Assets [Rows: 2, Columns: 2]
  Relationships:
    Demographic Data.AssetID -> Assets.AssetID

In [14]:
# map the loan and financial behavior entities
Bankentities.normalize_entity(base_entity_id='Demographic Data',
                              new_entity_id='Liability',
                              index = 'loanID',
                              additional_variables = ['loan'])

Bankentities.normalize_entity(base_entity_id='Demographic Data',
                              new_entity_id='FinBehaviour',
                              index = 'FinbehID',
                              additional_variables = ['default'])

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 17]
    Assets [Rows: 2, Columns: 2]
    Liability [Rows: 2, Columns: 2]
    FinBehaviour [Rows: 2, Columns: 2]
  Relationships:
    Demographic Data.AssetID -> Assets.AssetID
    Demographic Data.loanID -> Liability.loanID
    Demographic Data.FinbehID -> FinBehaviour.FinbehID

# Creating New Features Using Deep Feature Synthesis

In [15]:
# create automated features using DFS
feature_set, feature_names = ft.dfs(entityset=Bankentities,
                                   target_entity='Demographic Data',
                                   max_depth=2,
                                   verbose=1,
                                   n_jobs=1)

Built 196 features
Elapsed: 00:04 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [16]:
# Reset the index of the features
feature_set = feature_set.reindex(index=bankData['custID'])
feature_set = feature_set.reset_index()

In [17]:
feature_set.head()

Unnamed: 0,custID,age,job,marital,education,balance,contact,day,month,duration,...,FinBehaviour.STD(Demographic Data.duration),FinBehaviour.STD(Demographic Data.pdays),FinBehaviour.STD(Demographic Data.previous),FinBehaviour.SUM(Demographic Data.age),FinBehaviour.SUM(Demographic Data.balance),FinBehaviour.SUM(Demographic Data.campaign),FinBehaviour.SUM(Demographic Data.day),FinBehaviour.SUM(Demographic Data.duration),FinBehaviour.SUM(Demographic Data.pdays),FinBehaviour.SUM(Demographic Data.previous)
0,cust0,58,management,married,tertiary,2143,unknown,5,may,261,...,258.239396,100.50463,2.313596,1818546,61701846,122390,701270,11476932,1802679,26018
1,cust1,44,technician,single,secondary,29,unknown,5,may,151,...,258.239396,100.50463,2.313596,1818546,61701846,122390,701270,11476932,1802679,26018
2,cust2,33,entrepreneur,married,secondary,2,unknown,5,may,76,...,258.239396,100.50463,2.313596,1818546,61701846,122390,701270,11476932,1802679,26018
3,cust3,47,blue-collar,married,unknown,1506,unknown,5,may,92,...,258.239396,100.50463,2.313596,1818546,61701846,122390,701270,11476932,1802679,26018
4,cust4,33,unknown,single,unknown,1,unknown,5,may,198,...,258.239396,100.50463,2.313596,1818546,61701846,122390,701270,11476932,1802679,26018


In [18]:
# define the set of aggregation and transformation primitives
aggPrimitives=['std', 'min', 'max', 'mean', 'last', 'count']
tranPrimitives=['percentile', 'subtract_numeric', 'divide_numeric']

In [19]:
# Create a new set of features with the custom primitive list
feature_set, feature_names = ft.dfs(entityset=Bankentities,
                                    target_entity='Demographic Data',
                                    agg_primitives=aggPrimitives,
                                    trans_primitives=tranPrimitives,
                                    max_depth = 2,
                                    verbose =  1,
                                    n_jobs = 1)

Built 224 features
Elapsed: 00:04 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [20]:
feature_set.head()

Unnamed: 0_level_0,age,job,marital,education,balance,contact,day,month,duration,campaign,...,FinBehaviour.MIN(Demographic Data.duration),FinBehaviour.MIN(Demographic Data.pdays),FinBehaviour.MIN(Demographic Data.previous),FinBehaviour.STD(Demographic Data.age),FinBehaviour.STD(Demographic Data.balance),FinBehaviour.STD(Demographic Data.campaign),FinBehaviour.STD(Demographic Data.day),FinBehaviour.STD(Demographic Data.duration),FinBehaviour.STD(Demographic Data.pdays),FinBehaviour.STD(Demographic Data.previous)
custID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cust0,58,management,married,tertiary,2143,unknown,5,may,261,1,...,0,-1,0,10.638882,3063.275425,3.087038,8.317696,258.239396,100.50463,2.313596
cust1,44,technician,single,secondary,29,unknown,5,may,151,1,...,0,-1,0,10.638882,3063.275425,3.087038,8.317696,258.239396,100.50463,2.313596
cust2,33,entrepreneur,married,secondary,2,unknown,5,may,76,1,...,0,-1,0,10.638882,3063.275425,3.087038,8.317696,258.239396,100.50463,2.313596
cust3,47,blue-collar,married,unknown,1506,unknown,5,may,92,1,...,0,-1,0,10.638882,3063.275425,3.087038,8.317696,258.239396,100.50463,2.313596
cust4,33,unknown,single,unknown,1,unknown,5,may,198,1,...,0,-1,0,10.638882,3063.275425,3.087038,8.317696,258.239396,100.50463,2.313596


# Classification Model After Automated Feature Generation

In [21]:
# split the dataset into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(bankData, Y, test_size=0.3, random_state=123)

In [22]:
# use pipelines to transform the variables
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
num_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [23]:
# define the categorical and numerical data types
num_features = bankData.select_dtypes(include=['int64', 'float64']).columns
cat_features = bankData.select_dtypes(include=['object']).columns

In [24]:
# create the preprocessor pipeline using the ColumnTransformer() function
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_features),
                                              ('cat', cat_transformer, cat_features)])

In [25]:
# create the estimator that contains the preprocessor and logistic regression classifier
from sklearn.linear_model import LogisticRegression

estimator = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(random_state=123))])

In [26]:
# fit the estimator
estimator.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'AssetID', 'loanID', 'FinbehID'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome', 'custID'],
      dtype='object'))])),
                ('classifier', LogisticRegression(random_state=123))])

In [27]:
# calculate accuracy score
print(f'Accuracy score of Logistic Regression Model: {estimator.score(X_test, y_test)}')

Accuracy score of Logistic Regression Model: 0.8998083161309348


In [28]:
# make predictions
pred = estimator.predict(X_test)

In [29]:
# classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          no       0.92      0.98      0.95     11998
         yes       0.63      0.32      0.42      1566

    accuracy                           0.90     13564
   macro avg       0.77      0.65      0.68     13564
weighted avg       0.88      0.90      0.88     13564



Once the benchmark model has been created without feature engineering, we will proceed and create some new features using Featuretools and then fit another model on the new set of features. Now, we'll proceed by defining the entities and their relationships.

In [30]:
# Create the customer ID
bankData['custID'] = bankData.index.values
bankData['custID'] = 'cust' + bankData['custID'].astype('str')

In [31]:
# create the ID for Assets
bankData['AssetID'] = 0
bankData.loc[bankData.housing == 'yes', 'AssetID'] = 1

In [32]:
# create LoanId
bankData['loanID'] = 0
bankData.loc[bankData.loan == 'yes', 'loanID'] = 1

In [33]:
# create the ID for Financial Behavior
bankData['FinbehID'] = 0
bankData.loc[bankData.default == 'yes', 'FinbehID'] = 1

In [34]:
# import packages
import featuretools as ft
import numpy as np

In [35]:
#  initialize the entity set
Bankentities = ft.EntitySet(id = 'Bank')

In [36]:
# Map the parent entity to the data frame
Bankentities.entity_from_dataframe(entity_id = 'Demographic Data',
                                   dataframe = bankData,
                                   index = 'custID')

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 20]
  Relationships:
    No relationships

In [37]:
# Map all the entities and set their relationships
Bankentities.normalize_entity(base_entity_id='Demographic Data',
                              new_entity_id='Assets',
                              index = 'AssetID',
                              additional_variables = ['housing'])

Bankentities.normalize_entity(base_entity_id='Demographic Data',
                              new_entity_id='Liability',
                              index = 'loanID',
                              additional_variables = ['loan'])

Bankentities.normalize_entity(base_entity_id='Demographic Data',
                              new_entity_id='FinBehaviour',
                              index = 'FinbehID',
                              additional_variables = ['default'])

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 17]
    Assets [Rows: 2, Columns: 2]
    Liability [Rows: 2, Columns: 2]
    FinBehaviour [Rows: 2, Columns: 2]
  Relationships:
    Demographic Data.AssetID -> Assets.AssetID
    Demographic Data.loanID -> Liability.loanID
    Demographic Data.FinbehID -> FinBehaviour.FinbehID

In [38]:
# create aggregation and transformation primitives
aggPrimitives=['std', 'min', 'max', 'mean', 'last', 'count']
tranPrimitives=['percentile', 'subtract_numeric', 'divide_numeric']

In [39]:
# define the DFS with the created primitives
feature_set, feature_names = ft.dfs(entityset=Bankentities,
                                    target_entity='Demographic Data',
                                    agg_primitives=aggPrimitives,
                                    trans_primitives=tranPrimitives,
                                    max_depth = 2,
                                    verbose = 1,
                                    n_jobs = 1)

Built 224 features
Elapsed: 00:04 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [40]:
# reindex feature_set so that the index is similar to the original dataset
feature_set = feature_set.reindex(index=bankData['custID'])
feature_set = feature_set.reset_index()

In [41]:
# drop all the IDs
X = feature_set[feature_set.columns[~feature_set.columns.str.contains('custID|AssetID|loanID|FinbehID')]]

In [42]:
# Replace all infinity values with nan values
X = X.replace([np.inf, -np.inf], np.nan)

In [43]:
# Drop all the columns containing nan
X = X.dropna(axis=1, how='any')

In [44]:
# split the new dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

In [45]:
# create the processing pipeline
categorical_transformer  = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                               ('cat', categorical_transformer, categorical_features)])

In [46]:
# create the estimator function
estimator = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', LogisticRegression(random_state=123))])
estimator.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'age / campaign', 'age / day', 'age / pdays',
       ...
       'FinBehaviour.MIN(Demographic Data.duration)',
       'FinBehaviour.MIN(Demographic Data.pdays)',
       'FinBehaviour.MIN(Demographic Data.pr...
       'Liability.LAST(Demographic Data.poutcome)',
       'FinBehaviour.LAST(Demographic Data.contact)',
       'FinBehaviour.LAST(Demographic Data.education)',
       'FinBehaviour.LAST(Demographic Data.job)',
       'FinBehaviour.LAST(Demographic Data.marital)',
       'FinBehaviour.LAST(Demographic Data.month)',
       'FinBehaviour.LAST(Demographic Data.poutcome)'],
      dtype='object'))])),


In [47]:
print(f"model score: {estimator.score(X_test, y_test)}")

model score: 0.9027572987319374


In [48]:
# make predictions
pred = estimator.predict(X_test)

In [49]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          no       0.92      0.97      0.95     11998
         yes       0.65      0.35      0.45      1566

    accuracy                           0.90     13564
   macro avg       0.78      0.66      0.70     13564
weighted avg       0.89      0.90      0.89     13564



From the preceding output, we can see that the accuracy scores have remained the same. However, there is improvement in the precision, recall, and f1-score of the minority class (yes). All of these values have increased from 34%, 64%, and 43%, respectively. We should remember that this is an extremely unbalanced dataset. However, with the new features we generated, we were able to show marginal improvement in the performance of the minority class.