# Building a Classification Model with Features that have been Generated Using Featuretools

<b> Read the data </b>

In [1]:
import pandas as pd

In [2]:
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter17/Datasets/adult.csv'

In [3]:
df = pd.read_csv(url_path, na_values='?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours,native,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,0


In [4]:
df.shape

(32561, 14)

<b> Drop all na values </b>

In [5]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
sex               0
capital-gain      0
capital-loss      0
hours             0
native            0
label             0
dtype: int64

There is no na value

<b> Create the Y variable </b>

In [6]:
Y = df.pop('label')

<b> Split the dataset into train and test sets </b>

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df, Y, test_size=0.3, random_state=123)

<b> Create the processor pipeline to convert categorical variables into one-hot encoding and numerical variables into scaled variables </b>

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [10]:
cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
num_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [11]:
num_features = df.select_dtypes(include=['int64', 'float64']).columns
cat_features = df.select_dtypes(include=['object']).columns

In [12]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_features),
                                              ('cat', cat_transformer, cat_features)])

<b> Define the estimator function using the data processor and a logistic regression classifier </b>

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
estimator = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(random_state=123))])

<b> Fit the estimator on the train set and then print the scores on the test set </b>

In [15]:
estimator.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'sex', 'native'],
      dtype='object'))])),
                ('classifier', LogisticRegression(random_state=123))])

In [16]:
print(f'Model score: {estimator.score(X_test, y_test)}')

Model score: 0.8530044016787798


<b> Generate predictions on the test set </b>

In [17]:
pred = estimator.predict(X_test)

<b> Print the classification report </b>

In [18]:
from sklearn.metrics import classification_report

In [19]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      7408
           1       0.75      0.59      0.66      2361

    accuracy                           0.85      9769
   macro avg       0.81      0.76      0.78      9769
weighted avg       0.85      0.85      0.85      9769



From the preceding output, we can see that the benchmark model has an accuracy of 85%. We would also be interested in the recall values of the different classes. Class 0 has a recall value of 88%, which means that out of 7189 adults who did not earn an income of more than 50,000 per year, 88% were correctly identified. Class 1 has a recall value of 75%, which indicates that 75% of adults who earned more than 50,000 per year were correctly identified.

<b> Create a parent entity ID called parentID </b>

In [20]:
df['parentID'] = df.index.values
df['parentID'] = 'record' + df['parentID'].astype('str')

<b> For the workclass variable, the unique values are as follows: ' Federal-gov', ' Local-gov', ' Private',' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay' </b>

In [21]:
df.loc[df.workclass == ' Federal-gov','workId'] = 1
df.loc[df.workclass == ' Local-gov','workId'] = 2
df.loc[df.workclass == ' Private','workId'] = 3
df.loc[df.workclass == ' Self-emp-inc','workId'] = 4
df.loc[df.workclass == ' Self-emp-not-inc','workId'] = 5
df.loc[df.workclass == ' State-gov','workId'] = 6
df.loc[df.workclass == ' Without-pay','workId'] = 7

<b> For the Occupation variable, the unique values are as follows:' Adm-clerical', ' Armed-Forces',' Craft-repair', ' Exec-managerial', ' Farming-fishing',' Handlers-cleaners', ' Machine-op-inspct', ' Other-service', ' Priv-house-serv', ' Prof-specialty',' Protective-serv',' Sales', ' Tech-support', ' Transport-moving' </b>

In [22]:
df.loc[df.occupation == ' Adm-clerical','occuId']= 1
df.loc[df.occupation == ' Armed-Forces','occuId']= 2
df.loc[df.occupation == ' Craft-repair','occuId']= 3
df.loc[df.occupation == ' Exec-managerial','occuId']= 4
df.loc[df.occupation == ' Farming-fishing','occuId']= 5
df.loc[df.occupation == ' Handlers-cleaners','occuId']= 6
df.loc[df.occupation == ' Machine-op-inspct','occuId']= 7
df.loc[df.occupation == ' Other-service','occuId']= 8
df.loc[df.occupation == ' Priv-house-serv','occuId']= 9
df.loc[df.occupation == ' Prof-specialty','occuId']= 10
df.loc[df.occupation == ' Protective-serv','occuId']= 11
df.loc[df.occupation == ' Sales','occuId']= 12
df.loc[df.occupation == ' Tech-support','occuId']= 13
df.loc[df.occupation == ' Transport-moving','occuId']= 14

<b> Create the parent entity and set the relationship with education, workclass, and occupation using their respective IDs </b>

In [23]:
import featuretools as ft
import numpy as np

In [24]:
adultentities = ft.EntitySet(id = 'Adult')

In [25]:
adultentities.entity_from_dataframe(entity_id = 'Parent Data',
                                    dataframe = df,
                                    index = 'parentID')

Entityset: Adult
  Entities:
    Parent Data [Rows: 32561, Columns: 16]
  Relationships:
    No relationships

In [26]:
adultentities.normalize_entity(base_entity_id='Parent Data',
                               new_entity_id='education',
                               index = 'education-num',
                               additional_variables = ['education'])

adultentities.normalize_entity(base_entity_id='Parent Data',
                               new_entity_id='Workclass',
                               index = 'workId',
                               additional_variables = ['workclass'])

adultentities.normalize_entity(base_entity_id='Parent Data',
                               new_entity_id='Occupation',
                               index = 'occuId',
                               additional_variables = ['occupation'])

Entityset: Adult
  Entities:
    Parent Data [Rows: 32561, Columns: 13]
    education [Rows: 16, Columns: 2]
    Workclass [Rows: 8, Columns: 2]
    Occupation [Rows: 15, Columns: 2]
  Relationships:
    Parent Data.education-num -> education.education-num
    Parent Data.workId -> Workclass.workId
    Parent Data.occuId -> Occupation.occuId

<b> Create the aggregation and transformation primitives </b>

In [27]:
aggPrimitives = ['std', 'min', 'max', 'mean', 'last', 'count']
tranPrimitives=['percentile', 'subtract', 'divide']

<b> Create the DFS with the defined primitives </b>

In [28]:
feature_set, feature_names = ft.dfs(entityset=adultentities,
                                    target_entity = 'Parent Data',
                                    agg_primitives=aggPrimitives,
                                    #trans_primitives=tranPrimitives,
                                    max_depth = 2,
                                    verbose = 1,
                                    n_jobs = 1)

Built 114 features
Elapsed: 00:01 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


<b> Reindex the created data frame </b>

In [29]:
feature_set = feature_set.reindex(index=df['parentID'])
feature_set = feature_set.reset_index()

<b> Drop all the variables related to the IDs </b>

In [30]:
X = feature_set[feature_set.columns[~feature_set.columns.str.contains('parentID|education-num|workId|occuId')]]

<b> Replace all the infinity values with na and the drop columns </b>

In [31]:
X = X.replace([np.inf, -np.inf], np.nan)

In [32]:
X = X.dropna(axis=1, how='any')

<b> Split the dataset into train and test sets </b>

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

<b> Create the processing pipeline </b>

In [34]:
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [35]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [36]:
from sklearn.compose import ColumnTransformer

In [37]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                               ('cat', categorical_transformer, categorical_features)])

<b> Create the estimator function and fit the training set on the estimator. Then, generate the scores </b>

In [38]:
estimator = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', LogisticRegression(random_state=123))])

In [39]:
estimator.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours',
       'education.COUNT(Parent Data)', 'education.LAST(Parent Data.age)',
       'education.LAST(Parent Data.capital-gain)',
       'education.LAST(Parent Data.capital-loss)',
       'education.LAST(Parent Data...
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['marital-status', 'relationship', 'sex', 'native',
       'education.education', 'education.LAST(Parent Data.marital-status)',
       'education.LAST(Parent Data.native)',


In [40]:
print(f'Model score: {estimator.score(X_test, y_test)}')

Model score: 0.8469648889343843


<b> Generate predictions on the test set and print the classification report </b>

In [41]:
pred = estimator.predict(X_test)

In [42]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90      7408
           1       0.74      0.56      0.64      2361

    accuracy                           0.85      9769
   macro avg       0.81      0.75      0.77      9769
weighted avg       0.84      0.85      0.84      9769



From the preceding output, we can see that the accuracy scores have improved from 85% to 86%. There is also an improvement in the precision, recall, and f1-score of the minority class (yes). All of these values have increased from 62%, 75%, and 68% to 64%, 76%, and 69%, respectively. From a business perspective, the result indicates that out of the total 9,049 adults, 86% of them have been correctly identified as earning more than 50,000 per year or not.