In [3]:
#https://www.kaggle.com/willkoehrsen/automated-feature-engineering-basics
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# featuretools for automated feature engineering
import featuretools as ft

# matplotlib and seaborn for visualizations
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns

# Suppress all warnings (this filter is global, not limited to pandas)
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the train and test tables from CSV files given by relative path.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


In [17]:
# Preview the first rows of the training frame.
# NOTE(review): the recorded output for this cell already contains the 'set'
# column, which is only assigned in a later cell, so this cell appears to have
# been executed out of order — confirm the notebook survives Restart & Run All.
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,set
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,train
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,train
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,train
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,train
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,train


In [9]:
# Add a 'set' column so training rows and test rows can be told apart.
# Create the target column on the test frame and fill it with NaN.
train['set'], test['set'] = 'train', 'test'
test["SalePrice"] = np.nan

In [10]:
# Stack the train and test rows into a single frame with a fresh 0..n-1 index.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the resulting frame is the same.
app = pd.concat([train, test], ignore_index=True)

In [22]:
# Create an empty featuretools EntitySet that will hold the data.
# NOTE(review): the EntitySet id 'Id' is the same string as the index column
# passed when the 'app' entity is registered — presumably unintentional;
# confirm whether a more descriptive id was meant.
es = ft.EntitySet(id = 'Id')

In [40]:
# Register the combined frame as the 'app' entity, using 'Id' as its index.
# NOTE(review): featuretools 1.0+ renamed this call to add_dataframe —
# confirm which featuretools version this notebook is pinned to.
es = es.entity_from_dataframe(
    entity_id='app',
    dataframe=app,
    index='Id',
)

In [26]:
# Allow up to 100 characters per cell when pandas renders text columns.
pd.set_option('display.max_colwidth', 100)

# Fetch the table of featuretools primitives and show the first ten rows
# whose type is 'aggregation'.
primitives = ft.list_primitives()
primitives.loc[primitives['type'].eq('aggregation')].head(10)

Unnamed: 0,name,type,description
0,time_since_last,aggregation,Time since last related instance.
1,sum,aggregation,Counts the number of elements of a numeric or boolean feature.
2,avg_time_between,aggregation,Computes the average time between consecutive events.
3,all,aggregation,Test if all values are 'True'.
4,count,aggregation,Counts the number of non null values.
5,last,aggregation,Returns the last value.
6,num_true,aggregation,Finds the number of 'True' values in a boolean.
7,trend,aggregation,Calculates the slope of the linear trend of variable overtime.
8,any,aggregation,Test if any value is 'True'.
9,max,aggregation,Finds the maximum non-null value of a numeric feature.


In [27]:
# Show the first ten primitives whose type is 'transform'.
primitives.loc[primitives['type'].eq('transform')].head(10)

Unnamed: 0,name,type,description
19,not,transform,"For each value of the base feature, negates the boolean value."
20,numwords,transform,Returns the words in a given string by counting the spaces.
21,and,transform,"For two boolean values, determine if both values are 'True'."
22,year,transform,Transform a Datetime feature into the year.
23,month,transform,Transform a Datetime feature into the month.
24,days,transform,Transform a Timedelta feature into the number of days.
25,cum_min,transform,Calculates the min of previous values of an instance for each value in a time-dependent entity.
26,minutes,transform,Transform a Timedelta feature into the number of minutes.
27,weekend,transform,Transform Datetime feature into the boolean of Weekend.
28,percentile,transform,"For each value of the base feature, determines the percentile in relation"


In [37]:
# Names of the aggregation and transform primitives handed to DFS.
default_agg_primitives = [
    "sum", "std", "max", "skew", "min",
    "mean", "count", "percent_true", "num_unique", "mode",
]
default_trans_primitives = [
    "day", "year", "month", "weekday",
    "haversine", "numwords", "characters",
]

# Run Deep Feature Synthesis on the 'app' entity and compute the feature
# matrix itself (features_only=False), not just the feature definitions.
# NOTE(review): the recorded run reports "Built 81 features", the same count
# the later DFS run with a different primitive set reports — presumably these
# primitives add nothing for a single entity with no relationships; confirm.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='app',
    agg_primitives=default_agg_primitives,
    trans_primitives=default_trans_primitives,
    max_depth=3,
    features_only=False,
    verbose=True,
)

print('%d Total Features' % len(feature_names))

Built 81 features
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|██████████████████████████████████████████| Calculated: 11/11 chunks
81 Total Features


In [38]:
# Preview the first rows of the feature matrix returned by DFS.
feature_matrix.head()

Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,set
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,856,854,0,,3,1Fam,TA,No,706.0,0.0,...,0,Pave,8,856.0,AllPub,0,2003,2003,2008,train
2,1262,0,0,,3,1Fam,TA,Gd,978.0,0.0,...,0,Pave,6,1262.0,AllPub,298,1976,1976,2007,train
3,920,866,0,,3,1Fam,TA,Mn,486.0,0.0,...,0,Pave,6,920.0,AllPub,0,2001,2002,2008,train
4,961,756,0,,3,1Fam,Gd,No,216.0,0.0,...,0,Pave,7,756.0,AllPub,0,1915,1970,2006,train
5,1145,1053,0,,4,1Fam,TA,Av,655.0,0.0,...,0,Pave,9,1145.0,AllPub,192,2000,2000,2008,train


In [36]:
# Second DFS run on the 'app' entity: a shorter aggregation list, depth 2,
# and trans_primitives left unspecified so the library default applies.
feature_matrix_spec, feature_names_spec = ft.dfs(
    entityset=es,
    target_entity='app',
    agg_primitives=['sum', 'count', 'min', 'max', 'mean', 'mode'],
    max_depth=2,
    features_only=False,
    verbose=True,
)

Built 81 features
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|██████████████████████████████████████████| Calculated: 11/11 chunks
