-
Notifications
You must be signed in to change notification settings - Fork 38
/
featuretools.py
147 lines (115 loc) · 4.73 KB
/
featuretools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# -*- coding: utf-8 -*-
import featuretools as ft
from featuretools.selection import remove_low_information_features
class DFS(object):
    """Deep Feature Synthesis helper built on top of ``featuretools``.

    ``dfs`` learns a list of feature definitions and stores it in
    ``self.features``; ``calculate_feature_matrix`` later applies those
    stored definitions to (possibly different) data.

    Args:
        max_depth: forwarded to ``ft.dfs`` as ``max_depth``.
        encode: if truthy, ``ft.encode_features`` is applied while fitting.
        remove_low_information: if truthy,
            ``remove_low_information_features`` is applied while fitting.
        target_entity: default target entity, used when the caller of
            ``dfs`` supplies entities or an entityset but no target.
        index: name of the column of ``X`` holding the instance ids.
        time_index: name of the column of ``X`` holding the cutoff times;
            used together with ``index``.
        agg_primitives: forwarded to ``ft.dfs`` as ``agg_primitives``.
        trans_primitives: forwarded to ``ft.dfs`` as ``trans_primitives``.
        copy: if truthy, ``X`` is copied before its index is reset;
            otherwise ``X`` is modified in place.
    """

    # Feature definitions learned by ``dfs``; ``None`` until it has been called.
    features = None

    def __init__(self, max_depth=None, encode=True, remove_low_information=True,
                 target_entity=None, index=None, time_index=None,
                 agg_primitives=None, trans_primitives=None, copy=True):
        self.copy = copy
        self.max_depth = max_depth
        self.encode = encode
        self.remove_low_information = remove_low_information
        self.target_entity = target_entity
        self.index = index
        self.time_index = time_index
        self.agg_primitives = agg_primitives
        self.trans_primitives = trans_primitives

    def __repr__(self):
        # ``self.__dict__`` also carries ``copy``; ``str.format`` simply
        # ignores keyword arguments the template does not mention.
        return (
            "DFS(max_depth={max_depth},\n"
            " encode={encode},\n"
            " remove_low_information={remove_low_information},\n"
            " target_entity={target_entity},\n"
            " index={index},\n"
            " time_index={time_index},\n"
            " agg_primitives={agg_primitives},\n"
            " trans_primitives={trans_primitives})"
        ).format(**self.__dict__)

    def _get_index(self, X):
        """Turn the index of ``X`` into a regular, uniquely named column.

        Returns:
            tuple: ``(X, index)`` where ``index`` is the name of the new
            column holding the former index values.
        """
        if self.copy:
            X = X.copy()

        index = X.index.name or 'index'
        # Prepend underscores until the name does not clash with a column.
        while index in X.columns:
            index = '_' + index

        X.index.name = index
        X.reset_index(inplace=True)

        return X, index

    def _get_entityset(self, X, target_entity, entities, relationships):
        """Build an ``ft.EntitySet`` from ``entities`` or, lacking those, ``X``.

        When ``entities`` is ``None`` a single entity named ``'X'`` is
        created from ``X``, indexed by the column that ``_get_index``
        generates. ``target_entity`` is accepted but not used here.
        """
        if entities is None:
            X, index = self._get_index(X)
            entities = {
                'X': (X, index)
            }

        if relationships is None:
            relationships = []

        return ft.EntitySet('entityset', entities, relationships)

    def _get_instance_selection(self, X):
        """Work out which instances the feature matrix is computed for.

        Returns:
            tuple: ``(instance_ids, cutoff_time)``; at most one of the two
            is not ``None``.

            * ``X`` is ``None``: ``(None, None)``. There is no table to
              read ids or times from, so no restriction is passed on.
            * ``self.time_index`` set: ``cutoff_time`` is the
              ``[index, time_index]`` slice of ``X``.
            * ``self.index`` set: ``instance_ids`` is that column of ``X``.
            * otherwise: ``instance_ids`` is the index values of ``X``.
        """
        if X is None:
            return None, None

        if self.time_index:
            return None, X[[self.index, self.time_index]]

        if self.index:
            return X[self.index], None

        return X.index.values, None

    def dfs(self, X=None, target_entity=None, entityset=None, entities=None, relationships=None):
        """Learn the feature definitions and store them in ``self.features``.

        Args:
            X: table to build a single-entity entityset from and/or to
                take instance ids / cutoff times from. May be ``None``
                when ``entityset`` is given.
            target_entity: entity to build features for. Ignored (forced
                to ``'X'``) when neither ``entities`` nor ``entityset``
                is given; falls back to ``self.target_entity`` when empty.
            entityset: prebuilt entityset; built from the other arguments
                when ``None``.
            entities: passed to ``ft.EntitySet`` when building one.
            relationships: passed to ``ft.EntitySet`` when building one.

        Returns:
            None. The feature matrix computed here is only used to let
            the encoding and low-information steps refine
            ``self.features``.
        """
        if not entities and not entityset:
            target_entity = 'X'
        else:
            target_entity = target_entity or self.target_entity

        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities, relationships)

        # Deliberately evaluated after the entityset is built, as before:
        # with ``copy=False`` the index reset above is visible on ``X``.
        instance_ids, cutoff_time = self._get_instance_selection(X)

        self.features = ft.dfs(
            cutoff_time=cutoff_time,
            instance_ids=instance_ids,
            max_depth=self.max_depth,
            entityset=entityset,
            target_entity=target_entity,
            features_only=True,
            agg_primitives=self.agg_primitives,
            trans_primitives=self.trans_primitives
        )

        X = ft.calculate_feature_matrix(
            self.features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            instance_ids=instance_ids,
        )

        if self.encode:
            X, self.features = ft.encode_features(X, self.features)

        if self.remove_low_information:
            X, self.features = remove_low_information_features(X, self.features)

    def calculate_feature_matrix(self, X, target_entity=None, entityset=None,
                                 entities=None, relationships=None):
        """Compute the feature matrix for the definitions in ``self.features``.

        Args:
            X: table to build a single-entity entityset from and/or to
                take instance ids / cutoff times from.
            target_entity: only handed to ``_get_entityset``.
            entityset: prebuilt entityset; built from the other arguments
                when ``None``.
            entities: passed to ``ft.EntitySet`` when building one.
            relationships: passed to ``ft.EntitySet`` when building one.

        Returns:
            The result of ``ft.calculate_feature_matrix``.
        """
        if entityset is None:
            entityset = self._get_entityset(X, target_entity, entities, relationships)

        instance_ids, cutoff_time = self._get_instance_selection(X)

        X = ft.calculate_feature_matrix(
            self.features,
            entityset=entityset,
            cutoff_time=cutoff_time,
            instance_ids=instance_ids,
        )

        return X
def entity_from_dataframe(entityset, entityset_id, entity_id, dataframe, index=None,
                          variable_types=None, make_index=False, time_index=None,
                          secondary_time_index=None, already_sorted=False):
    """Register a copy of ``dataframe`` as an entity and return the entityset.

    When ``entityset`` is ``None`` a new ``ft.EntitySet`` is created with
    ``entityset_id`` as its id; otherwise the given entityset is used and
    ``entityset_id`` is ignored. All remaining arguments are forwarded
    positionally, in the order declared here, to the entityset's own
    ``entity_from_dataframe`` method.

    Returns:
        The entityset the entity was added to.
    """
    target = ft.EntitySet(entityset_id) if entityset is None else entityset

    target.entity_from_dataframe(
        entity_id,
        dataframe.copy(),  # the caller's dataframe itself is never handed over
        index,
        variable_types,
        make_index,
        time_index,
        secondary_time_index,
        already_sorted
    )

    return target
def add_relationship(entityset, parent, parent_column, child, child_column):
    """Link a parent column to a child column inside ``entityset``.

    Both variables are looked up as ``entityset[entity][column]``, wrapped
    in an ``ft.Relationship`` (parent first, child second) and added to
    the entityset.

    Returns:
        The same ``entityset`` object that was passed in.
    """
    link = ft.Relationship(
        entityset[parent][parent_column],
        entityset[child][child_column]
    )
    entityset.add_relationship(link)

    return entityset