/
multitable.py
94 lines (76 loc) · 3.06 KB
/
multitable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""Usage example for Multitable pipeline on a Retail Dataset."""
import featuretools as ft
import pandas as pd
from sklearn.metrics import f1_score
from mlblocks.ml_pipeline.ml_pipeline import MLPipeline
def make_entity_set(orders_table, order_products_table):
    """Assemble the "instacart" EntitySet used by the multitable example.

    The resulting set holds three entities: "order_products" and "orders",
    loaded from the given tables, and "users", which is split out of
    "orders" by normalizing on "user_id".

    Args:
        orders_table: Table of orders. The code references its "order_id",
            "order_time" and "user_id" columns.
        order_products_table: Table of ordered products. The code references
            its "order_product_id", "order_id", "order_time", "aisle_id",
            "reordered", "department" and "product_name" columns.

    Returns:
        The populated ``ft.EntitySet``.
    """
    entity_set = ft.EntitySet("instacart")

    # Only these two columns get an explicit variable type; the rest are
    # left for featuretools to decide.
    order_products_types = {
        "aisle_id": ft.variable_types.Categorical,
        "reordered": ft.variable_types.Boolean
    }
    entity_set.entity_from_dataframe(
        entity_id="order_products",
        dataframe=order_products_table,
        index="order_product_id",
        variable_types=order_products_types,
        time_index="order_time")

    entity_set.entity_from_dataframe(
        entity_id="orders",
        dataframe=orders_table,
        index="order_id",
        time_index="order_time")

    # Link each order product to its order through the shared "order_id".
    parent_key = entity_set["orders"]["order_id"]
    child_key = entity_set["order_products"]["order_id"]
    entity_set.add_relationship(ft.Relationship(parent_key, child_key))

    # Derive the "users" entity from the "user_id" column of "orders".
    entity_set.normalize_entity(
        base_entity_id="orders", new_entity_id="users", index="user_id")
    entity_set.add_last_time_indexes()

    # Values of interest, assigned in the same order as before:
    # "department" first, then "product_name".
    values_of_interest = (
        ("department", [
            'produce', 'dairy eggs', 'snacks', 'beverages', 'frozen',
            'pantry', 'bakery', 'canned goods', 'deli', 'dry goods pasta'
        ]),
        ("product_name", [
            'Banana', 'Bag of Organic Bananas', 'Organic Baby Spinach',
            'Organic Strawberries', 'Organic Hass Avocado', 'Organic Avocado',
            'Large Lemon', 'Limes', 'Strawberries', 'Organic Whole Milk'
        ]),
    )
    for column_name, values in values_of_interest:
        entity_set["order_products"][column_name].interesting_values = values

    return entity_set
if __name__ == "__main__":
    # Script entry point: load the Retail CSVs, fit the 'dfs' +
    # 'random_forest_classifier' pipeline on 80% of the label times and
    # print the micro-averaged F1 score on the remaining 20%.
    print("""
============================================
Testing Multi Table Pipeline
============================================
""")

    orders = pd.read_csv("data/Retail/orders.csv")
    order_products = pd.read_csv("data/Retail/order_products.csv")
    label_times = pd.read_csv("data/Retail/label_times.csv")

    # Random 80/20 split: the rows not sampled into X become the test set.
    X = label_times.sample(frac=0.8)
    X_test = label_times.drop(X.index)
    # NOTE(review): X and X_test still contain the "label" column after the
    # targets are extracted; confirm the pipeline does not use it as a
    # feature.
    y = X["label"]
    y_test = X_test["label"]

    es = make_entity_set(orders, order_products)

    multitable = MLPipeline.from_ml_json(['dfs', 'random_forest_classifier'])

    # Print the tunable hyperparameters for manual inspection.
    for hyperparam in multitable.get_tunable_hyperparams():
        print(hyperparam)

    # Check that the pipeline contains exactly the expected steps.
    expected_steps = {'dfs', 'rf_classifier'}
    steps = set(multitable.steps_dict.keys())
    assert expected_steps == steps

    # Parameters passed when producing features, at both fit and predict
    # time.
    produce_params = {
        ('dfs', 'entityset'): es,
        ('dfs', 'cutoff_time_in_index'): True
    }

    print("\nFitting pipeline...")
    fit_params = {
        ('dfs', 'entityset'): es,
        ('dfs', 'target_entity'): "users",
        ('dfs', 'training_window'): ft.Timedelta("60 days")
    }
    multitable.fit(X, y, fit_params=fit_params, produce_params=produce_params)
    print("\nFit pipeline.")

    # Check that we can score properly.
    print("\nScoring pipeline...")
    predicted_y_val = multitable.predict(X_test, predict_params=produce_params)
    # scikit-learn's signature is f1_score(y_true, y_pred, ...): the ground
    # truth goes first, the predictions second.
    score = f1_score(y_test, predicted_y_val, average='micro')
    print("\nf1 micro score: %f" % score)