In [0]:
import fifeforspark
from fifeforspark.utils import create_example_data2
from fifeforspark.processors import PanelDataProcessor
from fifeforspark.lgb_modelers import LGBSurvivalModeler


In [0]:
# Generate the example panel dataset provided by fifeforspark.utils.
# NOTE(review): the import cell brings in `create_example_data2`, but this call goes
# through the package path to `create_example_data` -- confirm which generator is intended.
data = fifeforspark.utils.create_example_data()

# Number of intervals reserved for testing; the same value is passed as `n_intervals`
# when the model is built.
test_intervals = 4

# Run the raw panel through the PanelDataProcessor.
#   * TEST_INTERVALS is set so that the model can be tested afterwards.
#   * shuffle_parts=20 reduces overhead, since this dataset isn't that large.
processor = PanelDataProcessor(
    data=data,
    config={'TEST_INTERVALS': test_intervals},
    shuffle_parts=20,
)
processor.build_processed_data()

  Expected bytes, got a 'int' object
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
Time identifier column name not given; assumed to be second-leftmost column (period)
Individual identifier column name not given; assumed to be leftmost column (individual)
Out[2]: DataFrame[individual: int, period: int, feature_1: float, feature_2: string, feature_3: float, feature_4: string, feature_5: string, _period: int, _predict_obs: boolean, _test: boolean, _validation: boolean, _maximum_lead: int, _spell: bigint, _duration: bigint, _event_observed: boolean]

In [0]:
# Build the survival model from the processed panel data. Per the original note, extra
# parameters passed to the modeler are forwarded to lightgbm as well.
modeler = LGBSurvivalModeler(data=processor.data)
# Train with n_intervals equal to the number of test intervals chosen earlier; the progress
# log shows one model being trained per time horizon (0 through 3 here).
modeler.build_model(n_intervals=test_intervals)


  0%|          | 0/4 [00:00<?, ?it/s]Training models. Currently training model for time horizon 0:   0%|          | 0/4 [00:00<?, ?it/s]Training models. Currently training model for time horizon 0:  25%|██▌       | 1/4 [00:30<01:30, 30.30s/it]Training models. Currently training model for time horizon 1:  25%|██▌       | 1/4 [00:30<01:30, 30.30s/it]Training models. Currently training model for time horizon 1:  50%|█████     | 2/4 [00:59<00:59, 29.67s/it]Training models. Currently training model for time horizon 2:  50%|█████     | 2/4 [00:59<00:59, 29.67s/it]Training models. Currently training model for time horizon 2:  75%|███████▌  | 3/4 [01:22<00:26, 26.63s/it]Training models. Currently training model for time horizon 3:  75%|███████▌  | 3/4 [01:22<00:26, 26.63s/it]Training models. Currently training model for time horizon 3: 100%|██████████| 4/4 [01:47<00:00, 25.98s/it]Training models. Currently training model for time horizon 3: 100%|██████████| 4/4 [01:47<00:00, 26.88s/i

In [0]:
# Smallest `_period` value present in the modeling data.
# NOTE(review): the original comment says the subset built below is equivalent to
# predict()'s default and is passed only to demonstrate the `subset` argument --
# confirm that against the library.
period_column = modeler.data['_period']
min_val = (
    modeler.data
    .select(period_column)
    .agg({'_period': 'min'})
    .first()[0]
)

# One-column boolean frame named "subset" that flags rows whose `_period` equals min_val.
# (The trailing select('subset') re-projects the only column; kept as in the original.)
subset_flag = (period_column == min_val).alias("subset")
evaluation_subset = modeler.data.select(subset_flag).select('subset')

# Predict the survival probabilities for the flagged rows and preview them.
predictions = modeler.predict(subset=evaluation_subset)
predictions.show()

+-------------+-------------+-------------+-------------+
|probability_1|probability_2|probability_3|probability_4|
+-------------+-------------+-------------+-------------+
|          1.0|    0.9999996|    0.9999995|    0.9999991|
|          1.0|   0.99999994|    0.9999997|    0.9999955|
|    0.9999537|    0.9750392|   0.14177465|   0.08792964|
|          1.0|    0.9999998|   0.99999905|   0.99999803|
|    0.9999999|    0.9999853|   0.99997985|    0.9997495|
|          1.0|    0.9999999|    0.9999999|    0.9999999|
|          1.0|   0.99999624|   0.99988943|    0.9903598|
|          1.0|          1.0|          1.0|          1.0|
|          1.0|          1.0|   0.99999994|    0.9999983|
|          1.0|    0.9999981|   0.99995315|    0.9920869|
|    0.9999982|   0.99999374|    0.9999896|    0.9909796|
|          1.0|          1.0|   0.99999994|    0.9999984|
|   0.99999994|    0.9999996|    0.9999986|     0.999402|
|   0.99982005|    0.7238621|  0.089254715|  0.035922796|
|          1.0

In [0]:
# Score the trained model to see how we did. The result is a table indexed by lead length
# with AUROC, predicted vs. actual share, and confusion-matrix counts; leaving `evaluation`
# as the cell's last expression displays it.
evaluation = modeler.evaluate()
evaluation

Evaluating Model by Lead Length:   0%|          | 0/4 [00:00<?, ?it/s]Evaluating Model by Lead Length:  25%|██▌       | 1/4 [00:29<01:27, 29.24s/it]Evaluating Model by Lead Length:  50%|█████     | 2/4 [00:59<00:59, 29.94s/it]Evaluating Model by Lead Length:  75%|███████▌  | 3/4 [01:31<00:30, 30.91s/it]Evaluating Model by Lead Length: 100%|██████████| 4/4 [02:06<00:00, 32.61s/it]Evaluating Model by Lead Length: 100%|██████████| 4/4 [02:06<00:00, 31.74s/it]


Unnamed: 0_level_0,AUROC,Predicted Share,Actual Share,True Positives,False Negatives,False Positives,True Negatives
Lead Length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.978588,0.9118,0.901035,3478,91,129,263
2,0.97802,0.816238,0.800555,3061,110,171,619
3,0.97865,0.727727,0.713961,2693,135,169,964
4,0.974741,0.647812,0.630144,2355,141,224,1241


In [0]:
# And finally, forecast out from the last period of data. The displayed table holds the
# 1- through 4-period survival probabilities, one row per index value.
forecasts = modeler.forecast()
forecasts

Unnamed: 0,1-period Survival Probability,2-period Survival Probability,3-period Survival Probability,4-period Survival Probability
0,1.0,0.999995,0.999993,0.9999875
1,0.999999,0.999992,0.999405,0.9846355
2,1.0,0.999859,0.908467,0.4134963
3,1.0,0.999941,0.996179,0.8705915
4,1.0,0.99995,0.998703,0.8723884
5,1.0,0.999921,0.999309,0.9924348
6,0.252194,0.002203,0.000735,0.0001698887
7,1.0,1.0,1.0,0.9999992
8,0.999598,0.991504,0.971303,0.607249
9,0.999973,0.978368,0.751363,0.3636699
