In [2]:
import pandas as pd
from cross_validate_survival import cross_validate_survival_model
import utilities.preprocess_raw_data as prep_data
import utilities.generate_survival_dataset as gen_surv
import utilities.train_survival_model as train_surv

In [3]:
raw_merged_df = pd.read_parquet("./data/my_datasets/raw_merged.parquet")

In [4]:
base_df = prep_data.load_base_dataset(raw_merged_df=raw_merged_df)

In [5]:
surv_df = gen_surv.base_to_survival_dataset(base_df)

In [6]:
features = surv_df.columns.drop(["device_uuid", "cycle_id", "event", "duration"])
features = list(set(features).intersection(set(surv_df.columns)))
event = surv_df["event"].astype(bool)
sorted(features)

['batt_max',
 'batt_min',
 'battery_type_id',
 'device_model_code',
 'fw_version_v01.49',
 'fw_version_v01.66',
 'fw_version_v01.70',
 'radio_median',
 'radio_std',
 'temp_median',
 'temp_std']

In [7]:
for feature in features:
    var_events = surv_df.loc[event, feature].var()
    var_cens = surv_df.loc[~event, feature].var()
    print(f"{feature}: {var_events}; {var_cens}")
    if var_events <= 0.15 or var_cens <= 0.15:
        print(f"    Feature {feature} will be removed from features.\n")


fw_version_v01.49: 0.07510204081632653; 0.09801980198019804
    Feature fw_version_v01.49 will be removed from features.

radio_median: 302.2535659367913; 230.8814151550678
fw_version_v01.70: 0.0; 0.1663366336633664
    Feature fw_version_v01.70 will be removed from features.

battery_type_id: 0.12285714285714282; 0.2524752475247524
    Feature battery_type_id will be removed from features.

temp_median: 48.49760564686881; 108.64026874824121
batt_max: 129.49039412172476; 46.36699420476878
fw_version_v01.66: 0.19632653061224492; 0.24831683168316837
device_model_code: 1.6608163265306122; 1.0748514851485147
temp_std: 2.442854364117573; 9.852154720072866
batt_min: 8.182721010600007; 168.63800660382037
radio_std: 57.21727105475869; 62.314957013824305


In [8]:
device_uuids = surv_df["device_uuid"].unique()

In [9]:
print(len(device_uuids))

75


In [10]:
import random
import plotly.express as px

In [13]:
device_uuid_subset = random.sample(list(device_uuids), 40)
surv_df_subset = surv_df[surv_df["device_uuid"].isin(device_uuid_subset)]
event = surv_df_subset["event"].astype(bool)

for feature in features:
    var_events = surv_df_subset.loc[event, feature].var()
    var_cens = surv_df_subset.loc[~event, feature].var()
    print(f"{feature}: {var_events}; {var_cens}")
    if surv_df_subset[feature].nunique() < 5:
        feature_values = surv_df_subset[[feature, "event"]]
        fig = px.histogram(
            feature_values,
            x=feature,
            color="event",
            barmode="group",
            histfunc="count",
            title=f"Feature Werthäufigkeit für {feature}<br>Varianz Event: {var_events:.2f}<br>Varianz Zensiert: {var_cens:.2f}",
            labels={"event": "Ereignis"},
            width=600,
            height=800,
        )
        fig.update_yaxes(title="Anzahl")
        fig.show()
    if var_events <= 0.15 or var_cens <= 0.15:
        print(f"    Feature {feature} will be removed from features.\n")

fw_version_v01.49: 0.0; 0.10407239819004523


    Feature fw_version_v01.49 will be removed from features.

radio_median: 413.6687867885165; 315.9475833435671
fw_version_v01.70: 0.0; 0.18099547511312222


    Feature fw_version_v01.70 will be removed from features.

battery_type_id: 0.15669515669515677; 0.2515082956259427


temp_median: 40.73972356607429; 113.50867353303498
batt_max: 200.0428975281248; 37.04001379821957
fw_version_v01.66: 0.13105413105413105; 0.2515082956259426


    Feature fw_version_v01.66 will be removed from features.

device_model_code: 1.9857549857549857; 1.0931372549019607


temp_std: 2.106408606569279; 11.124080103051613
batt_min: 9.752334982241285; 164.9051236130469
radio_std: 54.43334493358513; 70.76924033606018
