In [None]:
from lifelines import WeibullAFTFitter
from pyspark.sql.session import SparkSession, SparkConf
import pyspark.sql.functions as f
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10,10)
import pandas as pd
import seaborn as sns
import numpy as np
from lifelines.utils import k_fold_cross_validation

In [None]:
df = pd.read_parquet('dados_modelo.parquet')

In [None]:
df = df.dropna()

In [None]:
cnpj = df['cnpj']

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df = pd.get_dummies(df.drop(columns=['cnpj']),drop_first=True)

In [None]:
df['cnpj'] = cnpj

In [None]:
df

In [None]:
df = df[df['idade_anos'] > 0]

In [None]:
aft = WeibullAFTFitter(penalizer=0.01)

In [None]:
df = df.dropna()

In [None]:
df.columns

In [None]:
df[['latitude', 'longitude']]

In [None]:
df['latitude'] = df['latitude'] + 21.558548
df['longitude'] = df['longitude'] + 45.4394151

In [None]:
df.drop(columns=['grande_area_ELETRICIDADE E GAS'], inplace=True)

In [None]:
# Fit the Weibull AFT model on every column except 'cnpj'.
# 'cnpj' is only an identifier (it is the join key for the export at the end of
# the notebook). With no formula given, lifelines treats every column other than
# the duration/event columns as a covariate, so leaving 'cnpj' in the frame would
# feed the identifier into the model.
# ancillary=True models the ancillary (shape) parameter with the same covariates.
aft.fit(
    df.drop(columns=['cnpj']),
    duration_col='idade_anos',
    event_col='fechada',
    ancillary=True,
)

In [None]:
aft.print_summary()

In [None]:
print(aft.median_survival_time_)
print(aft.mean_survival_time_)

In [None]:
aft.plot()

In [None]:
df['latitude'].min()

In [None]:
aft.plot_partial_effects_on_outcome('latitude', values=np.arange(-2,2,0.5),cmap='coolwarm')

In [None]:
abertas = df[df['fechada'] == 0]

In [None]:
pred = aft.predict_median(abertas)

In [None]:
pred.max()

In [None]:
# 3-fold cross-validated concordance index.
# k_fold_cross_validation refits the fitter it receives on each training fold, so
# passing `aft` itself would leave it trained on only part of the data and every
# later prediction would silently use that partial model. A separate fitter with
# the same penalizer is used instead.
# fitter_kwargs forwards ancillary=True so the cross-validated specification
# matches the model fitted above, and 'cnpj' (an identifier) is kept out of the
# covariates.
cv_fitter = WeibullAFTFitter(penalizer=aft.penalizer)
scores = k_fold_cross_validation(
    cv_fitter,
    df.drop(columns=['cnpj']),
    duration_col='idade_anos',
    event_col='fechada',
    k=3,
    scoring_method="concordance_index",
    fitter_kwargs={'ancillary': True},
)
print(scores)

In [None]:
ativas = df[df['fechada'] == 0].reset_index()

In [None]:
hazards = aft.predict_cumulative_hazard(ativas,times=[5,10,25,50],conditional_after=ativas['idade_anos']).T.reset_index()

In [None]:
hazards.columns

In [None]:
hazards

In [None]:
ativas['risco_5'] = hazards[5.0]
ativas['risco_10'] = hazards[10.0]
ativas['risco_25'] = hazards[25.0]
ativas['risco_50'] = hazards[50.0]


In [None]:
ativas['tempo_mediano'] = aft.predict_median(ativas, conditional_after=ativas['idade_anos'])

In [None]:
ativas = ativas[ativas['tempo_mediano'] < 100]

In [None]:
h3s = pd.read_parquet('./cnpjs_com_h3.parquet/')

In [None]:
ativas.columns
ativas = ativas[[ 'cnpj', 'risco_5',
       'risco_10', 'risco_25', 'risco_50', 'tempo_mediano', 'idade_anos']]

In [None]:
ativas

In [None]:
ativas.merge(h3s, on='cnpj').to_csv('./AFT_kepler.csv', index=False, mode='w')