In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sksurv.datasets import load_breast_cancer
from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.preprocessing import OneHotEncoder

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Load the full feature table; the first CSV column becomes the row index.
data = pd.read_csv('./data/complete_dataset_v3.csv', index_col = 0)
data

In [None]:
# Rename the last-contact column; the new name is the one removed from X
# before the models are fitted.
data = data.rename(columns={"debtor_last_contact": "days_since_last_contact"})

In [None]:
# Discard the "Notice of foreclosure on goods" 2 and 3 action columns.
data = data.drop(
    columns=[
        'action_Outgoing letter debtor - Notice of foreclosure on goods 2',
        'action_Outgoing letter debtor - Notice of foreclosure on goods 3',
    ]
)

In [None]:
data.columns

In [None]:
# Preview the label table, indexed by dossier number.
# NOTE(review): the following cell re-reads the same file and rebinds
# `labels`, so this cell only serves as a display of the raw table.
labels = pd.read_csv('./data/labels.csv', index_col = 'dossier_nr')
labels

In [None]:
# Keep only the labels whose index value (first CSV column) occurs as a
# dossier number in `data`, then move that index back into a regular column.
labels = (
    pd.read_csv('./data/labels.csv', index_col=0)
    .loc[lambda frame: frame.index.isin(data['dossier_nr'].unique())]
    .reset_index()
)
labels.head()

# Splitting dossiers into train and test sets

In [None]:
# Read the debtor metadata and restrict it to dossiers present in `labels`.
dossier_info = (
    pd.read_excel('./data/debtors info.xlsx')
    .loc[lambda info: info.dossier_nr.isin(labels.dossier_nr)]
)

In [None]:
# Parse the dossier closing date into datetimes so it can be compared with
# the `flip_time` cut-off when the dossiers are split into train and test.
# `assign` builds a new frame instead of writing a column into the
# boolean-filtered `dossier_info` slice, which avoids pandas'
# SettingWithCopyWarning.
dossier_info = dossier_info.assign(
    datum_afsluiten=pd.to_datetime(dossier_info['datum_afsluiten'])
)

In [None]:
# Order the dossiers by `case_end`; the resulting row order defines the rank
# used to sort `data`.
labels = labels.sort_values(by='case_end')

In [None]:

# Map each dossier number to its position in the current `labels` row order.
sorterIndex = {dossier: rank for rank, dossier in enumerate(labels.dossier_nr)}

In [None]:
data

In [None]:
# Temporary sort key: the rank of each row's dossier according to `sorterIndex`.
data['Tm_Rank'] = data['dossier_nr'].map(sorterIndex)

In [None]:
# Reorder the feature rows by the dossier rank computed from `labels`.
data = data.sort_values(by='Tm_Rank')

In [None]:
# The helper rank column is only needed for sorting; remove it again.
data = data.drop(columns=['Tm_Rank'])

In [None]:
# Cut-off date of the temporal split: dossiers closed before it form the
# training set, dossiers closed on or after it form the test set.
flip_time = np.datetime64('2020-10-06')

In [None]:
# labels_train = labels[labels.case_end < flip_time]
# labels_test = labels[labels.case_end >= flip_time]

In [None]:
# Temporal split on the closing date: strictly before the cut-off -> train,
# on or after the cut-off -> test.
dossiers_train = dossier_info.loc[dossier_info.datum_afsluiten < flip_time]
dossiers_test = dossier_info.loc[dossier_info.datum_afsluiten >= flip_time]

In [None]:
# Remove `case_end` so it is neither exported with the train/test tables nor
# used as a model feature.
data = data.drop(columns='case_end')

In [None]:
# data = data[data.duration <= 150]

In [None]:
# Assign every feature row to train or test according to its dossier's side
# of the temporal split.
train_data = data.loc[data.dossier_nr.isin(dossiers_train.dossier_nr)]
test_data = data.loc[data.dossier_nr.isin(dossiers_test.dossier_nr)]

In [None]:
# Persist both splits (row index included) for reuse outside this notebook.
train_data.to_csv('./data/train_data.csv')
test_data.to_csv('./data/test_data.csv')

### Train set

In [None]:
# Features: every column except the two outcome columns.
X_train = train_data.drop(columns=['label', 'duration'])

# Outcome as a NumPy structured array with fields (label, duration); `label`
# is coerced to bool first (presumably the event indicator expected by
# scikit-survival — confirm).
y_train = np.array(
    train_data.loc[:, ['label', 'duration']]
    .assign(label=lambda outcome: outcome['label'].map(bool))
    .to_records(index=False)
)

### Test set

In [None]:
# Features: every column except the two outcome columns.
X_test = test_data.drop(columns=['label', 'duration'])

# Outcome as a NumPy structured array with fields (label, duration); `label`
# is coerced to bool first (presumably the event indicator expected by
# scikit-survival — confirm).
y_test = np.array(
    test_data.loc[:, ['label', 'duration']]
    .assign(label=lambda outcome: outcome['label'].map(bool))
    .to_records(index=False)
)

In [None]:
# The dossier identifier was only needed for splitting; it is not a feature.
data = data.drop(columns=['dossier_nr'])

In [None]:
# Alias only: `dataset` refers to the same DataFrame object as `data`
# (no copy is made here).
dataset = data

In [None]:
# dataset = pd.concat([data, labels],  axis=1, join="inner")
# dataset

In [None]:
# X = dataset.copy().iloc[:, :-2]
# y = dataset.copy().iloc[:, -2:]

# Full-data feature matrix: every column except the two outcome columns.
X = dataset.drop(columns=['label', 'duration'])

# Full-data outcome as a NumPy structured array with fields (label, duration),
# with `label` coerced to bool.
y = np.array(
    dataset.loc[:, ['label', 'duration']]
    .assign(label=lambda outcome: outcome['label'].map(bool))
    .to_records(index=False)
)

In [None]:
# Random 70/30 split of the full data, stratified on the `label` field.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test and thereby
# discards the date-based split built from train_data / test_data earlier in
# the notebook — confirm which split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y['label'],
)

In [None]:
data

In [None]:
y_test

In [None]:
X_train

In [None]:
y_train

In [None]:
X.columns

In [None]:
# Remove three columns that must not be used as model features.
# NOTE(review): X_train / X_test were split off before this drop and still
# contain these columns; only `X` is affected.
X = X.drop(
    columns=[
        'debiteur_relatie_nr',
        'days_since_last_payment',
        'days_since_last_contact',
    ]
)

In [None]:
from sksurv.linear_model import CoxnetSurvivalAnalysis

In [None]:
# Fit an elastic-net penalised Cox model (l1_ratio=0.99) on the full data set;
# the fitted estimator is displayed as the cell output.
# NOTE(review): the features are not standardised here, unlike in the
# StandardScaler pipelines used further down — confirm this is intended.
cox_elastic_net = CoxnetSurvivalAnalysis(l1_ratio=0.99, fit_baseline_model=True)
cox_elastic_net.fit(X, y)

In [None]:
def plot_coefficients(coefs, n_highlight):
    """Plot every feature's coefficient path against the penalty strength.

    Parameters
    ----------
    coefs : pandas.DataFrame
        One row per feature and one column per alpha. The column labels are
        the alpha values (x axis, log scale); the cells are the coefficients.
    n_highlight : int
        Number of features, ranked by absolute coefficient at the smallest
        alpha, whose names are written next to their curve.
    """
    _, ax = plt.subplots(figsize=(9, 6))
    alphas = coefs.columns
    for row in coefs.itertuples():
        # row[0] is the index label; the remaining entries are the coefficients.
        ax.semilogx(alphas, row[1:], ".-", label=row.Index)

    # Pick the smallest-alpha column by position rather than by label: callers
    # may pass rounded alphas as labels, and a label lookup would return
    # several columns if two rounded values collide.
    alpha_min = alphas.min()
    coefs_at_min = coefs.iloc[:, alphas.argmin()]
    top_coefs = coefs_at_min.abs().sort_values().tail(n_highlight)
    for name in top_coefs.index:
        # Draw on the explicit axes instead of relying on pyplot's current axes.
        ax.text(
            alpha_min, coefs_at_min.loc[name], name + "   ",
            horizontalalignment="right",
            verticalalignment="center"
        )

    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    ax.grid(True)
    ax.set_xlabel("alpha")
    ax.set_ylabel("coefficient")

In [None]:
# Coefficient path of the elastic-net fit: one row per feature, one column per
# alpha on the estimator's path (labels rounded to 5 decimals).
# NOTE(review): rounding can make two alpha labels identical — confirm the
# path is coarse enough for the labels to stay unique.
coefficients_elastic_net = pd.DataFrame(
    cox_elastic_net.coef_,
    index=X.columns,
    columns=np.round(cox_elastic_net.alphas_, 5)
)

# Annotate the 5 features with the largest absolute coefficient at the
# smallest alpha.
plot_coefficients(coefficients_elastic_net, n_highlight=5)

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features, then fit a Coxnet model on the full data; the
# alpha path estimated by this fit is reused as the grid-search candidates.
coxnet_pipe = make_pipeline(
    StandardScaler(),
    CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.01, max_iter=100)
)
# This filter is installed globally (no context manager), so
# ConvergenceWarning stays silenced in this process for later cells as well.
warnings.simplefilter("ignore", ConvergenceWarning)
coxnet_pipe.fit(X, y)

In [None]:
# Candidate penalties: the alpha path found by the pipeline fit on the full data.
estimated_alphas = coxnet_pipe.named_steps["coxnetsurvivalanalysis"].alphas_
# Shuffled 5-fold CV with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
# Each grid entry is a one-element list, i.e. every candidate model is fitted
# for exactly one alpha. The scaler lives inside the pipeline, so it is
# re-fitted on the training part of each fold.
# error_score=0.5 is the score recorded for a fold whose fit raises
# (presumably chosen as the chance-level concordance — confirm).
gcv = GridSearchCV(
    make_pipeline(StandardScaler(), CoxnetSurvivalAnalysis(l1_ratio=0.9)),
    param_grid={"coxnetsurvivalanalysis__alphas": [[v] for v in estimated_alphas]},
    cv=cv,
    error_score=0.5,
    n_jobs=-1).fit(X, y)

# Tabular view of the per-candidate CV results, used for plotting.
cv_results = pd.DataFrame(gcv.cv_results_)

In [None]:
X

In [None]:
# Unwrap the one-element alpha lists and pull the CV score statistics.
alphas = cv_results["param_coxnetsurvivalanalysis__alphas"].map(
    lambda candidate: candidate[0]
)
mean = cv_results["mean_test_score"]
std = cv_results["std_test_score"]

# Mean CV score per alpha with a +/- one-std band; the vertical line marks the
# selected alpha and the dashed horizontal line marks 0.5.
fig, ax = plt.subplots(figsize=(9, 6))
ax.plot(alphas, mean)
ax.fill_between(alphas, mean - std, mean + std, alpha=.15)
ax.set(xscale="log", ylabel="concordance index", xlabel="alpha")
ax.axvline(gcv.best_params_["coxnetsurvivalanalysis__alphas"][0], c="C1")
ax.axhline(0.5, color="grey", linestyle="--")
ax.grid(True)

In [None]:
import seaborn as sns
sns.set_theme(style="darkgrid")

In [None]:
# Cox step of the best pipeline found by the grid search. Its coefficients
# refer to the standardised features, because the pipeline scales before
# fitting.
best_model = gcv.best_estimator_.named_steps["coxnetsurvivalanalysis"]
best_coefs = pd.DataFrame(
    best_model.coef_,
    index=X.columns,
    columns=["coefficient"]
)

# Count the features that were not shrunk to exactly zero.
non_zero = np.sum(best_coefs.iloc[:, 0] != 0)
print("Number of non-zero coefficients: {}".format(non_zero))

# Keep the selected features and order them by absolute coefficient (ascending).
non_zero_coefs = best_coefs.query("coefficient != 0")
coef_order = non_zero_coefs.abs().sort_values("coefficient").index

# Horizontal bar chart of the absolute coefficients; the sign is not shown.
_, ax = plt.subplots(figsize=(6, 12))
non_zero_coefs.loc[coef_order].abs().plot.barh(ax=ax, legend=False)
ax.set_xlabel("Relative feature importance")
# ax.set_title('Top 25 features with highest relative feature  importance as calculated by the Elastic Net Cox Regression model')
# plt.tight_layout()
ax.grid(True)
# No file extension is given, so the output format is matplotlib's default;
# the ./figures directory is assumed to exist already — TODO confirm.
plt.savefig('./figures/relative_feature_importance')

In [None]:
# Print the absolute coefficients, largest first, as a LaTeX table; the column
# width option is raised temporarily for the duration of this block.
with pd.option_context("max_colwidth", 1000):
    print(non_zero_coefs.loc[coef_order].abs().iloc[::-1].to_latex())

In [None]:
non_zero_coefs

In [None]:
# Persist the selected (non-zero) coefficients, signs included.
non_zero_coefs.to_csv('./data/non_zero_coefs.csv')

In [None]:
non_zero

In [None]:
non_zero_coefs.abs().sort_values("coefficient")

In [None]:
X

In [None]:
len(best_model.coef_)

In [None]:
X.std()

In [None]:
# `X` is a pandas DataFrame, which has no `toarray()` method (that belongs to
# SciPy sparse matrices); `to_numpy()` returns the values as a NumPy array.
X.to_numpy()

In [None]:
len(X)

In [None]:
best_model.coef_.flatten()

In [None]:
# Standardise the features with a StandardScaler fitted on all of X, and
# rebuild a DataFrame that keeps X's row index and column names so the scaled
# values stay aligned with X.
# NOTE: despite its name, `X_imputed_df` holds scaled values; no imputation is
# performed in this cell.
scaler = StandardScaler()
p = scaler.fit_transform(X)
X_imputed_df = pd.DataFrame(p, index=X.index, columns=X.columns)

In [None]:
# Per-feature standard deviation of the scaled data times the fitted
# coefficient, sorted ascending.
# NOTE(review): the columns were already standardised, so each std is expected
# to be close to 1 and this table close to the raw coefficients — confirm
# this is the intended importance measure.
pd.DataFrame(X_imputed_df.std()*best_model.coef_.flatten()).sort_values(by=0)

In [None]:
# NOTE(review): `best_model` comes from a pipeline that standardises the
# features before fitting, while `X` here is unscaled; multiplying those
# coefficients by the raw standard deviations may not give a meaningful
# quantity — confirm the intended formula.
print(X.std()*best_model.coef_.flatten())