In [None]:
# import packages
import numpy as np
import pandas as pd
import pyreadstat
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# ignore warnings
# Installs a global "ignore" filter, so every warning raised in later cells is
# suppressed for the rest of the kernel session.
# NOTE(review): this also hides pandas deprecation / future warnings emitted by
# the groupby and plotting cells below - consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# load data
# Two Excel workbooks and one SPSS .sav file are read from the working directory.
# read_sav returns the data frame together with a metadata object.
ziekte_data = pd.read_excel(io="ziekte_lang.xlsx")
sympro_data, meta = pyreadstat.read_sav("20230321_SYMPRO.sav")
behandeling_data = pd.read_excel(io="behandeling_lang.xlsx")

In [None]:
# choose columns for the model
# Treatments: drop the per-treatment detail columns, then keep one row per
# respondent and treatment-combination period (first non-null value per column).
# NOTE(review): groupby drops rows whose key is missing by default, so a
# combination without a start or stop date is not carried forward - confirm
# that this is intended (pass dropna=False to keep such rows).
treatment_data = behandeling_data.drop(
    columns=["event_key", "behandeling", "startdatum_behandeling", "stopdatum_behandeling",
             "behandeling_categorie", "epis_nr"])
treatment_data = (
    treatment_data
    .groupby(["sympro_respondent", "startdatum_behandelcombinatie", "stopdatum_behandelcombinatie"])
    .first()
    .reset_index()
)

# Baseline questionnaire columns; the id column is renamed so that it matches
# the key used by the other two tables.
symptoms_data = sympro_data.loc[:, ["respondent", "bas_dem_age", "bas_dem_gender", "t0crf_klinv1", "mutatie_cat", "T0_comb"]]
symptoms_data = symptoms_data.rename(columns={"respondent": "sympro_respondent"})
# encode age into bins
# Intervals are closed on the right: [18, 44], (44, 54], (54, 65], (65, 100].
# include_lowest=True keeps an age of exactly 18 inside the first bin; without
# it the left edge is open and such a respondent would silently become NaN.
bins = [18, 44, 54, 65, 100]
labels = ['under 45', '45-54', '55-65', 'over 65']
symptoms_data["bas_dem_age"] = pd.cut(symptoms_data["bas_dem_age"], bins=bins, labels=labels,
                                      right=True, include_lowest=True, ordered=True)

# Disease-course columns used for the stage and recurrence features.
recurrence_data = ziekte_data.loc[:, ["sympro_respondent", "event_date", "stadium_tnm7_8", "recurrence"]]

In [None]:
# merge datasets
# Two successive inner joins on the respondent id: a respondent must appear in
# the treatment, recurrence and symptoms tables to be kept. Every treatment row
# is paired with every recurrence row of the same respondent.
merged_data = (
    treatment_data
    .merge(recurrence_data, on='sympro_respondent', how='inner')
    .merge(symptoms_data, on='sympro_respondent', how='inner')
)

# t0 - three months
def after_three_months(participant):
    """Keep one participant's rows that fall inside the first three months.

    The window opens at the participant's earliest ``event_date`` and closes
    (exclusively) three calendar months later. A row is kept only when both
    its ``event_date`` and its ``startdatum_behandelcombinatie`` lie strictly
    before that closing date; rows with a missing date in either column are
    dropped because the comparison is False for them.

    Parameters
    ----------
    participant : pandas.DataFrame
        Rows of a single respondent with datetime columns ``event_date`` and
        ``startdatum_behandelcombinatie``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``participant`` inside the window, original index kept.
    """
    window_end = participant['event_date'].min() + pd.DateOffset(months=3)
    event_in_window = participant['event_date'] < window_end
    start_in_window = participant['startdatum_behandelcombinatie'] < window_end
    return participant[event_in_window & start_in_window]

# Apply the three-month window per respondent by iterating over the groups.
# groupby(...).apply(func) handing the grouping column to func is deprecated
# since pandas 2.2; without that column the second groupby below cannot find
# 'sympro_respondent'. Iterating always yields the complete sub-frame.
windows_t0 = [after_three_months(group) for _, group in merged_data.groupby("sympro_respondent")]
merged_data_t0 = pd.concat(windows_t0) if windows_t0 else merged_data.iloc[:0]
# Collapse to one row per respondent: each column becomes a ', '-joined string
# of its unique values (as text, sorted lexicographically).
merged_data_t0 = merged_data_t0.groupby('sympro_respondent').agg(lambda x: ', '.join(sorted(set(x.astype(str))))).reset_index()

# t1 - six months
def after_six_months(participant):
    """Keep one participant's rows that fall inside the first six months.

    The window opens at the participant's earliest ``event_date`` and closes
    (exclusively) six calendar months later. A row is kept only when both its
    ``event_date`` and its ``startdatum_behandelcombinatie`` lie strictly
    before that closing date; rows with a missing date in either column are
    dropped because the comparison is False for them.

    Parameters
    ----------
    participant : pandas.DataFrame
        Rows of a single respondent with datetime columns ``event_date`` and
        ``startdatum_behandelcombinatie``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``participant`` inside the window, original index kept.
    """
    window_end = participant['event_date'].min() + pd.DateOffset(months=6)
    event_in_window = participant['event_date'] < window_end
    start_in_window = participant['startdatum_behandelcombinatie'] < window_end
    return participant[event_in_window & start_in_window]

# Apply the six-month window per respondent by iterating over the groups.
# groupby(...).apply(func) handing the grouping column to func is deprecated
# since pandas 2.2; without that column the second groupby below cannot find
# 'sympro_respondent'. Iterating always yields the complete sub-frame.
windows_t1 = [after_six_months(group) for _, group in merged_data.groupby("sympro_respondent")]
merged_data_t1 = pd.concat(windows_t1) if windows_t1 else merged_data.iloc[:0]
# Collapse to one row per respondent: each column becomes a ', '-joined string
# of its unique values (as text, sorted lexicographically).
merged_data_t1 = merged_data_t1.groupby('sympro_respondent').agg(lambda x: ', '.join(sorted(set(x.astype(str))))).reset_index()

# t2 - a year
def after_year(participant):
    """Keep one participant's rows that fall inside the first year.

    The window opens at the participant's earliest ``event_date`` and closes
    (exclusively) thirteen calendar months later; the extra month is there to
    include edge dates. A row is kept only when both its ``event_date`` and
    its ``startdatum_behandelcombinatie`` lie strictly before that closing
    date; rows with a missing date in either column are dropped because the
    comparison is False for them.

    Parameters
    ----------
    participant : pandas.DataFrame
        Rows of a single respondent with datetime columns ``event_date`` and
        ``startdatum_behandelcombinatie``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``participant`` inside the window, original index kept.
    """
    # 13 rather than 12 months, to include edge dates
    window_end = participant['event_date'].min() + pd.DateOffset(months=13)
    event_in_window = participant['event_date'] < window_end
    start_in_window = participant['startdatum_behandelcombinatie'] < window_end
    return participant[event_in_window & start_in_window]

# Apply the one-year window per respondent by iterating over the groups.
# groupby(...).apply(func) handing the grouping column to func is deprecated
# since pandas 2.2; without that column the second groupby below cannot find
# 'sympro_respondent'. Iterating always yields the complete sub-frame.
windows_t2 = [after_year(group) for _, group in merged_data.groupby("sympro_respondent")]
merged_data_t2 = pd.concat(windows_t2) if windows_t2 else merged_data.iloc[:0]
# Collapse to one row per respondent: each column becomes a ', '-joined string
# of its unique values (as text, sorted lexicographically).
merged_data_t2 = merged_data_t2.groupby('sympro_respondent').agg(lambda x: ', '.join(sorted(set(x.astype(str))))).reset_index()

In [None]:
# helper function
def get_last_valid_value(column, default=4):
    """Return the last non-missing entry of a ', '-separated value string.

    The input is split on ``', '``, each part is stripped, and parts that are
    empty or spell ``nan`` (in any letter case) are discarded. "Last" refers
    to the position in the string; the strings built in this notebook list
    their values sorted as text, so for them it is the lexicographically
    largest value.

    Parameters
    ----------
    column : str
        Joined cell value such as ``'1.0, 2.0, nan'``. Non-string input is
        converted with ``str()`` first, so a float NaN counts as missing.
    default : object, optional
        Value returned when no valid part is left. Defaults to 4, the
        fallback this helper has always used.

    Returns
    -------
    str or object
        The last valid part as a stripped string, or ``default``.
    """
    parts = (part.strip() for part in str(column).split(', '))
    valid = [part for part in parts if part and part.lower() != 'nan']
    return valid[-1] if valid else default

In [None]:
def _build_model_features(merged):
    """Convert one aggregated per-respondent frame into the model table.

    ``merged`` holds one row per respondent whose cells are ', '-joined
    strings of the unique values seen inside a time window. The result has
    the columns ``sympro_respondent``, ``Age``, ``Sex``, ``Stage``,
    ``Treatment_change``, ``Tumor_mutation``, ``Histology``,
    ``Other_conditions`` and ``Recurrence``, in that order.

    Parameters
    ----------
    merged : pandas.DataFrame
        Aggregated frame such as ``merged_data_t0``.

    Returns
    -------
    pandas.DataFrame
        New frame; ``merged`` itself is not modified.
    """
    def to_int(series):
        # Cells look like '1.0', so parse as float before truncating to int.
        return series.apply(lambda x: float(x)).astype(int)

    def to_int_missing_as_zero(series):
        # Same as to_int, but a 'nan' cell is encoded as 0 instead of failing.
        return series.apply(lambda x: 0.0 if x.strip().lower() == 'nan' else float(x)).astype(int)

    features = merged.loc[:, ["sympro_respondent"]].copy()
    # Age stays the bin label here; it is not converted to a number.
    features['Age'] = merged['bas_dem_age']
    features['Sex'] = to_int(merged['bas_dem_gender'])
    features['Stage'] = merged['stadium_tnm7_8'].apply(get_last_valid_value).astype(float).astype(int)
    # 1 when more than one distinct treatment-combination value was listed.
    # NOTE(review): a 'nan' entry counts as a value here - confirm intended.
    features['Treatment_change'] = merged['behandelcombi_binnen_episode'].apply(
        lambda x: 1 if len(str(x).split(', ')) > 1 else 0).astype(int)
    features['Tumor_mutation'] = to_int_missing_as_zero(merged['mutatie_cat'])
    features['Histology'] = to_int(merged['t0crf_klinv1'])
    features['Other_conditions'] = to_int_missing_as_zero(merged['T0_comb'])
    # get_last_valid_value falls back to 4 when no recurrence value is present.
    features['Recurrence'] = merged['recurrence'].apply(get_last_valid_value).astype(float).astype(int)
    return features

#t0
t0 = _build_model_features(merged_data_t0)

#t1
t1 = _build_model_features(merged_data_t1)

#t2
t2 = _build_model_features(merged_data_t2)

In [None]:
# plot age
# The age labels are strings, so sorting the index alphabetically would put
# 'under 45' after 'over 65'. Draw the known bins in their natural order and
# append any other value (e.g. 'nan') afterwards.
age_order = ['under 45', '45-54', '55-65', 'over 65']
age_counts = t0['Age'].value_counts()
bar_order = [label for label in age_order if label in age_counts.index]
bar_order += sorted(value for value in age_counts.index if value not in age_order)

plt.figure(figsize=(5, 3))
age_counts.loc[bar_order].plot(
    kind='bar', color='skyblue', edgecolor='black')
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# plot stage
# Bar chart of the number of respondents per stage value at T0.
fig, ax = plt.subplots(figsize=(5, 3))
stage_counts = t0['Stage'].value_counts().sort_index()
stage_counts.plot(kind='bar', color='skyblue', edgecolor='black', rot=0, ax=ax)
ax.set_title("Stage Distribution")
ax.set_xlabel("Stage")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

In [None]:
# plot sex
# Bar chart of the number of respondents per sex code at T0.
fig, ax = plt.subplots(figsize=(5, 3))
sex_counts = t0['Sex'].value_counts().sort_index()
sex_counts.plot(kind='bar', color='skyblue', edgecolor='black', rot=0, ax=ax)
ax.set_title("Sex Distribution")
ax.set_xlabel("Sex (0 for Female, 1 for Male)")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

In [None]:
# plot comorbidities
# Bar chart of the number of respondents per comorbidity code at T0.
fig, ax = plt.subplots(figsize=(5, 3))
comorbidity_counts = t0['Other_conditions'].value_counts().sort_index()
comorbidity_counts.plot(kind='bar', color='skyblue', edgecolor='black', rot=0, ax=ax)
ax.set_title("Comorbidities Distribution")
ax.set_xlabel("Comorbidities (1 for No, 2 for Yes)")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

In [None]:
# plot tumour mutation
# Bar chart of the number of respondents per tumour-mutation code at T0.
fig, ax = plt.subplots(figsize=(5, 3))
mutation_counts = t0['Tumor_mutation'].value_counts().sort_index()
mutation_counts.plot(kind='bar', color='skyblue', edgecolor='black', rot=0, ax=ax)
ax.set_title("Tumour Mutation Distribution")
ax.set_xlabel("Tumour Mutation")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

In [None]:
# plot histology
# Bar chart of the number of respondents per histology code at T0.
fig, ax = plt.subplots(figsize=(5, 3))
histology_counts = t0['Histology'].value_counts().sort_index()
histology_counts.plot(kind='bar', color='skyblue', edgecolor='black', rot=0, ax=ax)
ax.set_title("Histology Distribution (0 for NSCLC)")
ax.set_xlabel("Histology")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

In [None]:
# plot treatment_change at each point
fig, axes = plt.subplots(1, 3, figsize=(12,4))
for i, (ax, frame) in enumerate(zip(axes, (t0, t1, t2))):
    # value_counts() orders by frequency, so relabelling the bars as [0, 1]
    # afterwards would swap the labels whenever 1 is the more common value.
    # Fix the order to 0, 1 (Treatment_change only takes these two values) and
    # show a zero-height bar when one of them does not occur.
    counts = frame['Treatment_change'].value_counts().reindex([0, 1], fill_value=0)
    counts.plot(kind='bar', color='skyblue', edgecolor='black', rot=0, ax=ax)
    ax.set_title(f"Treatment Change Distribution at T{i}")
    ax.set_xlabel("Treatment Change (0 for No, 1 for Yes)")
    ax.set_ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# plot recurrence at each point
fig, axes = plt.subplots(1, 3, figsize=(12,4))
for i, (ax, frame) in enumerate(zip(axes, (t0, t1, t2))):
    # value_counts() orders by frequency, so relabelling the bars as [0, 1]
    # afterwards could attach the wrong label to a bar, and fails when the
    # column holds a different number of distinct values. Sort by value and
    # let the tick labels come from the data itself.
    counts = frame['Recurrence'].value_counts().sort_index()
    counts.plot(kind='bar', color='skyblue', edgecolor='black', rot=0, ax=ax)
    ax.set_title(f"Recurrence Distribution at T{i}")
    ax.set_xlabel("Recurrence (0 for No, 1 for Yes)")
    ax.set_ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# plot correlations for t0
mapping = {"under 45": 1, "45-54": 2, "55-65": 3, "over 65": 4}
# Encode the age bins as ordinal numbers. Values that are not bin labels
# (including numbers produced by an earlier run of this cell) pass through
# unchanged, so re-running the cell no longer wipes the column to NaN the way
# a plain .map(mapping) does. Anything non-numeric that remains becomes NaN.
t0["Age"] = pd.to_numeric(t0["Age"].map(lambda label: mapping.get(label, label)), errors="coerce")
corr = t0.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix for T0')
plt.show()

In [None]:
# plot correlations for t1
mapping = {"under 45": 1, "45-54": 2, "55-65": 3, "over 65": 4}
# Encode the age bins as ordinal numbers. Values that are not bin labels
# (including numbers produced by an earlier run of this cell) pass through
# unchanged, so re-running the cell no longer wipes the column to NaN the way
# a plain .map(mapping) does. Anything non-numeric that remains becomes NaN.
t1["Age"] = pd.to_numeric(t1["Age"].map(lambda label: mapping.get(label, label)), errors="coerce")
corr = t1.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix for T1')
plt.show()

In [None]:
# plot correlations for t2
mapping = {"under 45": 1, "45-54": 2, "55-65": 3, "over 65": 4}
# Encode the age bins as ordinal numbers. Values that are not bin labels
# (including numbers produced by an earlier run of this cell) pass through
# unchanged, so re-running the cell no longer wipes the column to NaN the way
# a plain .map(mapping) does. Anything non-numeric that remains becomes NaN.
t2["Age"] = pd.to_numeric(t2["Age"].map(lambda label: mapping.get(label, label)), errors="coerce")
corr = t2.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix for T2')
plt.show()

In [None]:
# plot correlations for t2 without mutation
# Same heatmap as for T2, computed on a copy that leaves out Tumor_mutation.
# NOTE(review): uses t2 as it is at this point; presumably its Age column has
# already been converted to numbers by the T2 correlation cell - confirm.
test = t2.drop(columns='Tumor_mutation')
corr = test.corr()
heatmap_ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix for T2 without Tumor_mutation')
plt.show()

In [None]:
# plot strongest correlations
# Top row: Stage split by Recurrence; bottom row: Treatment_change split by
# Recurrence. Columns are the three time points T0, T1, T2.
fig, axes = plt.subplots(2, 3, figsize=(12,8))
for i, frame in enumerate((t0, t1, t2)):
    sns.countplot(data=frame, ax=axes[0, i], x='Stage', hue='Recurrence')
    axes[0, i].set_title(f"Stage by Recurrence at T{i}")
    sns.countplot(data=frame, ax=axes[1, i], x='Treatment_change', hue='Recurrence')
    axes[1, i].set_title(f"Treatment Change by Recurrence at T{i}")
# tight_layout must be called; the bare attribute reference did nothing.
plt.tight_layout()
plt.show()

In [None]:
# plot weakest correlation
# Sex split by Recurrence, one panel per time point T0, T1, T2.
fig, axes = plt.subplots(1, 3, figsize=(12,4))
for i, frame in enumerate((t0, t1, t2)):
    sns.countplot(data=frame, ax=axes[i], x='Sex', hue='Recurrence')
    axes[i].set_title(f"Sex by Recurrence at T{i}")
# tight_layout must be called; the bare attribute reference did nothing.
plt.tight_layout()
plt.show()