In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.14.0


## Training and test data

In [21]:

cancer_type_columns = [
   "Neoplasm Disease Stage American Joint Committee on Cancer Code", 
   "Neoplasm American Joint Committee on Cancer Clinical Distant Metastasis M Stage", 
   "American Joint Committee on Cancer Metastasis Stage Code", 
   "American Joint Committee on Cancer Lymph Node Stage Code.1", 
   "American Joint Committee on Cancer Lymph Node Stage Code", 
   "Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code", 
   "Neoplasm Disease Stage American Joint Committee on Cancer Code.1", 
   "American Joint Committee on Cancer Publication Version Type", 
   "American Joint Committee on Cancer Tumor Stage Code"
]

pd_df_original = pd.read_csv(
    "./Testicular Cancer Dataset.csv",
    # names=[*cancer_type_columns, "Disease Free (Months)"]
    usecols=[*cancer_type_columns, 'Disease Free (Months)', 'Postoperative tx'],#, 'Disease Free Status']
    na_values=['NA'],
    keep_default_na=False
)

print(len(pd_df_original))

pd_df_original = pd_df_original.dropna()

for row in pd_df_original.iterrows():
    print('Row:', row)

print(len(pd_df_original))

pd_df = pd_df_original.copy()

# target = pd_df.apply(lambda p: p['Disease Free (Months)'] * (1 if p['Disease Free Status'] == '1:Recurred/Progressed' else 1), axis=1)

target = pd_df.pop('Disease Free (Months)')
# pd_df.pop('Disease Free Status')

# target = np.log2(target)

inputs = {}

for name, column in pd_df.items():
    dtype = column.dtype
    if dtype == object:
        dtype = tf.string
    else:
        dtype = tf.float32

    inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

# Vi har ingen numeric inputs pr no
# numeric_inputs = {name:input for name,input in inputs.items()
#                   if input.dtype==tf.float32}
# x = tf.keras.layers.Concatenate()(list(numeric_inputs.values()))
# norm = tf.keras.layers.Normalization()
# norm.adapt(np.array(pd_df_original[numeric_inputs.keys()]))
# all_numeric_inputs = norm(x)

preprocessed_inputs = [] # [all_numeric_inputs]

for name, input in inputs.items():
  if input.dtype == tf.float32:
    continue

  lookup = tf.keras.layers.StringLookup(vocabulary=np.unique(pd_df[name]))
  one_hot = tf.keras.layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

  x = lookup(input)
  x = one_hot(x)
  preprocessed_inputs.append(x)

# preprocessed_inputs

preprocessed_inputs_cat = tf.keras.layers.Concatenate()(preprocessed_inputs)

preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

# Nå visualiserings greie
# tf.keras.utils.plot_model(model = titanic_preprocessing , rankdir="LR", dpi=72, show_shapes=True)


# Split training og testing data
# Random state 5 e dårlig, 19 og 4 e ganske bra, 2 e hinsides bra!
random_state=2

print(len(pd_df))

train_input, test_input = train_test_split(pd_df, test_size=0.2, random_state=random_state)
train_output, test_output = train_test_split(target, test_size=0.2, random_state=random_state)

train_input = {name: np.array(value) for name, value in train_input.items()}
test_input = {name: np.array(value) for name, value in test_input.items()}

# Must split my_features_dict into training and testing data
# Split on the same indexes with target, my_features_dict[0] corresponds to target[0] and so on

def model_func(preprocessing, inputs):
    body = tf.keras.Sequential([
        tf.keras.layers.Dense(16),
        tf.keras.layers.Dense(1)
    ])

    preprocessed_inputs = preprocessing(inputs)
    result = body(preprocessed_inputs)
    model = tf.keras.Model(inputs, result)

    model.compile(
        loss=tf.keras.losses.MeanAbsolutePercentageError(),
        optimizer=tf.keras.optimizers.legacy.Adam()
    )
    return model

model = model_func(preprocessing, inputs)

print(f'Training set length: {len(train_output)}, Test set length: {len(test_output)}')

156
Row: (0, Neoplasm Disease Stage American Joint Committee on Cancer Code                     Stage IS
Neoplasm American Joint Committee on Cancer Clinical Distant Metastasis M Stage          M0
American Joint Committee on Cancer Metastasis Stage Code                                 M0
American Joint Committee on Cancer Lymph Node Stage Code.1                               N0
American Joint Committee on Cancer Lymph Node Stage Code                                 T1
Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code                N0
Neoplasm Disease Stage American Joint Committee on Cancer Code.1                         IS
American Joint Committee on Cancer Publication Version Type                             7th
American Joint Committee on Cancer Tumor Stage Code                                      T1
Disease Free (Months)                                                                   4.7
Postoperative tx                                                   

TypeError: '<' not supported between instances of 'str' and 'float'

In [153]:
model.fit(x=train_input, y=train_output, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x29f2252e0>

In [151]:
model.evaluate(test_input, test_output)



97.04496765136719

In [1]:
results = []

print('Using random_state: ', random_state)

results.append((model.evaluate(train_input, train_output), model.evaluate(test_input, test_output)))
for i in range(100):
    model.fit(x=train_input, y=train_output, epochs=10, verbose=0)
    results.append((model.evaluate(train_input, train_output), model.evaluate(test_input, test_output)))

fig, ax = plt.subplots()
# ax.plot(range(100), modelResTo100)
ax.plot([r[0] for r in results], 'o')
ax.plot([r[1] for r in results], 'o')

plt.show()

NameError: name 'random_state' is not defined