In [1]:
from sklearn.model_selection import cross_val_score
from sklearn import tree
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score  # metrics for manual model eval
import mlflow.pyfunc
import mlflow

In [10]:
tsv1 = pd.read_csv(r'C:\Users\s434037\Desktop\Bachelor\data\labels.tsv', encoding='utf-8', sep='\t') #encoding and sep to read tsv correctly
tsv2 = pd.read_csv(r'C:\Users\s434037\Desktop\Bachelor\data\prostate_stats.tsv', encoding='utf-8', sep='\t') #encoding and sep to read tsv correctly

patient_data = pd.merge(tsv1, tsv2, left_on=['pseudo_id'], right_on=['pid'], how='inner') # merge both dataframes on pseudo_id and pid
print (patient_data)

      pseudo_id  age sex  staging  px     psa  label  pseudo_patid    set  \
0         74404   75   M       re   0    0.09      2         14507  train   
1         29868   64   M  primary   0    5.14      1         66373  train   
2         97165   80   M       re   0   23.20      1         88006    val   
3         40327   71   M       re   1    1.80      0         41467  train   
4         84824   71   M       re   1    1.05      0         50475  train   
...         ...  ...  ..      ...  ..     ...    ...           ...    ...   
1199      16860   68   M       re   1    0.44      1         22584  train   
1200      17064   61   M       re   1  159.00      0         92324  train   
1201      47224   65   M       re   0    2.58      1         44278  train   
1202       8041   75   M       re   0   79.00      1         77358  train   
1203      85148   78   M       re   0   53.91      1         33053  train   

        pid  ...  vol_pix       vol_mm3      mean        sd      cx_px  \
0

In [None]:
patient_data = pd.read_csv(r'C:\Users\s434037\Desktop\Bachelor\data\labels.tsv', encoding='utf-8', sep='\t') #encoding and sep to read tsv correctly


      pseudo_id  age sex  staging  px     psa  label  pseudo_patid    set
0         74404   75   M       re   0    0.09      2         14507  train
1         29868   64   M  primary   0    5.14      1         66373  train
2         97165   80   M       re   0   23.20      1         88006    val
3         40327   71   M       re   1    1.80      0         41467  train
4         84824   71   M       re   1    1.05      0         50475  train
...         ...  ...  ..      ...  ..     ...    ...           ...    ...
1199      16860   68   M       re   1    0.44      1         22584  train
1200      17064   61   M       re   1  159.00      0         92324  train
1201      47224   65   M       re   0    2.58      1         44278  train
1202       8041   75   M       re   0   79.00      1         77358  train
1203      85148   78   M       re   0   53.91      1         33053  train

[1204 rows x 9 columns]


In [11]:

patient_data = patient_data.dropna() # Drop rows with missing values for simplicity 
patient_data = patient_data.drop(columns=['pseudo_id', 'sex', 'pseudo_patid', 'pid', 'cx_px', 'cy_px', 'cz_px', 'cx', 'cy', 'cz']) # Drop patient_id as it's not a feature for prediction
patient_data = patient_data[patient_data.label != 2] # Remove rows with label 2 as these are not relevant for binary classification
patient_data = patient_data[patient_data.psa != 'NA'] # remove rows with no psa value till i find a better solution
patient_data = patient_data[patient_data.staging != 'primary'] # remove rows with primary staging till i find a better solution

patient_data['age'] = patient_data['age'].astype(float) # convert psa to float
patient_data['px'] = patient_data['px'].astype(float) # convert psa to float

patient_data

Unnamed: 0,age,staging,px,psa,label,set,min,max,rmin,rmax,vol_pix,vol_mm3,mean,sd
2,80.0,re,0.0,23.20,1,val,0.518298,32.973873,0.842853,25.833645,351,29111.843750,7.197718,6.139901
3,71.0,re,1.0,1.80,0,train,0.908734,4.015200,1.008141,3.949964,134,11113.924805,2.292442,0.624569
4,71.0,re,1.0,1.05,0,train,0.556755,3.191937,0.630540,2.483073,192,15924.427734,1.324341,0.416383
5,67.0,re,1.0,1.47,0,val,0.754733,3.845006,0.776365,3.736846,149,12358.018555,1.786344,0.642492
6,53.0,re,1.0,1.40,0,train,0.407404,7.776496,0.525310,6.671132,254,21066.691406,1.977446,1.426520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,68.0,re,1.0,0.44,1,train,0.374197,34.756958,0.408580,31.181149,157,13021.537109,4.603152,6.414782
1200,61.0,re,1.0,159.00,0,train,0.508026,3.091441,0.508026,3.091441,72,3582.996582,1.262617,0.597831
1201,65.0,re,0.0,2.58,1,train,0.624586,73.843353,0.624586,16.000526,253,12590.250000,2.403591,5.675744
1202,75.0,re,0.0,79.00,1,train,0.655945,23.050591,1.014260,19.243502,349,28945.964844,7.526070,4.826182


In [12]:
train_mask = patient_data['set'] == 'train' 
test_mask = patient_data['set'] == 'val'


In [13]:
X = pd.get_dummies(patient_data.drop("label", axis=1)) # dummies for categorical variables since forest doesn't handle them directly
X.index = patient_data.index
y = patient_data[["label"]].astype(int) # Keeping y as DataFrame for easier handling of set indicators

In [14]:
X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]
print(f"Train shape: {X_train.columns} {y_train.shape}") #double check proper set splits


Train shape: Index(['age', 'px', 'psa', 'min', 'max', 'rmin', 'rmax', 'vol_pix', 'vol_mm3',
       'mean', 'sd', 'staging_re', 'set_train', 'set_val'],
      dtype='object') (706, 1)


In [15]:
X_train = X_train.drop(columns=['set_train', 'set_val'], errors= 'ignore') # Drop the set indicator columns
X_test = X_test.drop(columns=['set_train', 'set_val'], errors = 'ignore')

print(X_test.shape, X_train.shape)

(166, 12) (706, 12)


In [16]:
y_test = np.array(y_test).astype(int) # Convert y_test to a NumPy array of strings
y_train = np.array(y_train).astype(int) # Convert y_train to a NumPy array of strings

y_test = y_test.squeeze()
y_train = y_train.squeeze()

In [None]:
print("Unique labels in y_train:", np.unique(y_train))
print("Unique labels in y_test:", np.unique(y_test))

if X_train.empty or X_test.empty:
    raise ValueError("One of the datasets (train/test) is empty!")

Unique labels in y_train: [0 1]
Unique labels in y_test: [0 1]


In [18]:
if len(X_train) != len(y_train):
    print("Mismatch between X_train and y_train lengths!")
    raise SystemExit()
if len(X_test) != len(y_test):
    print("Mismatch between X_test and y_test lengths!")
    raise SystemExit()

In [None]:
forest_params = {
    "min_weight_fraction_leaf": 0.1,
    "n_estimators": 30,
}

In [19]:
param_grid = {
    'min_samples_split': [2, 3, 4],
    'min_weight_fraction_leaf': [0.1],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'max_leaf_nodes': [None, 10, 20],
    
}

In [20]:
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1', error_score='raise')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
model = RandomForestClassifier(**best_params)
model = model.fit(X_train, y_train)

In [21]:
model = RandomForestClassifier(**forest_params) # Initialize the random forest with specified parameters
model = model.fit(X_train, y_train)


NameError: name 'forest_params' is not defined

In [18]:
model_path = r"C:/Users/s434037/Desktop/Bachelor/projects/mlruns/480120777834226073/models/m-fa63154747d64c93ab3f4c76ace550db/artifacts"
model = mlflow.sklearn.load_model (model_path)



In [14]:
model_uri = f"models:/randomForestV1/version-1"
model = mlflow.sklearn.load_model(model_uri)

MlflowException: Invalid Model Version stage: version-1. Value must be one of None, Staging, Production, Archived.

In [22]:
eval_data= X_test.copy()
eval_data['label']= y_test #create eval data for flow evaluation

In [23]:
y_pred = model.predict(X_test) # Make predictions on the test set
metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, average="weighted"), 
        "recall": recall_score(y_test, y_pred, average="weighted"),
        "f1_score": f1_score(y_test, y_pred, average='weighted'),
    }

print("Classification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.89      0.81        98
           1       0.78      0.57      0.66        68

    accuracy                           0.76       166
   macro avg       0.77      0.73      0.74       166
weighted avg       0.76      0.76      0.75       166



In [24]:
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")
print(f"Test score: {best_score:.3f}")

Best parameters: {'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.1}
Best cross-validation score: 0.718
Test score: 0.718


In [25]:
result = mlflow.models.evaluate(
    model_path,
    eval_data,
    targets="label",
    model_type="classifier",
)

# Access metrics
print(f"Precision: {result.metrics['precision_score']:.3f}")
print(f"Recall: {result.metrics['recall_score']:.3f}")
print(f"F1 Score: {result.metrics['f1_score']:.3f}")
print(f"ROC AUC: {result.metrics['roc_auc']:.3f}")

NameError: name 'model_path' is not defined