In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
import pickle

# Train a decision-tree regressor that predicts 'Total Costs' from the other
# columns of cleaned_dataset.csv, report MSE / R2 on a held-out 20% split,
# persist the fitted encoder to encoder.pkl, and score one example patient.

# Load the dataset
df = pd.read_csv('cleaned_dataset.csv')

# Split the dataset into a training set and a testing set
X = df[['Cancer Site', 'Year', 'Sex', 'Survival Rate', 'Annual Increase', 'Initial Cost']]
y = df['Total Costs']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode the features using One-Hot Encoding (dense output).
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older releases.
# NOTE(review): every column is one-hot encoded, including 'Initial Cost',
# which looks continuous; with handle_unknown='ignore' any value not seen
# during fit is encoded as all zeros. Confirm this is intended.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_train_encoded = encoder.fit_transform(X_train)

# Save the encoder fitted on the training set
encoder_filename = 'encoder.pkl'
with open(encoder_filename, 'wb') as f:
    pickle.dump(encoder, f)

# Remove test rows whose 'Cancer Site' was never seen during training.
# encoder.categories_[0] holds the categories of the first fitted column,
# i.e. 'Cancer Site'. The same mask is applied to y_test so features and
# targets stay aligned; filtering only X_test would make the metric calls
# below fail with mismatched lengths as soon as one row is dropped.
known_site_mask = X_test['Cancer Site'].isin(encoder.categories_[0])
X_test = X_test[known_site_mask]
y_test = y_test[known_site_mask]

# Use the fitted encoder to encode the (filtered) test set
X_test_encoded = encoder.transform(X_test)

# Fit a Decision Tree Regression model on the training set
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train_encoded, y_train)

# Evaluate the performance of the model on the testing set
y_pred = model.predict(X_test_encoded)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse:.14f}, R2: {r2:.14f}")

# Use the trained model to make predictions for a new patient
new_patient = {'Cancer Site': 0, 'Year': 2022, 'Sex': 0, 'Survival Rate': 3, 'Annual Increase': 5, 'Initial Cost': 3.82945393236263}
# Pin the column order to the one the encoder was fitted on, so the result
# does not depend on the key order of the dict above.
new_patient_encoded = encoder.transform(pd.DataFrame([new_patient], columns=X.columns))
new_patient_cost = model.predict(new_patient_encoded)[0]
print(f"Total cost of cancer treatment for the new patient: {new_patient_cost:.14f}")

MSE: 0.00265739503739, R2: 0.99679599133630
Total cost of cancer treatment for the new patient: 4.41161986792209
