In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Load the diabetes dataset and repair zero readings that cannot be real measurements.
df = pd.read_csv("diabetes.csv")


def rounded_nonzero_mean(values):
    """Return the mean of the non-zero entries of ``values``, rounded with ``round()``.

    Zeros are treated as missing readings and left out of the average;
    NaN entries are skipped by ``Series.mean``.

    Args:
        values: pandas Series of numeric readings.

    Returns:
        The rounded mean of all entries that are not equal to 0.
    """
    return round(values[values != 0].mean())


# A BloodPressure of 0 is not a plausible reading for a living patient, so zeros are
# treated as missing values and replaced by the rounded mean of the non-zero readings.
mean_bp = rounded_nonzero_mean(df["BloodPressure"])
df["BloodPressure"] = df["BloodPressure"].replace(0, mean_bp)

# Same reasoning for Insulin: a value of 0 is nonsense.
# TODO: the mean feels counter-intuitive as a fill value here; revisit once there is a better idea.
mean_in = rounded_nonzero_mean(df["Insulin"])
df["Insulin"] = df["Insulin"].replace(0, mean_in)

# NOTE(review): both means are computed over the full frame. If the data is split into
# train/test afterwards, the test rows have influenced the imputed values (leakage).

# SkinThickness is dropped entirely rather than imputed.
df = df.drop(columns=["SkinThickness"])

# Show a preview instead of dumping the whole frame.
df.head()

In [None]:
# Separate the target column (y) from the features (X), then hold out 20% of the
# rows as a test set.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Baseline random forest with untuned hyperparameters (there are a lot of them and
# finding good values is the point of the exercise). n_estimators and random_state
# were suggested by gpt. fit() returns the fitted estimator itself.
clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Evaluate the fitted model on the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Hyperparameter grid for the random forest classifier. The trailing notes record
# which values looked best in earlier runs.
# Size: 3 * 3 * 7 * 3 * 2 * 3 = 1134 combinations, each fitted on 3 folds.
param_grid = dict(
    n_estimators=[100, 250, 500],
    # bootstrap=[True, False],  # False
    # oob_score=[],  # Only without cross validation
    max_depth=[9, 10, 11],  # Around 10
    min_samples_split=[2, 3, 5, 6, 7, 8, 10],
    min_samples_leaf=[1, 2, 3],  # Around 1
    max_features=["log2", "sqrt"],  # Both viable
    max_leaf_nodes=[40, 50, 60],  # Around 50
)

rfc = RandomForestClassifier(random_state=37)

# Exhaustive search with 3-fold cross-validation, using all available cores.
rfr_grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=3, n_jobs=-1)
rfr_grid.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the grid search.
print(rfr_grid.best_score_)

# One row per parameter combination that was evaluated.
results = pd.DataFrame(rfr_grid.cv_results_)

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
results.info()
print(results.describe())

# Best-scoring combinations first; show only the top rows.
results = results.sort_values(by=["mean_test_score"], ascending=False)
results.head()

In [None]:
# NOTE: side experiment, just for fun.
#
# Idea: instead of filling the missing (zero) BloodPressure and Insulin values with a
# mean, try to predict them from the other columns. That could give better data and
# ultimately a better diabetes model. BloodPressure comes first.
#
# NOTE(review): this cell rebinds df, X, y, the train/test splits and clf, which the
# classification cells above also use. Re-running those cells afterwards will pick up
# the objects defined here.

# Reload the raw file and leave out the two columns that are not used as features here.
df = pd.read_csv("diabetes.csv").drop(columns=["Insulin", "SkinThickness"])

# Rows with a real reading are the training material; rows with 0 are the ones to fill in.
data = df[df["BloodPressure"].ne(0)]
ToPredict = df[df["BloodPressure"].eq(0)]

# BloodPressure is now the regression target; every remaining column is a feature.
y = data["BloodPressure"]
X = data.drop(columns=["BloodPressure"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Untuned random forest regressor; n_estimators and random_state were suggested by gpt.
# fit() returns the fitted estimator itself.
clf = RandomForestRegressor(n_estimators=3000, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows, evaluated further below.
y_pred = clf.predict(X_test)

def lenient_accuracy(y_true, y_pred, tolerance=5):
    """Return the fraction of predictions within ``tolerance`` of the true value.

    Both inputs are converted to plain float arrays first, so values are compared
    position by position. This lets plain lists be passed in, and keeps two pandas
    Series with different indexes from being aligned on their labels.

    Args:
        y_true: array-like of true target values.
        y_pred: array-like of predicted values, in the same order as ``y_true``.
        tolerance: largest absolute error that still counts as a hit (inclusive).
            Defaults to 5.

    Returns:
        The share of entries whose absolute error is <= ``tolerance``, as a float
        between 0 and 1.
    """
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)
    within_tolerance = np.abs(true_values - predicted_values) <= tolerance
    return np.mean(within_tolerance)


# Share of held-out predictions that land within 5 points of the true value.
lenient_score = lenient_accuracy(y_true=y_test, y_pred=y_pred)
print(f"Lenient Accuracy: {lenient_score:.2f}")

In [None]:
# Absolute error of every random forest prediction on the held-out rows.
errors = np.abs(y_test - y_pred)

# Keep only the errors for predictions OUTSIDE the lenient range (more than 5 off).
large_errors = errors[errors > 5]

# 1. Baseline: the mean blood pressure, calculated from the training targets only.
mean_bp = np.mean(y_train)

# 2. Predict that mean for every test instance.
#    dtype=float is required: full_like otherwise copies the dtype of y_test, and if
#    y_test holds integers the fractional part of the mean would be silently cut off.
mean_predictions = np.full_like(y_test, mean_bp, dtype=float)

# 3. Absolute errors of the constant mean prediction.
mean_errors = np.abs(y_test - mean_predictions)

# 4. Baseline errors outside the lenient range.
mean_large_errors = mean_errors[mean_errors > 5]

# 5. Compare how often, and by how much, each approach misses the lenient range.
print("Random Forest - Number of predictions outside lenient range:", len(large_errors))
print("Mean Prediction - Number of predictions outside lenient range:", len(mean_large_errors))

print("Random Forest - Average error for those predictions:", np.mean(large_errors))
print("Mean Prediction - Average error for those predictions:", np.mean(mean_large_errors))

# 6. (Optional) Visual comparison of the two sets of large errors.
plt.hist(large_errors, bins=20, alpha=0.5, label="Random Forest Errors")
plt.hist(mean_large_errors, bins=20, alpha=0.5, label="Mean Prediction Errors")
plt.xlabel("Error")
plt.ylabel("Frequency")
plt.title("Comparison of Errors Outside Lenient Range")
plt.legend()
plt.show()