In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [2]:
# Reads csv and remove some data
df = pd.read_csv("diabetes.csv")

# I am trying to optimize
# BloodPressure of 0 is insane an means death - so values with 0 are equal to Null
mean_bp = df[df["BloodPressure"] != 0]["BloodPressure"].mean(
    skipna=True,
)
mean_bp = round(mean_bp)

df["BloodPressure"] = df["BloodPressure"].replace(0, mean_bp)

# Same as for BloodPressure, 0 Insulin is nonsense. Not sure if mean is the right way to go
# mean feels counter intuitive but i have no better idea at the moment
mean_in = df[df["Insulin"] != 0]["Insulin"].mean(skipna=True)
mean_in = round(mean_in)

df["Insulin"] = df["Insulin"].replace(0, mean_in)

df = df.drop(columns=["SkinThickness"])
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,156,33.6,0.627,50,1
1,1,85,66,156,26.6,0.351,31,0
2,8,183,64,156,23.3,0.672,32,1
3,1,89,66,94,28.1,0.167,21,0
4,0,137,40,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,10,101,76,180,32.9,0.171,63,0
764,2,122,70,156,36.8,0.340,27,0
765,5,121,72,112,26.2,0.245,30,0
766,1,126,60,156,30.1,0.349,47,1


In [3]:
# Split data in X and Y to separate into Features and Outcome. Than split each again in train and test.
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializes the RandomForest (I have no idea what parameters work best or what arguments to change/tune,
# there are a billion of them and i have no idea what to do, but i guess that's the exercise.)
# Estimators and random_state is suggested by gpt
clf = RandomForestClassifier(n_estimators=200, random_state=42)

# Fits  evaluating the Model
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [4]:
rfc = RandomForestClassifier(random_state=37)
param_grid = {
    "n_estimators": [100, 250, 500],
    # "bootstrap": [True, False], # False
    # "oob_score": [], # Only without cross validation
    "max_depth": [9, 10, 11],  # Around 10
    "min_samples_split": [2, 3, 5, 6, 7, 8, 10],
    "min_samples_leaf": [1, 2, 3],  # Around 1
    "max_features": ["log2", "sqrt"],  # Both viable
    "max_leaf_nodes": [40, 50, 60],  # Around 50
}

rfr_grid = GridSearchCV(rfc, param_grid, cv=3, n_jobs=-1)
rfr_grid.fit(X_train, y_train)

In [None]:
print(rfr_grid.best_score_)
results = pd.DataFrame(rfr_grid.cv_results_)
print(results.info())
print(results.describe())
results = results.sort_values(by=["mean_test_score"], ascending=False)
results.head()

In [None]:
# NOTE some experimenting for fun


df = pd.read_csv("diabetes.csv")

# I want to see if i can predict the missing values in BP and IN
# I think this could result in better data and ultimately in a better model to predict diabetes
# I start with BP

df = df.drop(columns=["Insulin"])
df = df.drop(columns=["SkinThickness"])

data = df[df['BloodPressure'] != 0]
ToPredict = df[df['BloodPressure'] == 0]



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


X = data.drop(columns=['BloodPressure'])
y = data['BloodPressure']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializes the RandomForest (I have no idea what parameters work best or what arguments to change/tune,
# there are a billion of them and i have no idea what to do, but i guess that's the exercise.)
# Estimators and random_state is suggested by gpt
clf = RandomForestRegressor(n_estimators=3000, random_state=42)

# Fits  evaluating the Model
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)


def lenient_accuracy(y_true, y_pred):
    """Calculates the percentage of predictions within 5 points of the true value."""
    return np.mean(np.abs(y_true - y_pred) <= 5)


# Calculate and print lenient accuracy
lenient_score = lenient_accuracy(y_test, y_pred)
print(f"Lenient Accuracy: {lenient_score:.2f}")

In [5]:
# Calculate the absolute differences between true and predicted values
errors = np.abs(y_test - y_pred)

# Get the errors for predictions OUTSIDE the lenient range
large_errors = errors[errors > 5]

# 1. Calculate the mean blood pressure
mean_bp = np.mean(y_train)  # Calculate mean from training data

# 2. Predict using the mean for all test instances
mean_predictions = np.full_like(y_test, mean_bp)

# 3. Calculate errors using the mean prediction
mean_errors = np.abs(y_test - mean_predictions)

# 4. Get errors outside the lenient range for the mean predictions
mean_large_errors = mean_errors[mean_errors > 5]

# 5. Analyze and compare
print("Random Forest - Number of predictions outside lenient range:", len(large_errors))
print("Mean Prediction - Number of predictions outside lenient range:", len(mean_large_errors))

print("Random Forest - Average error for those predictions:", np.mean(large_errors))
print("Mean Prediction - Average error for those predictions:", np.mean(mean_large_errors))

# 6. (Optional) Visual comparison
plt.hist(large_errors, bins=20, alpha=0.5, label="Random Forest Errors")
plt.hist(mean_large_errors, bins=20, alpha=0.5, label="Mean Prediction Errors")
plt.xlabel("Error")
plt.ylabel("Frequency")
plt.title("Comparison of Errors Outside Lenient Range")
plt.legend()
plt.show()

In [25]:
import time
import cudf
from cuml.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from cuml.preprocessing import StandardScaler
import pandas as pd

data = pd.read_csv("diabetes.csv")
data = cudf.from_pandas(data)
X, y = data.drop(columns="Outcome"), data.Outcome
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

start_time = time.time()

param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
random_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5) 
random_search.fit(X_train.to_pandas(), y_train.to_pandas())  # Convert to pandas for GridSearchCV

end_time = time.time()

print("Best parameters:", random_search.best_params_)

# Convert back to cuDF for prediction
accuracy = random_search.score(X_test.to_pandas(), y_test.to_pandas())  
print(f"Accuracy: {accuracy:.2f}")

print(f"Time taken: {end_time - start_time:.2f} seconds")

Best parameters: {'C': 100, 'gamma': 0.001}
Accuracy: 0.77
Time taken: 9.41 seconds


In [20]:
import time
import cudf
from cuml.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from cuml.preprocessing import StandardScaler
import pandas as pd

data = pd.read_csv("diabetes.csv")
data = cudf.from_pandas(data)
X, y = data.drop(columns="Outcome"), data.Outcome
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

start_time = time.time()

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2']}
random_search = GridSearchCV(LinearSVC(), param_grid, cv=5) 
random_search.fit(X_train.to_cupy().get(), y_train.to_cupy().get()) # Convert to numpy for GridSearchCV

end_time = time.time()

print("Best parameters:", random_search.best_params_)

# Convert back to cuDF for prediction
accuracy = random_search.score(X_test.to_pandas(), y_test.to_pandas())  
print(f"Accuracy: {accuracy:.2f}")

print(f"Time taken: {end_time - start_time:.2f} seconds")

Best parameters: {'C': 0.1, 'penalty': 'l2'}
Accuracy: 0.75
Time taken: 1.77 seconds


In [19]:
import time
import cudf
from scipy.stats import uniform
from cuml.svm import SVC, LinearSVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV
from cuml.preprocessing import StandardScaler
import pandas as pd

data = pd.read_csv("diabetes.csv")
data = cudf.from_pandas(data)
X, y = data.drop(columns="Outcome"), data.Outcome
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

start_time = time.time()

param_dist = {'C': uniform(0.1, 1000)}
random_search = RandomizedSearchCV(LinearSVC(), param_dist, n_iter=350, cv=5) 
random_search.fit(X_train.to_cupy().get(), y_train.to_cupy().get()) # Convert to numpy for GridSearchCV

end_time = time.time()

print("Best parameters:", random_search.best_params_)

# Convert back to cuDF for prediction
accuracy = random_search.score(X_test.to_pandas(), y_test.to_pandas())  
print(f"Accuracy: {accuracy:.2f}")

print(f"Time taken: {end_time - start_time:.2f} seconds")

Best parameters: {'C': np.float64(822.009328781485)}
Accuracy: 0.74
Time taken: 17.45 seconds
