# Nathan Bush Q 5

In [36]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

import pandas as pd
import numpy as np

# Load the scikit-learn breast-cancer dataset and report the shapes of the
# feature matrix and the label vector.
dataset = load_breast_cancer()
for array in (dataset.data, dataset.target):
    print(array.shape)
# Invert the 0/1 labels (new = 1 - old). Per the 5E write-up this makes
# malignant = 1 and benign = 0; the untouched labels stay in old_target_variable.
old_target_variable = dataset.target
dataset.target = 1 - old_target_variable


(569, 30)
(569,)


# 5a

In [37]:
# Used an LLM to help format the printed output
# Print the mean of each cross-validation metric as a markdown table
def print_scores(results):
    """Print the mean of each cross-validation metric as a markdown table.

    Parameters
    ----------
    results : mapping
        Must provide the keys "test_accuracy", "test_f1", "test_precision"
        and "test_recall", each holding the per-fold scores (anything
        accepted by ``np.mean``).

    Returns
    -------
    None
        The table is written to stdout.

    Raises
    ------
    KeyError
        If one of the four expected keys is missing from ``results``.
    """
    metrics = {
        "Accuracy": results["test_accuracy"],
        "F1 Score": results["test_f1"],
        "Precision": results["test_precision"],
        "Recall": results["test_recall"]
    }
    # Create a markdown table
    print("| Metric    | Mean       |")
    print("|-----------|------------|")
    for metric, values in metrics.items():
        mean = np.mean(values)
        # Pad the value to the header's column width and close the row with a
        # trailing pipe so every line is a well-formed markdown table row.
        print(f"| {metric:<9} | {mean:<10.4f} |")

In [38]:
# Wrap the raw feature matrix in a DataFrame keyed by feature name.
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
# 5a uses only these three columns as predictors.
df_selected = df.loc[:, ['worst concave points', 'worst radius', 'worst texture']]
# Scaler and classifier are chained so they are fitted together on each fold.
pipe = make_pipeline(StandardScaler(), LogisticRegression())
# Metrics collected on every fold; the later cells reuse this tuple.
scorers = ("f1", "precision", "recall", "accuracy")
results = cross_validate(
    estimator=pipe,
    X=df_selected,
    y=dataset.target,
    scoring=scorers,
    cv=5,
)

In [39]:
# Show the mean 5-fold scores of the three-feature logistic-regression pipeline.
print_scores(results)



| Metric    | Mean       |
|-----------|------------|
| Accuracy  | 0.9649 
| F1 Score  | 0.9522 
| Precision | 0.9621 
| Recall    | 0.9437 


# 5b

In [40]:
# 5b: same scaler + logistic-regression pipeline, now cross-validated on
# every feature column instead of the three selected ones.
pipe = make_pipeline(StandardScaler(), LogisticRegression())
results = cross_validate(
    estimator=pipe,
    X=df,
    y=dataset.target,
    scoring=scorers,
    cv=5,
)
print_scores(results)

| Metric    | Mean       |
|-----------|------------|
| Accuracy  | 0.9807 
| F1 Score  | 0.9736 
| Precision | 0.9858 
| Recall    | 0.9622 


# 5c.

In [41]:
# 5c: exhaustive grid search over decision-tree hyperparameters.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20]
}

# All metrics in `scorers` are recorded, but the winning candidate is the one
# with the best mean recall (refit='recall').
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring=scorers,
    refit='recall',
    cv=5,
    n_jobs=-1,  # use every available core
)

# The search runs on the full dataset; each candidate is scored by 5-fold CV.
grid_search.fit(df, dataset.target)

# Winning parameter combination and its mean cross-validated recall.
best_params, best_recall = grid_search.best_params_, grid_search.best_score_

print("Best Hyperparameters:")
for param, value in best_params.items():
    print(f"  {param}: {value}")
print(f"\nBest Recall Score: {best_recall:.4f}")


Best Hyperparameters:
  criterion: entropy
  max_depth: 2
  min_samples_split: 2

Best Recall Score: 0.9347


# 5d

In [42]:

# 5d: k-nearest-neighbours is distance based, so features are standardised first.
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Inside a pipeline, hyperparameters are addressed as "<step name>__<parameter>".
param_grid = {
    'kneighborsclassifier__n_neighbors': [1, 3, 5, 8, 10, 15, 20, 50, 80],
    'kneighborsclassifier__weights': ['uniform', 'distance']
}

# All metrics in `scorers` are recorded; the winner is chosen by mean recall.
grid_search = GridSearchCV(
    pipe,
    param_grid,
    scoring=scorers,
    refit='recall',
    cv=5,
    n_jobs=-1,  # use every available core
)

# The search runs on the full dataset; each candidate is scored by 5-fold CV.
grid_search.fit(df, dataset.target)

# Winning parameter combination and its mean cross-validated recall.
best_params, best_recall = grid_search.best_params_, grid_search.best_score_

print("Best Hyperparameters:")
for param, value in best_params.items():
    print(f"  {param}: {value}")
print(f"\nBest Recall Score: {best_recall:.4f}")


Best Hyperparameters:
  kneighborsclassifier__n_neighbors: 5
  kneighborsclassifier__weights: uniform

Best Recall Score: 0.9340


# 5E
Because we are dealing with cancer and the target is encoded as 'Malignant' = 1 and 'Benign' = 0, we want to make sure that we don't miss a cancerous tumor. If we miss a malignant tumor (a false negative, FN), someone may die. On the other hand, false positives (FP) are less costly, as further testing would reveal the mistake.

Recall is TP/(TP+FN), so it punishes FNs, which is what we want. It does not punish FPs, but they cost less in this case.

In other words, by using recall as our metric we are tuning our predictors to reduce the chance of someone's cancer going undiagnosed.



# 5F

In [43]:
# 5F: gradient-boosted trees tuned by grid search, again selecting on recall.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 6],
    'min_samples_split': [2, 3, 5]
}

# All metrics in `scorers` are recorded; the winner is chosen by mean recall.
grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    scoring=scorers,
    refit='recall',
    cv=5,
    n_jobs=-1,  # use every available core
)

# The search runs on the full dataset; each candidate is scored by 5-fold CV.
grid_search.fit(df, dataset.target)

# Winning parameter combination and its mean cross-validated recall.
best_params, best_recall = grid_search.best_params_, grid_search.best_score_

In [45]:
# Report the winning hyperparameters (one "name: value" line each), followed by
# the score stored in best_recall, formatted to four decimal places.
print("Best Hyperparameters:")
for param, value in best_params.items():
    print(f"  {param}: {value}")
print(f"\nBest Recall Score: {best_recall:.4f}")

Best Hyperparameters:
  learning_rate: 0.1
  max_depth: 5
  min_samples_split: 5
  n_estimators: 100

Best Recall Score: 0.9576


I can achieve significant gains compared to the simple logistic regression model in part a (recall 0.9576 vs. 0.9437), although the more complex logistic regression model in part b did a bit better still (recall 0.9622). In this case any small improvement would result in more lives saved, so even a numerically small improvement is significant. As far as ease of use and performance go, logistic regression is the best of the models we used, so it is my preferred model.