In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
dataset = './Prostate_Cancer.csv'
data = pd.read_csv(dataset)

print(data.shape)

data.head()
data.info()

(100, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 100 non-null    int64  
 1   diagnosis_result   100 non-null    object 
 2   radius             100 non-null    int64  
 3   texture            100 non-null    int64  
 4   perimeter          100 non-null    int64  
 5   area               100 non-null    int64  
 6   smoothness         100 non-null    float64
 7   compactness        100 non-null    float64
 8   symmetry           100 non-null    float64
 9   fractal_dimension  100 non-null    float64
dtypes: float64(4), int64(5), object(1)
memory usage: 7.9+ KB


In [3]:
# drop column 'id' from the dataset to avoid overfitting
data.drop(columns=['id'], axis=1, inplace=True)
data.head()

Unnamed: 0,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,M,23,12,151,954,0.143,0.278,0.242,0.079
1,B,9,13,133,1326,0.143,0.079,0.181,0.057
2,M,21,27,130,1203,0.125,0.16,0.207,0.06
3,M,14,16,78,386,0.07,0.284,0.26,0.097
4,M,9,19,135,1297,0.141,0.133,0.181,0.059


In [5]:
import numpy as np
import pandas as pd

# Assuming df is your DataFrame containing the features and diagnosis result


def add_noise(data, noise_level=0.01):
    noisy_data = data.copy()
    for column in noisy_data.columns:
        if column != 'diagnosis_result':
            noise = np.random.normal(
                0, noise_level, size=noisy_data[column].shape)
            noisy_data[column] += noise
    return noisy_data


noisy_data = add_noise(data)
augmented_data = pd.concat([data, noisy_data])

In [6]:
def scale_data(data, scale_factor_range=(0.9, 1.1)):
    scaled_data = data.copy()
    for column in scaled_data.columns:
        if column != 'diagnosis_result':
            scale_factor = np.random.uniform(
                scale_factor_range[0], scale_factor_range[1])
            scaled_data[column] *= scale_factor
    return scaled_data


scaled_data = scale_data(data)
augmented_data = pd.concat([augmented_data, scaled_data])

In [7]:
from imblearn.over_sampling import SMOTE

features = data.drop('diagnosis_result', axis=1)
labels = data['diagnosis_result']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(features, labels)

smote_data = pd.DataFrame(X_resampled, columns=features.columns)
smote_data['diagnosis_result'] = y_resampled
augmented_data = pd.concat([augmented_data, smote_data])

In [8]:
def permute_data(data):
    permuted_data = data.copy()
    for column in permuted_data.columns:
        if column != 'diagnosis_result':
            permuted_data[column] = np.random.permutation(
                permuted_data[column].values)
    return permuted_data


permuted_data = permute_data(data)
augmented_data = pd.concat([augmented_data, permuted_data])

In [9]:
def bootstrap_data(data, n_samples=None):
    if n_samples is None:
        n_samples = len(data)
    bootstrap_sample = data.sample(n=n_samples, replace=True)
    return bootstrap_sample


bootstrap_data = bootstrap_data(data)
augmented_data = pd.concat([augmented_data, bootstrap_data])

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the augmented data
X = augmented_data.drop('diagnosis_result', axis=1)
y = augmented_data['diagnosis_result']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier (as an example)
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.96


In [11]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Fine-tuned Decision Tree
dt_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_split=4,
    min_samples_leaf=2
)
dt_model.fit(X_train, y_train)


# Evaluate the model
y_pred = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred)
print(f'Decision Tree Accuracy: {dt_accuracy}')

Decision Tree Accuracy: 0.896


In [17]:
# Train a Logistic Regression model
# lr_model = LogisticRegression(max_iter=1000)
# lr_model.fit(X_train, y_train)

#fine tuning
from sklearn.linear_model import LogisticRegression

# Fine-tuned Logistic Regression
lr_model = LogisticRegression(
    penalty='l2',
    C=0.1,
    solver='liblinear',
    max_iter=200
)
lr_model.fit(X_train, y_train)

# Evaluate the model
y_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic Regression Accuracy: {lr_accuracy}')

Logistic Regression Accuracy: 0.76


In [18]:
# # Train a Naive Bayes model
# nb_model = GaussianNB()
# nb_model.fit(X_train, y_train)
# fine tuning
from sklearn.naive_bayes import GaussianNB

# Fine-tuned Naive Bayes
nb_model = GaussianNB(var_smoothing=1e-9)
nb_model.fit(X_train, y_train)


# Evaluate the model
y_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred)
print(f'Naive Bayes Accuracy: {nb_accuracy}')

Naive Bayes Accuracy: 0.784


In [15]:
print(f'Decision Tree Accuracy: {dt_accuracy}')
print(f'Logistic Regression Accuracy: {lr_accuracy}')
print(f'Naive Bayes Accuracy: {nb_accuracy}')

Decision Tree Accuracy: 0.888
Logistic Regression Accuracy: 0.792
Naive Bayes Accuracy: 0.784


In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Define the parameter grid for Decision Tree
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Decision Tree classifier
dt_model = DecisionTreeClassifier()

# Instantiate the grid search model
grid_search_dt = GridSearchCV(
    estimator=dt_model, param_grid=param_grid_dt, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search_dt.fit(X_train, y_train)

# Get the best parameters
best_params_dt = grid_search_dt.best_params_
print(f'Best parameters for Decision Tree: {best_params_dt}')

# Train with the best parameters
dt_model_tuned = DecisionTreeClassifier(**best_params_dt)
dt_model_tuned.fit(X_train, y_train)

# Evaluate the model
y_pred = dt_model_tuned.predict(X_test)
dt_accuracy_tuned = accuracy_score(y_test, y_pred)
print(f'Tuned Decision Tree Accuracy: {dt_accuracy_tuned}')

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2}
Tuned Decision Tree Accuracy: 0.896


In [25]:
from sklearn.linear_model import LogisticRegression

# Define the parameter grid for Logistic Regression
param_grid_lr = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

# Create a Logistic Regression classifier
lr_model = LogisticRegression()

# Instantiate the grid search model
grid_search_lr = GridSearchCV(
    estimator=lr_model, param_grid=param_grid_lr, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search_lr.fit(X_train, y_train)

# Get the best parameters
best_params_lr = grid_search_lr.best_params_
print(f'Best parameters for Logistic Regression: {best_params_lr}')

# Train with the best parameters
lr_model_tuned = LogisticRegression(**best_params_lr)
lr_model_tuned.fit(X_train, y_train)

# Evaluate the model
y_pred_lr = lr_model_tuned.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f'Logistic Regression Best Accuracy: {lr_accuracy}')

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best parameters for Logistic Regression: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Logistic Regression Best Accuracy: 0.808


In [26]:
from sklearn.naive_bayes import GaussianNB

# Define the parameter grid for Naive Bayes
param_grid_nb = {
    'var_smoothing': [1e-11, 1e-10, 1e-9, 1e-8, 1e-7]
}

# Create a Gaussian Naive Bayes classifier
nb_model = GaussianNB()

# Instantiate the grid search model
grid_search_nb = GridSearchCV(
    estimator=nb_model, param_grid=param_grid_nb, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search_nb.fit(X_train, y_train)

# Get the best parameters
best_params_nb = grid_search_nb.best_params_
print(f'Best parameters for Naive Bayes: {best_params_nb}')

# Train with the best parameters
nb_model_tuned = GaussianNB(**best_params_nb)
nb_model_tuned.fit(X_train, y_train)

# Evaluate the model
y_pred_nb = nb_model_tuned.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Naive Bayes Best Accuracy: {nb_accuracy}')

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters for Naive Bayes: {'var_smoothing': 1e-07}
Naive Bayes Best Accuracy: 0.808


In [27]:
print(f'Decision Tree Best Accuracy: {dt_accuracy}')
print(f'Logistic Regression Best Accuracy: {lr_accuracy}')
print(f'Naive Bayes Best Accuracy: {nb_accuracy}')

Decision Tree Best Accuracy: 0.896
Logistic Regression Best Accuracy: 0.808
Naive Bayes Best Accuracy: 0.808


In [28]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Example of a more advanced pipeline with feature engineering and ensemble methods

# Create a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features
    # Add polynomial features
    ('poly', PolynomialFeatures(degree=2, interaction_only=True)),
    ('pca', PCA(n_components=10)),  # Dimensionality reduction
    ('feature_selection', SelectFromModel(
        RandomForestClassifier(n_estimators=100))),  # Feature selection
    ('ensemble', StackingClassifier(  # Ensemble method
        estimators=[
            ('rf', RandomForestClassifier(n_estimators=100)),
            ('gb', GradientBoostingClassifier(n_estimators=100)),
            ('lr', LogisticRegression())
        ],
        final_estimator=LogisticRegression()
    ))
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Evaluate the pipeline
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Ensemble Pipeline Accuracy: {accuracy}')

Ensemble Pipeline Accuracy: 0.832


In [None]:
! pip install optuna

In [30]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Define the objective function for Decision Tree


def objective_dt(trial):
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)

    dt_model = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf
    )

    score = cross_val_score(dt_model, X_train, y_train,
                            cv=5, scoring='accuracy').mean()
    return score


# Create the study and optimize
study_dt = optuna.create_study(direction='maximize')
study_dt.optimize(objective_dt, n_trials=50)

# Get the best parameters and train the model
best_params_dt = study_dt.best_params
dt_model_tuned = DecisionTreeClassifier(**best_params_dt)
dt_model_tuned.fit(X_train, y_train)

# Evaluate the model
y_pred_dt = dt_model_tuned.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Best Accuracy: {dt_accuracy}')
print(f'Best parameters for Decision Tree: {best_params_dt}')

[I 2024-06-22 12:33:25,592] A new study created in memory with name: no-name-b27ee6fe-c711-4cea-a719-aff9173fd9f5
[I 2024-06-22 12:33:25,616] Trial 0 finished with value: 0.8497171717171718 and parameters: {'criterion': 'gini', 'max_depth': 24, 'min_samples_split': 14, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.8497171717171718.
[I 2024-06-22 12:33:25,642] Trial 1 finished with value: 0.8595959595959595 and parameters: {'criterion': 'gini', 'max_depth': 25, 'min_samples_split': 16, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.8595959595959595.
[I 2024-06-22 12:33:25,664] Trial 2 finished with value: 0.8495757575757577 and parameters: {'criterion': 'gini', 'max_depth': 27, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.8595959595959595.
[I 2024-06-22 12:33:25,690] Trial 3 finished with value: 0.8516767676767676 and parameters: {'criterion': 'entropy', 'max_depth': 17, 'min_samples_split': 15, 'min_samples_leaf': 1}. Best is trial 1 with

Decision Tree Best Accuracy: 0.912
Best parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 3}


In [33]:
from sklearn.naive_bayes import GaussianNB

# Define the objective function for Naive Bayes


def objective_nb(trial):
    var_smoothing = trial.suggest_loguniform('var_smoothing', 1e-11, 1e-7)

    nb_model = GaussianNB(var_smoothing=var_smoothing)

    score = cross_val_score(nb_model, X_train, y_train,
                            cv=5, scoring='accuracy').mean()
    return score


# Create the study and optimize
study_nb = optuna.create_study(direction='maximize')
study_nb.optimize(objective_nb, n_trials=50)

# Get the best parameters and train the model
best_params_nb = study_nb.best_params
nb_model_tuned = GaussianNB(**best_params_nb)
nb_model_tuned.fit(X_train, y_train)

# Evaluate the model
y_pred_nb = nb_model_tuned.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Naive Bayes Best Accuracy: {nb_accuracy}')
print(f'Best parameters for Naive Bayes: {best_params_nb}')

[I 2024-06-22 12:35:09,569] A new study created in memory with name: no-name-75524e48-db6a-41f8-bc21-f843e2d920e9
  var_smoothing = trial.suggest_loguniform('var_smoothing', 1e-11, 1e-7)
[I 2024-06-22 12:35:09,591] Trial 0 finished with value: 0.7734343434343434 and parameters: {'var_smoothing': 7.896696651106577e-09}. Best is trial 0 with value: 0.7734343434343434.
  var_smoothing = trial.suggest_loguniform('var_smoothing', 1e-11, 1e-7)
[I 2024-06-22 12:35:09,622] Trial 1 finished with value: 0.7553737373737374 and parameters: {'var_smoothing': 1.6966468982792169e-10}. Best is trial 0 with value: 0.7734343434343434.
  var_smoothing = trial.suggest_loguniform('var_smoothing', 1e-11, 1e-7)
[I 2024-06-22 12:35:09,646] Trial 2 finished with value: 0.7593939393939394 and parameters: {'var_smoothing': 5.4818045366903406e-11}. Best is trial 0 with value: 0.7734343434343434.
  var_smoothing = trial.suggest_loguniform('var_smoothing', 1e-11, 1e-7)
[I 2024-06-22 12:35:09,661] Trial 3 finished w

Naive Bayes Best Accuracy: 0.824
Best parameters for Naive Bayes: {'var_smoothing': 3.059660294620417e-08}


In [35]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Define the objective function for Decision Tree


def objective_dt(trial):
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)

    dt_model = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf
    )

    score = cross_val_score(dt_model, X_train, y_train,
                            cv=5, scoring='accuracy').mean()
    return score


# Create the study and optimize
study_dt = optuna.create_study(direction='maximize')
study_dt.optimize(objective_dt, n_trials=50)

# Get the best parameters and train the model
best_params_dt = study_dt.best_params
print(f'Best parameters for Decision Tree: {best_params_dt}')

dt_model_tuned = DecisionTreeClassifier(**best_params_dt)
dt_model_tuned.fit(X_train, y_train)

# Evaluate the model
y_pred_dt = dt_model_tuned.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Best Accuracy: {dt_accuracy}')

[I 2024-06-22 12:37:10,635] A new study created in memory with name: no-name-10fd6368-fcac-472f-a703-4c295dc2868c
[I 2024-06-22 12:37:10,660] Trial 0 finished with value: 0.8275555555555556 and parameters: {'criterion': 'entropy', 'max_depth': 31, 'min_samples_split': 12, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8275555555555556.
[I 2024-06-22 12:37:10,683] Trial 1 finished with value: 0.8355353535353535 and parameters: {'criterion': 'entropy', 'max_depth': 11, 'min_samples_split': 10, 'min_samples_leaf': 13}. Best is trial 1 with value: 0.8355353535353535.
[I 2024-06-22 12:37:10,718] Trial 2 finished with value: 0.8355555555555556 and parameters: {'criterion': 'gini', 'max_depth': 13, 'min_samples_split': 14, 'min_samples_leaf': 8}. Best is trial 2 with value: 0.8355555555555556.
[I 2024-06-22 12:37:10,741] Trial 3 finished with value: 0.8175151515151515 and parameters: {'criterion': 'entropy', 'max_depth': 28, 'min_samples_split': 6, 'min_samples_leaf': 15}. Best is tri

Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1}
Decision Tree Best Accuracy: 0.904


In [37]:
from sklearn.linear_model import LogisticRegression

# Define the objective function for Logistic Regression


def objective_lr(trial):
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    C = trial.suggest_loguniform('C', 1e-4, 1e2)
    solver = trial.suggest_categorical(
        'solver', ['liblinear', 'lbfgs', 'saga'])
    max_iter = trial.suggest_int('max_iter', 100, 300)

    # Ensure solver compatibility with penalty
    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
        raise optuna.exceptions.TrialPruned()
    if penalty == 'l2' and solver == 'liblinear':
        raise optuna.exceptions.TrialPruned()

    lr_model = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        max_iter=max_iter
    )

    score = cross_val_score(lr_model, X_train, y_train,
                            cv=5, scoring='accuracy').mean()
    return score


# Create the study and optimize
study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(objective_lr, n_trials=50)

# Get the best parameters and train the model
best_params_lr = study_lr.best_params
print(f'Best parameters for Logistic Regression: {best_params_lr}')

lr_model_tuned = LogisticRegression(**best_params_lr)
lr_model_tuned.fit(X_train, y_train)

# Evaluate the model
y_pred_lr = lr_model_tuned.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f'Logistic Regression Best Accuracy: {lr_accuracy}')

[I 2024-06-22 12:37:38,306] A new study created in memory with name: no-name-c1cacf83-cdc7-4ffd-bf80-e91779a8393a
  C = trial.suggest_loguniform('C', 1e-4, 1e2)
[I 2024-06-22 12:37:38,339] Trial 0 finished with value: 0.6012121212121212 and parameters: {'penalty': 'l1', 'C': 0.001267317053641059, 'solver': 'liblinear', 'max_iter': 164}. Best is trial 0 with value: 0.6012121212121212.
  C = trial.suggest_loguniform('C', 1e-4, 1e2)
[I 2024-06-22 12:37:38,341] Trial 1 pruned. 
[I 2024-06-22 12:37:38,405] Trial 2 finished with value: 0.8095151515151515 and parameters: {'penalty': 'l2', 'C': 0.5687336025161197, 'solver': 'lbfgs', 'max_iter': 246}. Best is trial 2 with value: 0.8095151515151515.
  C = trial.suggest_loguniform('C', 1e-4, 1e2)
[I 2024-06-22 12:37:38,406] Trial 3 pruned. 
[I 2024-06-22 12:37:38,453] Trial 4 finished with value: 0.7955151515151515 and parameters: {'penalty': 'l2', 'C': 0.0006605068947361623, 'solver': 'lbfgs', 'max_iter': 168}. Best is trial 2 with value: 0.8095

Best parameters for Logistic Regression: {'penalty': 'l1', 'C': 0.9374500448076619, 'solver': 'liblinear', 'max_iter': 260}
Logistic Regression Best Accuracy: 0.808


In [39]:
from sklearn.naive_bayes import GaussianNB

# Define the objective function for Naive Bayes


def objective_nb(trial):
    var_smoothing = trial.suggest_loguniform('var_smoothing', 1e-11, 1e-7)

    nb_model = GaussianNB(var_smoothing=var_smoothing)

    score = cross_val_score(nb_model, X_train, y_train,
                            cv=5, scoring='accuracy').mean()
    return score


# Create the study and optimize
study_nb = optuna.create_study(direction='maximize')
study_nb.optimize(objective_nb, n_trials=50)

# Get the best parameters and train the model
best_params_nb = study_nb.best_params
print(f'Best parameters for Naive Bayes: {best_params_nb}')

nb_model_tuned = GaussianNB(**best_params_nb)
nb_model_tuned.fit(X_train, y_train)

# Evaluate the model
y_pred_nb = nb_model_tuned.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Naive Bayes Best Accuracy: {nb_accuracy}')

[I 2024-06-22 12:38:02,000] A new study created in memory with name: no-name-28329521-285d-4680-9462-2930af4dc21f
  var_smoothing = trial.suggest_loguniform('var_smoothing', 1e-11, 1e-7)
[I 2024-06-22 12:38:02,020] Trial 0 finished with value: 0.7994141414141415 and parameters: {'var_smoothing': 4.6205758423348436e-08}. Best is trial 0 with value: 0.7994141414141415.
  var_smoothing = trial.suggest_loguniform('var_smoothing', 1e-11, 1e-7)
[I 2024-06-22 12:38:02,029] Trial 1 finished with value: 0.7593939393939394 and parameters: {'var_smoothing': 7.601218288453327e-11}. Best is trial 0 with value: 0.7994141414141415.
  var_smoothing = trial.suggest_loguniform('var_smoothing', 1e-11, 1e-7)
[I 2024-06-22 12:38:02,055] Trial 2 finished with value: 0.7553737373737374 and parameters: {'var_smoothing': 1.348894712760798e-10}. Best is trial 0 with value: 0.7994141414141415.
  var_smoothing = trial.suggest_loguniform('var_smoothing', 1e-11, 1e-7)
[I 2024-06-22 12:38:02,061] Trial 3 finished wi

Best parameters for Naive Bayes: {'var_smoothing': 3.172217878276516e-08}
Naive Bayes Best Accuracy: 0.824


In [40]:
! pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.0-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 217.9 kB/s eta 0:09:34
   ---------------------------------------- 0.0/124.9 MB 217.9 kB/s eta 0:09:34
   ---------------------------------------- 0.1/124.9 MB 297.7 kB/s eta 0:07:00
   ---------------------------------------- 0.1/124.9 MB 353.1 kB/s eta 0:05:54
   ---------------------------------------- 0.1/124.9 MB 425.1 kB/s eta 0:04:54
   ---------------------------------------- 0.2/124.9 MB 517.2 kB/s eta 0:04:02
   ---------------------------------------- 0.2/124.9 MB 625.1 kB/s eta 0:03:20
 

In [41]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data
X = augmented_data.drop('diagnosis_result', axis=1)
y = augmented_data['diagnosis_result']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)

# Evaluate the model
y_pred_gb = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Accuracy: {gb_accuracy}')

Gradient Boosting Accuracy: 0.944


In [42]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost Classifier
ab_model = AdaBoostClassifier(
    n_estimators=100, learning_rate=0.1, random_state=42)
ab_model.fit(X_train, y_train)

# Evaluate the model
y_pred_ab = ab_model.predict(X_test)
ab_accuracy = accuracy_score(y_test, y_pred_ab)
print(f'AdaBoost Accuracy: {ab_accuracy}')

AdaBoost Accuracy: 0.848


In [44]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode target variable
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(
    y_test)  # Use transform for consistency

# Check the mapping of labels
print("Encoded classes:", label_encoder.classes_)
# Output: Encoded classes: ['B' 'M']

Encoded classes: ['B' 'M']


In [45]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3,random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train_encoded)

# Evaluate the model
y_pred_xgb = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test_encoded, y_pred_xgb)
print(f'XGBoost Accuracy: {xgb_accuracy}')

XGBoost Accuracy: 0.952


Parameters: { "use_label_encoder" } are not used.



In [46]:
print(f'Gradient Boosting Accuracy: {gb_accuracy}')
print(f'AdaBoost Accuracy: {ab_accuracy}')
print(f'XGBoost Accuracy: {xgb_accuracy}')

Gradient Boosting Accuracy: 0.944
AdaBoost Accuracy: 0.848
XGBoost Accuracy: 0.952


In [50]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data
X = augmented_data.drop('diagnosis_result', axis=1)
y = augmented_data['diagnosis_result']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)

# AdaBoost Classifier
ab_model = AdaBoostClassifier(
    n_estimators=100, learning_rate=0.1, random_state=42)
ab_model.fit(X_train, y_train)
y_pred_ab = ab_model.predict(X_test)
ab_accuracy = accuracy_score(y_test, y_pred_ab)

# XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3,random_state=42, use_label_encoder=False, eval_metric='mlogloss')
# # xgb_model.fit(X_train, y_train)
# y_pred_xgb = xgb_model.predict(X_test)
# xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

# Summary of Boosting Techniques Accuracies
print(f'Gradient Boosting Accuracy: {gb_accuracy}')
print(f'AdaBoost Accuracy: {ab_accuracy}')
# print(f'XGBoost Accuracy: {xgb_accuracy}')

Gradient Boosting Accuracy: 0.944
AdaBoost Accuracy: 0.848


In [51]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Assuming you have already split your data into X_train, X_test, y_train, y_test

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize MLPClassifier
mlp_model = MLPClassifier(hidden_layer_sizes=(
    100, 50), max_iter=500, random_state=42)

# Train the model
mlp_model.fit(X_train_scaled, y_train)

# Predict on test data
y_pred_mlp = mlp_model.predict(X_test_scaled)

# Evaluate the model
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
print(f'MLP Accuracy: {mlp_accuracy}')

MLP Accuracy: 0.944


In [52]:
# Assuming you have imported necessary libraries and defined your data as earlier

# Initialize and optimize other models (Decision Tree, Logistic Regression, Naive Bayes, Boosting)
# ...

# Train and evaluate MLP
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

mlp_model = MLPClassifier(hidden_layer_sizes=(
    100, 50), max_iter=500, random_state=42)
mlp_model.fit(X_train_scaled, y_train)
y_pred_mlp = mlp_model.predict(X_test_scaled)
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)

print(f'MLP Accuracy: {mlp_accuracy}')

# Print accuracies of other models
# ...

MLP Accuracy: 0.944
