# MODEL TRAINING
In this notebook we build models that predict the **target variable** present in the data set. Several models are trained so that their results can be compared.

In [243]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [244]:
import os

# Location of the processed data CSV. The default is the original
# machine-specific path; set the PROCESSED_DATA_PATH environment variable to
# point at the file on another machine instead of editing this cell.
data = os.environ.get(
    "PROCESSED_DATA_PATH",
    "/Users/leona/Desktop/Projeto 1/mammogram-result-prediction/data/processed/processed_data.csv",
)
# Comma-separated file; the resulting frame feeds the cells below.
df = pd.read_csv(data, sep=",")

In [245]:
# Pull the label column and the predictor columns out of `df` via `.values`
# so the estimators below receive arrays rather than pandas objects.
target_transformed = df["Severity"].values
feature_names = ["Age", "Shape", "Margin", "Density"]
feature_transformed = df.loc[:, feature_names].values

In [246]:
# Standardise the feature matrix with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the full data set before any
# cross-validation split happens - confirm this is intended.
standard = StandardScaler()
standard.fit(feature_transformed)
feature_scaled = standard.transform(feature_transformed)

In [247]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2

def create_neuralNet():
    """Build and compile the small feed-forward binary classifier.

    Architecture: 4 input features -> Dense(8, relu) -> Dense(1, sigmoid).
    The model is compiled with binary cross-entropy loss, the Adam optimizer
    and accuracy as the reported metric.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    # Imported locally so this function brings its own extra dependency.
    from tensorflow.keras.layers import Input

    model = Sequential()
    # Declare the 4-feature input explicitly; passing ``input_dim`` to the
    # first Dense layer is discouraged by newer Keras releases.
    model.add(Input(shape=(4,)))
    # Single hidden layer of 8 units (more does not seem to help - in fact you
    # can go down to 4).
    model.add(Dense(8, kernel_initializer="normal", activation="relu"))
    # "Deep Learning" turns out to be unnecessary - a second 8-unit hidden
    # layer was tried and did not help, so it is left out.
    # Output layer with a binary classification (benign or malignant)
    model.add(Dense(1, kernel_initializer="normal", activation="sigmoid"))
    # Compile the model with the Adam optimizer.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [248]:
from sklearn.model_selection import cross_val_score
from scikeras.wrappers import KerasClassifier

# Wrap our Keras model in an estimator compatible with scikit-learn.
# `model=` is the current SciKeras argument name; `build_fn=` is deprecated.
estimator = KerasClassifier(model=create_neuralNet, epochs=100, verbose=0, batch_size=50)
# Now we can use scikit-learn's cross_val_score to evaluate this model identically to the others
cv_scores = cross_val_score(estimator, feature_scaled, target_transformed, cv=5, n_jobs=-1)
cv_scores.mean()

0.8038341903129563

In [249]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 100 entropy-criterion trees with depth capped at 5.
# No random_state is passed, so scores can differ between runs.
randF = RandomForestClassifier(
    criterion="entropy",
    n_estimators=100,
    max_depth=5,
    min_samples_split=5,
    n_jobs=-1,
)

# Mean score over 5 cross-validation folds.
cv = cross_val_score(randF, X=feature_scaled, y=target_transformed, cv=5, n_jobs=-1)
cv.mean()


0.8151018853551705

In [250]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 20 random forests, each fitted on a bootstrap sample
# drawn with max_samples=0.7.
bagi1 = BaggingClassifier(
    estimator=RandomForestClassifier(
        n_estimators=100,
        criterion="entropy",
        max_depth=5,
        min_samples_split=5,
        n_jobs=-1,
    ),
    n_estimators=20,
    max_samples=0.7,
    bootstrap=True,
    n_jobs=-1,
)

# Evaluate the estimator defined above. The original passed the undefined
# name `bagii`, which raises NameError on a fresh kernel.
cv_ = cross_val_score(bagi1, feature_scaled, target_transformed, cv=5, n_jobs=-1)
cv_.mean()

0.8162318288579952

In [251]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with shallow (depth-2) trees; all settings spelled out
# explicitly so the configuration is visible in one place.
boost = GradientBoostingClassifier(
    loss="log_loss",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=2,
    min_samples_split=2,
    subsample=1,
)

# Mean score over 5 cross-validation folds.
cv_1 = cross_val_score(boost, X=feature_scaled, y=target_transformed, cv=5, n_jobs=-1)
cv_1.mean()

0.8162254808607885

In [252]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 20 gradient-boosting classifiers, each fitted on a
# bootstrap sample drawn with max_samples=0.7.
bagi2 = BaggingClassifier(
    estimator=GradientBoostingClassifier(
        loss="log_loss",
        n_estimators=100,
        learning_rate=0.1,
        max_depth=2,
        min_samples_split=2,
        subsample=1,
    ),
    n_estimators=20,
    bootstrap=True,
    max_samples=0.7,
    n_jobs=-1,
)

# Mean score over 5 cross-validation folds.
cv_ = cross_val_score(bagi2, X=feature_scaled, y=target_transformed, cv=5, n_jobs=-1)
cv_.mean()

0.8072113248270171

In [253]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel.
svc = SVC(
    kernel="rbf",
    C=1,
    gamma="scale",
)

# Mean score over 5 cross-validation folds.
cv_2 = cross_val_score(svc, X=feature_scaled, y=target_transformed, cv=5, n_jobs=-1)
cv_2.mean()

0.8105948073382848

In [254]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 20 RBF-kernel SVCs, each fitted on a bootstrap sample
# drawn with max_samples=0.7.
bagi3 = BaggingClassifier(
    SVC(kernel="rbf", C=1, gamma="scale"),
    n_estimators=20,
    bootstrap=True,
    max_samples=0.7,
    n_jobs=-1,
)

# Mean score over 5 cross-validation folds.
cv_ = cross_val_score(bagi3, X=feature_scaled, y=target_transformed, cv=5, n_jobs=-1)
cv_.mean()

0.80384688630737