In [None]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import warnings

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# Load the breast cancer dataset bundled with scikit-learn.
# `data` is a dict-like Bunch; later cells read data.data, data.target,
# data.target_names and data.feature_names from it.
data = load_breast_cancer()
# Show only the available fields: echoing the whole Bunch would dump the full
# feature matrix and the long description text into the cell output.
list(data.keys())

In [None]:
# The outcome variable must be binary.
# Pair each numeric code with its label by position: target_names[i] names class i.
# (Calling np.unique on the names would sort them alphabetically and break that pairing.)
label_names = {int(code): str(data.target_names[code]) for code in np.unique(data.target)}
f'The outcome variable is {np.unique(data.target)} meaning the tumor could be {label_names}'

In [None]:
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names,
# then preview the first three rows.
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df.iloc[:3]

In [None]:
# Candidate predictors: the first ten columns of df. They are screened for
# multicollinearity (high pairwise correlation) in the heatmap that follows.
columns = df.columns[:10].tolist()
columns

In [None]:
# Correlation heatmap of the ten candidate columns; strongly correlated pairs
# indicate multicollinearity.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df[columns].corr(), annot=True, cmap='coolwarm', linewidths=2, linecolor='white', ax=ax)
ax.set_title('Heatmap')
plt.show()
# Close the figure instead of calling plt.clf(): after show() has rendered it, clf()
# would operate on a brand-new empty figure, which inline backends display as a
# blank extra output.
plt.close(fig)

In [None]:
# Keep candidates at positions 0-1, 4 and 8 onward; the others are left out
# (presumably because of the correlations seen in the heatmap above - confirm).
features = [*columns[:2], *columns[4:5], *columns[8:]]

In [None]:
# Correlation heatmap of the selected features, used to check whether any of them
# still has to be dropped to avoid multicollinearity.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(df[features].corr(), annot=True, cmap="mako", linewidths=2, linecolor='white', ax=ax)
ax.set_title('Heatmap')
plt.show()
# Close the figure instead of calling plt.clf(): after show() has rendered it, clf()
# would operate on a brand-new empty figure, which inline backends display as a
# blank extra output.
plt.close(fig)

In [None]:
# Model inputs: y is the dataset's one-dimensional target array, X holds the
# selected predictor columns (still a DataFrame at this stage).
y = data.target
X = df.loc[:, features]

In [None]:
# Sample-size assumption: the number of predictors should be no more than the
# size of the smaller outcome class divided by 10.
min_class_size = min(np.sum(data.target == 0), np.sum(data.target == 1))
max_features = min_class_size / 10
# shape[1] counts the predictor columns and works for both a DataFrame and a 2D array.
n_predictors = X.shape[1]
# ">=" because the rule says "no more than": a predictor count equal to the limit still passes.
f'The maximum of the features is {max_features} and the number of predictor variables is {n_predictors}, so the assumption that the sample must be big enough is {max_features >= n_predictors}'

In [None]:
# Pairwise scatter plots of the selected predictors (exploratory only).
sns.pairplot(X, kind='scatter')
plt.show()
# plt.close() rather than plt.clf(): close() is a no-op when no figure is active,
# whereas clf() would create a new empty figure that inline backends render as a
# blank extra output.
plt.close()

In [None]:
# Preview the first three rows of the predictor table.
X.iloc[:3]

In [None]:
# standardize the X values by initalizing the StandardScaler then fit and transform the dataframe (X) back into a 2D array
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
X

In [None]:
type(X), type(y), X.shape, y.shape

In [None]:
# split the data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=25)

In [None]:
# Fit an unpenalized logistic regression (with intercept) on the training split, then
# convert column 1 of predict_proba (the probability of label 1 here) into hard 0/1
# labels using a custom cut-off instead of the default 0.5.
lrm = LogisticRegression(fit_intercept=True, penalty=None)
model = lrm.fit(X_train, y_train)
threshold = 0.25
y_pred = (model.predict_proba(X_test)[:, 1] > threshold).astype(int)
y_test, y_pred

In [None]:
# NOTE: disabled single-threshold report, kept for reference only; the next cell prints
# the same metrics for several thresholds.
# Evaluate the model by using from sklearn.metrics import confusion_matrix and accuracy_score
#print(f'The prediction threshold is: {threshold}')
#print(f'''The confusion matrix is below:\n\n{confusion_matrix(y_test, y_pred)}
#\nRows are the actual classes and columns are the predicted classes: true negatives, false positives in the first row, then false negatives and true positives in the second row.\n''')
#print(f'The model is {round(100*accuracy_score(y_test, y_pred))}% accurate')
#print(f'The model is {round(100*precision_score(y_test, y_pred))}% precise')
#print(f'The model recall ratio is {round(100*recall_score(y_test, y_pred))}%')

In [None]:
# Compare how the decision threshold affects accuracy, precision and recall on the test split.
thresholds = [0.25, 0.5, 0.75]
# The predicted probabilities for label 1 do not depend on the threshold, so compute them once.
proba_class_1 = model.predict_proba(X_test)[:, 1]
for threshold in thresholds:
    y_pred = np.where(proba_class_1 > threshold, 1, 0)
    # Report the confusion matrix and the three metrics as whole percentages.
    print(f'''The prediction threshold is: {threshold}
Confusion matrix:\n{confusion_matrix(y_test, y_pred)}\n
Model accuracy: {round(100*accuracy_score(y_test, y_pred))}%
Model precision: {round(100*precision_score(y_test, y_pred))}%
Model recall ratio: {round(100*recall_score(y_test, y_pred))}%\n\n''')
# scikit-learn layout: row i is the actual class i, column j is the predicted class j,
# so for labels 0/1 the matrix reads [[TN, FP], [FN, TP]].
print('\nConfusion matrix rows are the actual classes and columns are the predicted classes: '
      'true negatives, false positives in the first row, and false negatives, true positives in the second row '
      '(class 1 is the positive class).')

### Conclusion: 
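`We can see that lowering the prediction threshold makes the model label more samples as class 1: the true negative and false negative counts both decrease, while the true positive and false positive counts increase, so the recall for class 1 goes up.
Note that in scikit-learn's breast cancer dataset class 1 is benign and class 0 is malignant. To make the model more sensitive to malignant tumors (in order to save lives and catch as much malignant cancer as possible), the threshold on the class 1 probability should therefore be raised rather than lowered, or malignant should be treated as the positive class.
The precision of the model is secondary here, and it has not changed significantly across the tested thresholds.`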