In [None]:
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn import tree
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [None]:
# Start by loading the breast cancer dataset (sklearn's "load_breast_cancer") through the local finder module
import finder

In [None]:
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides any library warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Dataset object exposed by the local finder module; later cells read its
# .data, .target, .target_names and .feature_names attributes.
data = finder.data
# Bare expression so the notebook displays the object.
data

In [None]:
# Summarise the outcome variable: how many distinct values it takes, which ones, and their labels.
# The distinct values are computed once and reused in the message.
outcome_values = np.unique(data.target)
print(f'Number of the outcome variable: {len(outcome_values)}. Possible values: {outcome_values} Labels: {(data.target_names)}')

In [None]:
# Report the dataset dimensions: rows of the feature matrix and number of feature names.
# `.shape` is a tuple, so it is indexed to print the feature count itself rather than a tuple.
observation_count = data.data.shape[0]
feature_count = data.feature_names.shape[0]
print(f'Observations: {observation_count}, features: {feature_count}')

In [None]:
# DataFrame prepared by the local finder module (its construction is not shown in this notebook).
df = finder.dataframe
# Preview the first three rows.
df.head(3)

In [None]:
# Candidate predictors: the first ten column labels of df, kept as a plain list.
# These are the columns to inspect for multicollinearity (high pairwise correlation).
columns = df.columns[:10].to_list()
columns

In [None]:
# Correlation between two example features, read by label from their 2x2 correlation matrix.
pair_corr = df[['mean area', 'mean symmetry']].corr()
pair_corr.loc['mean area', 'mean symmetry']

In [None]:
# plt.figure(figsize=[16, 16])
# sns.heatmap(df[columns].corr(), annot=True, cmap = 'coolwarm', linewidths=2, linecolor='white')
# plt.title('Heatmap')
# plt.savefig('big_heatmap.png')
# plt.show()
# plt.clf()
# features = columns[:2]+columns[4:5]+columns[8:]

In [None]:
# importing the helper file
import helper

In [None]:
# Feature selection is delegated to the local helper module; the result is later used
# to select columns of df and as feature names for the decision-tree plot.
# NOTE(review): presumably corr_ drops highly correlated columns and returns the kept labels — confirm in helper.
features = helper.corr_(df)

In [None]:
# Heatmap for the selected features, drawn by the helper
# (presumably their correlation matrix — confirm in helper).
helper.custom_heat_(df, features)

In [None]:
# Build the model inputs: X holds the selected predictor columns of df (it is replaced by a
# scaled array further down), y is the outcome array taken from the dataset object.
X = df[features]
y = data.target
# Display both types as a quick check.
type(X), type(y)

In [None]:
# Class counts: label 0 is reported as malignant and label 1 as benign.
# The names a and b are read again by the following cells.
a, b = np.sum(y == 0), np.sum(y == 1)
print(f'Malignant tumors number: {a}, benign tumors number: {b}')

In [None]:
# Share of benign cases (label 1) among all observations, printed with two significant digits.
print(f'The sample is imbalanced, the benign tumors (true positivity) rate is {b/(b+a):.2}')

In [None]:
# Rule of thumb: the number of predictors should not exceed the smallest class size divided by 10.
min_class_size = min(a, b)
max_features = min_class_size / 10
# Count the predictors once so the printed number and the comparison cannot disagree.
n_predictors = len(X.columns)
# The two f-strings are concatenated, so the first one must end with a space.
print(f'Maximum features could be {round(max_features)}, the current number of predictor variables is {n_predictors}. '
      f'The statement that the sample is big enough is {max_features > n_predictors}')

In [None]:
# Exploratory only: pair plot of the selected predictors, drawn by the helper
# (the plotting details live in the helper module).
helper.pair_(X)

In [None]:
# Peek at the first three rows of the predictors before they are standardised.
X.head(3)

In [None]:
#np.mean(X['mean radius'])

In [None]:
# Standardise the predictors: create a StandardScaler and fit it on X.
# The transform itself happens in the next cell and turns X into a 2D array.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split, so test rows
# contribute to the scaling statistics — consider fitting on the training part only.
scaler = StandardScaler()
scaler.fit(X)

In [None]:
# Replace X with its standardised values.
# The raw predictor columns are transformed (not X itself), so re-running this cell
# gives the same result instead of scaling already-scaled data.
X = scaler.transform(df[features])

In [None]:
# Display the scaled predictor array.
X

In [None]:
# Sanity check: mean of the first scaled column.
X[:, 0].mean()

In [None]:
# Confirm the container types and shapes of X and y before splitting.
type(X), type(y), X.shape, y.shape

In [None]:
# Split into train and test parts: 30% of the rows go to the test set, with a fixed seed
# so the split is the same on every run.
# NOTE(review): no stratify argument is given although the classes are imbalanced — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=50)

In [None]:
# Compare the share of label-1 cases in the train and test parts of the split.
c = (y_train == 1).sum()
d = (y_train == 0).sum()
e = (y_test == 1).sum()
f = (y_test == 0).sum()
train_rate = c / (c + d)
test_rate = e / (e + f)
f'The train true positivity rate is {train_rate:.2}, the test true positivity rate is {test_rate:.2}'

In [None]:
# Logistic regression without a penalty term (penalty=None), with an intercept and
# class_weight='balanced' because the two classes differ in size.
lrm = LogisticRegression(penalty=None, fit_intercept=True, class_weight='balanced')
# Fit on the training split; the value returned by fit() is kept as `model` for the following cells.
model = lrm.fit(X_train, y_train)

In [None]:
# Turn predicted probabilities into labels with a custom cut-off: predict 1 only when
# the second probability column exceeds the threshold, otherwise 0.
threshold = 0.9
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba > threshold).astype(int)
# Show the true and predicted labels together.
y_test, y_pred

In [None]:
# Tally the four confusion-matrix outcomes by hand, then print scikit-learn's matrix for comparison.
actual_pos, actual_neg = (y_test == 1), (y_test == 0)
predicted_pos, predicted_neg = (y_pred == 1), (y_pred == 0)

true_positives = int(np.sum(actual_pos & predicted_pos))
true_negatives = int(np.sum(actual_neg & predicted_neg))
false_positives = int(np.sum(actual_neg & predicted_pos))
# Every observation not counted above is treated as a false negative.
false_negatives = len(y_test) - true_positives - true_negatives - false_positives

print(true_positives, true_negatives, false_positives, false_negatives)
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
# Confusion-matrix report produced by the helper for the fitted model.
# NOTE(review): this passes the full X and y, which include the rows the model was trained on —
# confirm whether the helper should receive X_test / y_test instead.
helper.confusion_(model, X, y)

### Conclusion: 
``We can see how using a lower prediction threshold changes the results. If our goal is to minimize the false negatives and the false positives at the same time, so that their ratio is close to 1, lowering the threshold is a good idea. With this solution we can avoid treating people for a malignant tumor when they actually have a benign one (false negatives, bottom-left corner), or at least decrease the occurrences of that, while controlling the false positives (minimizing the number of patients who have a malignant tumor but were predicted benign) can increase the model's precision, which is important in this case. With this in mind, the chosen threshold should be somewhere between 0.2 and 0.3, where the precision and the recall are closest to each other.``

``This is a malignant-vs-benign breast cancer prediction model built on the selected predictor features with the LogisticRegression algorithm; it shows that the prediction threshold has to be chosen according to the goal we want to achieve. If we want to save as many lives as possible, we should maximize the precision score (so as to minimize the false positives) rather than minimize the number of false negatives. In this case, we should choose 0.6 as the threshold. If we want to maximize both the recall and the precision scores at the same time, we should choose 0.3 as the prediction threshold.``

In [None]:
# ROC output produced by the helper for the fitted model.
# NOTE(review): this passes the full X and y, which include the rows the model was trained on —
# confirm whether the helper should receive X_test / y_test instead.
helper.roc_(model, X, y)

In [None]:
# Score a k-nearest-neighbours classifier on the test split for every k from 1 to 100.
# scores_and_k keeps [accuracy, k] pairs, so max() returns the best accuracy (ties go to the larger k);
# accuracies keeps the same scores in k order for plotting.
scores_and_k = []
accuracies = []
for k in range(1, 101):
  classifier = KNeighborsClassifier(n_neighbors = k)
  classifier.fit(X_train, y_train)
  # Deliberately not named `a`: that name holds the malignant class count computed earlier,
  # and overwriting it would corrupt the class-balance cells if they are re-run.
  accuracy = classifier.score(X_test, y_test)
  scores_and_k.append([accuracy, k])
  accuracies.append(accuracy)
print(max(scores_and_k))

In [None]:
# Plot accuracy against k and mark the best [accuracy, k] pair with a star.
best_accuracy, best_k = max(scores_and_k)
k_list = list(range(1, 101))

plt.plot(k_list, accuracies)
plt.xlabel('Number of "k" nearest neighbors')
plt.ylabel('Validation Accuracy')
plt.title('Breast Cancer Classifier Accuracy')
plt.plot(best_k, best_accuracy, '-*')
plt.show()
plt.clf()

In [None]:
# Fit a shallow, cost-complexity-pruned decision tree on the training split and draw it.
# random_state is fixed (same seed as the train/test split) so re-running the cell yields the
# same tree: scikit-learn permutes the candidate features at each split, which can change the
# choice between equally good splits from run to run.
dt = DecisionTreeClassifier(max_depth=3, ccp_alpha=0.01, criterion='gini', random_state=50)
dt.fit(X_train, y_train)

plt.figure(figsize=(20,12))
# list(...) hands plot_tree a plain list of names whatever sequence type `features` is.
# Class names follow label order: 0 is malignant, 1 is benign.
tree.plot_tree(dt, feature_names = list(features), max_depth=5, class_names = ['malignant', 'benign'], label='all', filled=True, rounded=True)
plt.tight_layout()
plt.show()