# Breast Cancer Wisconsin dataset
The goal of this dataset is to predict whether a patient's tumor is benign or malignant based on its measured characteristics.

Malignant: the tumor is cancerous and dangerous, meaning it can grow and spread to other parts of the body.

Benign: the tumor is not cancerous. It grows in only one part of the body and cannot spread to or invade other parts of the body.

### 1. Importing the data and libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below; consider filtering specific categories instead.
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("../data/data.csv")

### 2. EDA

In [None]:
df

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
print(df.columns)

In [None]:
df.describe().T # display the describe with a transpose that allows you to see all the columns

In [None]:
print("Total number of null/missing values in each column of the (wisconcin cancer dataset)\n", df.isnull().sum())

In [None]:
df.diagnosis.unique()

2.1 Data Cleaning

In [None]:
def dataCleaner(data):
    """Clean the raw breast-cancer DataFrame in place.

    Drops the "Unnamed: 32" and "id" columns when they are present and
    encodes the "diagnosis" labels as integers (malignant "M" -> 1,
    benign "B" -> 0). The function is safe to call more than once on the
    same frame: columns that are already gone are skipped and an
    already-encoded "diagnosis" column is left untouched.

    Args:
        data: pandas DataFrame holding the dataset; it is modified in place.

    Returns:
        None.
    """
    # errors="ignore" keeps a re-run of the notebook cell from raising
    # KeyError once the columns have already been removed.
    data.drop(columns=["Unnamed: 32", "id"], errors="ignore", inplace=True)

    # The model cannot work with the string labels (M, B), so they are
    # replaced by numerical values that represent them.
    labels = {"M": 1, "B": 0}
    # Only map while string labels are still present; mapping the encoded
    # 0/1 values a second time would turn the whole column into NaN.
    if data["diagnosis"].isin(list(labels)).any():
        data["diagnosis"] = data["diagnosis"].map(labels)
    

In [None]:
dataCleaner(df)

In [None]:
df

In [None]:
df["diagnosis"].value_counts() # malignant:1 , benign:0

2.2 Data Visualization

In [None]:
# Bar chart of how many samples fall in each diagnosis class (malignant: 1, benign: 0)
ax= sns.countplot(x="diagnosis", data= df)
# Write the count on top of each bar of the first bar container
ax.bar_label(ax.containers[0])
plt.title("Number of diagnosis per type")
plt.show()

In [None]:
corr= df.corr()

In [None]:
# Annotated heatmap of the full correlation matrix; the colour scale is
# pinned to [-1, 1] so that colours are comparable across plots
plt.figure(figsize=(20,20))
sns.heatmap(corr, cmap="RdBu", vmin=-1, vmax=1, annot=True)
plt.title("Correlation heatmap")
plt.show()

In [None]:
# Columns for the scatter-plot matrix: the target followed by every
# "<feature>_mean" column
cols = ["diagnosis"] + [
    f"{feature}_mean"
    for feature in (
        "radius",
        "texture",
        "perimeter",
        "area",
        "smoothness",
        "compactness",
        "concavity",
        "concave points",
        "symmetry",
        "fractal_dimension",
    )
]

In [None]:
# We create a pairplot with each of the cols that has a hue in the diagnosis to see the
# correlation between each of the columns with the diagnosis
sns.pairplot(data=df[cols], hue="diagnosis")

Note that there are nearly perfect linear patterns between radius, perimeter, and area.

These patterns are a clue that there is multicollinearity between these variables (they are highly linearly related). Another set of variables that possibly shows multicollinearity is concavity, concave points, and compactness.

Multicollinearity is a statistical concept in which several independent variables in a model are correlated with each other. Two variables are considered perfectly collinear if their correlation coefficient is +/- 1.0. It is a problem because multicollinearity between independent variables results in less reliable statistical inferences.

We can solve this by removing the highly correlated predictors from the model. Alternatively, we can use Partial Least Squares Regression (PLS) or Principal Component Analysis (PCA), which are techniques that reduce the predictors to a smaller set of uncorrelated components.

In [None]:
# visualize the correlation matrix as a lower-triangle heatmap to understand this case
corr = corr.round(2) # round to 2 decimal places

# Boolean mask that hides the upper triangle (diagonal included).
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))

# define the size of the figure
plt.figure(figsize=(20,20))

# draw the heatmap; `linewidths` is the keyword documented by seaborn
sns.heatmap(corr, mask=mask, cmap="RdBu", vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink":.5}, annot=True)

plt.tight_layout()

we can verify the presence of multicollinearity between some of the variables.

for example, the radius_mean column has a correlation of 1 and 0.99 with the perimeter_mean and area_mean columns, respectively.

This is because the 3 columns contain essentially the same information, which is the physical size of the observation (the cell).

Therefore, we should keep only one of these 3 columns when we perform further analysis.

Another place where multicollinearity is apparent is between the "mean" and "worst" columns, for example, the radius_mean column has a correlation of 0.97 with the radius_worst column.

There is also multicollinearity between the compactness, concavity, and concave points attributes. Therefore, we can keep only one of those columns.

I will choose compactness

In [None]:
# 1. remove every "<feature>_worst" column
cols = [
    f"{feature}_worst"
    for feature in (
        "radius",
        "texture",
        "perimeter",
        "area",
        "smoothness",
        "compactness",
        "concavity",
        "concave points",
        "symmetry",
        "fractal_dimension",
    )
]

df = df.drop(columns=cols)

In [None]:
# 2. we drop all the columns related to the "perimeter" and "area" attributes
cols = ["perimeter_mean", "perimeter_se", "area_mean", "area_se"]

df= df.drop(cols, axis=1)

In [None]:
# 3. finally, remove every column derived from "concavity" and "concave points"
cols = [
    f"{feature}_{suffix}"
    for feature in ("concavity", "concave points")
    for suffix in ("mean", "se")
]

df = df.drop(columns=cols)

In [None]:
# We check that the changes have taken effect and we keep the important columns
df.columns

In [None]:
# we create a heatmap again, with the new correlation matrix to see if there is any high correlation left
corr = df.corr().round(2)

# Boolean mask that hides the upper triangle (diagonal included).
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(20,20))

# `linewidths` is the keyword documented by seaborn for the cell borders
sns.heatmap(corr, mask=mask, cmap="RdBu", vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink":.5}, annot=True)

plt.tight_layout()

With this we have removed the strongest sources of multicollinearity, and we can now create the machine learning model.

### 3. Building the model

In [None]:
# Separate the target column (y) from the predictor columns (X)
y = df["diagnosis"]
X = df.drop(columns=["diagnosis"])

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

In [None]:
# Report the dimensions of each part of the train/test split
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}: {split_data.shape}")

Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler() # standardizes each feature to zero mean and unit variance

In [None]:
# Learn the scaling statistics from the training set only and reuse them for
# the test set. Re-fitting the scaler on X_test would leak test-set statistics
# into the evaluation and scale the two sets inconsistently.
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

#### 3.1 Finding the best model

Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default hyper-parameters
lr = LogisticRegression()

# Train on the scaled training data, then predict the held-out test rows
model1 = lr.fit(X_train, y_train)
prediction1 = model1.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, prediction1)
cm

In [None]:
# fmt="d" prints the counts as plain integers; the default ".2g" format
# would render any count of 100 or more in scientific notation (e.g. 1e+02)
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
model1Accuracy = accuracy_score(y_test, prediction1)
print(f"The logistic regression model an accuracy of: {model1Accuracy}")

In [None]:
print("Classification report of logistic regression: \n", classification_report(y_test, prediction1))

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# A fixed random_state (the same seed as the train/test split) makes the
# fitted tree, and therefore the reported metrics, reproducible between runs
dtc = DecisionTreeClassifier(random_state=40)

In [None]:
model2 = dtc.fit(X_train, y_train)
prediction2 = model2.predict(X_test)

In [None]:
cm2 = confusion_matrix(y_test, prediction2)
cm2

In [None]:
# fmt="d" prints the counts as plain integers; the default ".2g" format
# would render any count of 100 or more in scientific notation (e.g. 1e+02)
sns.heatmap(cm2, annot=True, fmt="d")

In [None]:
model2Accuracy = accuracy_score(y_test, prediction2)
print(f"The decision tree classifier model an accuracy of: {model2Accuracy}")

In [None]:
print("Classification report of decision tree classifer : \n", classification_report(y_test, prediction2))

Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state (the same seed as the train/test split) makes the
# fitted forest, and therefore the reported metrics, reproducible between runs
rfc = RandomForestClassifier(random_state=40)

In [None]:
model3 = rfc.fit(X_train, y_train)
prediction3 = model3.predict(X_test)

In [None]:
cm3 = confusion_matrix(y_test, prediction3)
cm3

In [None]:
# fmt="d" prints the counts as plain integers; the default ".2g" format
# would render any count of 100 or more in scientific notation (e.g. 1e+02)
sns.heatmap(cm3, annot=True, fmt="d")

In [None]:
model3Accuracy = accuracy_score(y_test, prediction3)
print(f"The random forest model an accuracy of: {model3Accuracy}")

In [None]:
print("Classification report of random forest : \n", classification_report(y_test, prediction3))

In [None]:
from joblib import dump, load

In [None]:
# We save the model to use it whenever we want without having to train it more
# dump(model1, "Logistic-regression-model.joblib") # save the model and give the file a name along with the extension

In [None]:
# Load the logistic-regression model that was saved earlier. The cell that
# dumps the model is commented out, so the file may not exist on a fresh run;
# in that case persist the model trained above first instead of failing.
model_path = "Logistic-regression-model.joblib"
try:
    trainedModel = load(model_path)
except FileNotFoundError:
    dump(model1, model_path)
    trainedModel = load(model_path)

In [None]:
df.head()

In [None]:
# Let's test the model that we trained with data that we invented based on the columns.
# The model was fitted on standardized features, so the raw sample has to go
# through the same scaler before it is passed to predict(). Building the
# sample with X's column names also fails loudly if the number of values does
# not match the number of features.
sample = pd.DataFrame(
    [[22, 10, 0.1190, 0.3, 0.1855, 0.7790, 0.7723, 0.8669, 0.09776, 0.5932, 0.30015, 0.006193]],
    columns=X.columns,
)
pred = trainedModel.predict(ss.transform(sample))
pred

In [None]:
# Turn the numeric prediction (1 = malignant, anything else = benign) into a message
verdict = (
    "The patient has malignant cancer"
    if pred == 1
    else "The patient has benign cancer"
)
print(verdict)