In [1]:
# Installed pickle5 to be able to save and load our chosen ML model and scaler:
!pip install pickle5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Imported all the libraries we used:
import os
import warnings

import pandas as pd
import pickle5
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Ignored warning messages:
warnings.filterwarnings('ignore')

In [3]:
# Read the raw CSV (relative path, one level above the working directory) into a Pandas DataFrame:
data_df = pd.read_csv("../Resources/data.csv")

In [4]:
# Displayed the DataFrame (a bare last expression is rendered by Jupyter) to make sure it was loaded correctly:
data_df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


In [5]:
# Displayed the column names; the data cleansing step renames and drops columns by these exact labels:
data_df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

# DATA CLEANSING:


In [6]:
# Encoded the target variable as integers: 'B' -> 0 and 'M' -> 1 (1 is the 'malignant' class):
diagnosis_dict = {'B': 0, 'M': 1}
data_df['diagnosis'] = data_df['diagnosis'].replace(diagnosis_dict)

# Renamed the three 'concave points_*' columns to snake case and dropped the 'Unnamed: 32' column.
# `errors='ignore'` makes the drop a no-op when the column is already gone, so the cell can be re-run:
data_df = (
    data_df
    .rename(columns={
        'concave points_mean': 'concave_points_mean',
        'concave points_se': 'concave_points_se',
        'concave points_worst': 'concave_points_worst',
    })
    .drop(columns='Unnamed: 32', errors='ignore')
)

# Exported the cleaned data (without the index) as a new CSV file:
data_df.to_csv('../Resources/data_clean.csv', index=False)

# Displayed the updated DataFrame:
data_df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,1,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,1,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,1,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [7]:
# Counted the missing (null/NaN) entries in each column; the per-column totals are the cell's output:
print("Missing values:")
data_df.isna().sum()

Missing values:


id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave_points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave_points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave_points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [8]:
# Dropped the 'id' column, which is not used as a feature.
# `errors='ignore'` keeps the cell safe to re-run: without it a second execution raises KeyError
# because 'id' has already been removed from `data_df`:
data_df = data_df.drop(columns='id', errors='ignore')

In [9]:
# Separated the target (the 'diagnosis' column) from the features (every remaining column):
y = data_df['diagnosis']
X = data_df.drop(columns=['diagnosis'])

In [10]:
# Checked how balanced the two target classes are, first as absolute counts and then as percentages:
class_counts = y.value_counts()
print(class_counts)
print('Percentages:')
print(class_counts / y.count() * 100)

0    357
1    212
Name: diagnosis, dtype: int64
Percentages:
0    62.741652
1    37.258348
Name: diagnosis, dtype: float64


In [11]:
# Converted the target into a NumPy column vector of shape (n_samples, 1).
# It is rebuilt from `data_df` rather than from `y` itself so the cell can be re-run safely:
# reassigning `y` from `y.values` fails on a second run, because by then `y` is an ndarray
# and has no `.values` attribute.
# NOTE: scikit-learn estimators expect a 1-D target, so flatten with `.ravel()` when fitting.
y = data_df['diagnosis'].to_numpy().reshape(-1, 1)
y[:5]

array([[1],
       [1],
       [1],
       [1],
       [1]])

# FIRST MODEL TESTED: Logistic Regression

In [12]:
# Split the data into training and testing datasets:
# NOTE(review): this split uses random_state=42 while the Random Forests section re-splits with a
# different seed, so the two models are scored on different test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Created a logistic regression model (default hyper-parameters):
# NOTE(review): the features are not scaled in this cell; any convergence warning would be hidden by
# the notebook-wide `warnings.filterwarnings('ignore')` — confirm the solver converges.
model = LogisticRegression()

# Fitted the model to the training data. `y_train` is a column vector, so it is flattened with
# `ravel()` (as done for the random forest) to give the estimator the 1-D target it expects:
model.fit(X_train, y_train.ravel())

# Made predictions on the test data:
y_pred = model.predict(X_test)

# Calculated the accuracy of the model:
accuracy = accuracy_score(y_test, y_pred)

# Printed the accuracy score and the classification report:
print("Accuracy:", accuracy)
print('Classification report')
print(classification_report(y_test, y_pred))

Accuracy: 0.965034965034965
Classification report
              precision    recall  f1-score   support

           0       0.97      0.98      0.97        89
           1       0.96      0.94      0.95        54

    accuracy                           0.97       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.97      0.97      0.96       143



# SECOND MODEL TESTED: Random Forests

In [13]:
# Split the data into training and testing sets again, overwriting the logistic-regression split:
# NOTE(review): the seed here (54403) differs from the one used for logistic regression (42), so the
# two models are evaluated on different test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=54403)

In [14]:
# Created the 'StandardScaler' and fitted it on the training features only:
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Applied the fitted scaler to both the training and the testing features:
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

In [15]:
# Created the random forest classifier (500 trees, fixed seed) and fitted it in one step on the
# scaled training data; 'ravel()' flattens the column-vector target into the 1-D array 'fit' expects:
rf_model = RandomForestClassifier(n_estimators=500, random_state=54403).fit(
    X_train_scaled, y_train.ravel()
)

In [16]:
# Made predictions for the scaled testing data with the fitted random forest and displayed them:
predictions = rf_model.predict(X_test_scaled)
predictions

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1])

In [17]:
# Calculated the accuracy score of the predictions against the true test labels:
acc_score = accuracy_score(y_test, predictions)

# Calculated the confusion matrix and labelled it (rows = actual class, columns = predicted class):
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])

In [18]:
# Displayed the labelled confusion matrix (rich display), then printed the accuracy score and the
# per-class precision / recall / f1 report for the random forest predictions:
print('Confusion matrix')
display(cm_df)
print(f"Accuracy score: {acc_score}")
print('Classification report')
print(classification_report(y_test, predictions))

Confusion matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,92,2
Actual 1,1,48


Accuracy score: 0.9790209790209791
Classification report
              precision    recall  f1-score   support

           0       0.99      0.98      0.98        94
           1       0.96      0.98      0.97        49

    accuracy                           0.98       143
   macro avg       0.97      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143



In [19]:
# Got the feature importance array from the fitted random forest:
importances = rf_model.feature_importances_

# Paired each importance with its column name (same order as the columns of X), sorted the pairs
# from most to least important, and listed the top 10:
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
importances_sorted[:10]

[(0.14154185305648398, 'concave_points_worst'),
 (0.12114750823038828, 'concave_points_mean'),
 (0.10737664326249423, 'radius_worst'),
 (0.1006147865643346, 'perimeter_worst'),
 (0.08714858736005753, 'area_worst'),
 (0.0654743163758934, 'concavity_mean'),
 (0.04710286198836025, 'area_mean'),
 (0.04398252259361993, 'area_se'),
 (0.038741097804306276, 'radius_mean'),
 (0.03682499584441087, 'concavity_worst')]

## It turns out that the Random Forests model works better than Logistic Regression with our data:

In summary, our Random Forest model is more accurate overall (98% vs. 97% accuracy) and at least as precise as the Logistic Regression model, and it works better to classify both 'benign tumors' and 'malignant tumors'. The model is nearly perfect at classifying 'benign tumors' (precision 99%, recall 98%) and very accurate for 'malignant tumors'. The classifications made by the model, whether malignant (positive) or benign, are very likely to be correct because the precision scores are very high (96% and 99%, respectively) — equal to or higher than the Logistic Regression scores (96% and 97%) — and the model correctly finds nearly all of the true 'malignant tumors', as the recall score for that type of tumor was also extremely high: 98%. Note that the two models were evaluated on different random train/test splits, so the comparison is indicative rather than exact.

The *recall* score measures how many of the actual 'malignant tumors' the model captures as 'malignant tumors'. Hence, the higher the score, the lower the cost of misclassifying that type of tumor. This score is extremely important in our situation because the cost of misclassifying a 'malignant tumor' as a 'benign tumor' is extremely high: our model could misdiagnose a patient with cancer as a 'healthy patient' and recommend sending them home, which could have very serious consequences. Fortunately, the recall score for 'malignant tumors' in our model was 98%, which is extremely high. That is very desirable because it means the model correctly identified nearly all of the 'malignant tumors' in the test set. Therefore, the probability of misclassifying this type of tumor is very low.

# SAVING OUR CHOSEN MODEL AND SCALER

In [20]:
# Specified the filename and path where we wanted to save the Random Forests model:
model_filename = '../ML_model/rf_model.pkl'

# Created the target folder if it does not exist yet; otherwise open(..., 'wb') raises FileNotFoundError:
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

# Used the pickle5.dump() function to save the fitted model:
with open(model_filename, 'wb') as file:
    pickle5.dump(rf_model, file)


# Specified the filename and path where we wanted to save the scaler:
scaler_filename = '../ML_model/rf_scaler.pkl'

# Created the scaler's target folder too, so each save works on its own:
os.makedirs(os.path.dirname(scaler_filename), exist_ok=True)

# Used the pickle5.dump() function to save the fitted scaler:
with open(scaler_filename, 'wb') as file:
    pickle5.dump(X_scaler, file)

# TESTING OUR SAVED MODEL AND SCALER

In [21]:
# Tested the saved model and scaler with an observation that should be predicted as 1 or 'malignant':
input_radius_mean = 19.69
input_texture_mean = 21.25
input_perimeter_mean = 130.0
input_area_mean = 1203.0
input_smoothness_mean = 0.1096
input_compactness_mean = 0.1599
input_concavity_mean = 0.1974
input_concave_points_mean = 0.1279
input_symmetry_mean = 0.2069
input_fractal_dimension_mean = 0.05999
input_radius_se = 0.7456
input_texture_se = 0.7869
input_perimeter_se = 4.585
input_area_se = 94.03
input_smoothness_se = 0.00615
input_compactness_se = 0.04006
input_concavity_se = 0.03832
input_concave_points_se = 0.02058
input_symmetry_se = 0.0225
input_fractal_dimension_se = 0.004571
input_radius_worst = 23.57
input_texture_worst = 25.53
input_perimeter_worst = 152.5
input_area_worst = 1709.0
input_smoothness_worst = 0.1444
input_compactness_worst = 0.4245
input_concavity_worst = 0.4504
input_concave_points_worst = 0.243
input_symmetry_worst = 0.3613
input_fractal_dimension_worst = 0.08758

# Created a dictionary with all the testing observations and cast each value to float.
# The keys are the training feature names, in the same order as the columns the scaler was fitted
# on. Integer keys (0..29) made the alignment purely positional, and the resulting feature-name
# mismatch warning from scikit-learn was hidden by the notebook-wide warnings filter:
input_d = {'radius_mean': float(input_radius_mean),
           'texture_mean': float(input_texture_mean),
           'perimeter_mean': float(input_perimeter_mean),
           'area_mean': float(input_area_mean),
           'smoothness_mean': float(input_smoothness_mean),
           'compactness_mean': float(input_compactness_mean),
           'concavity_mean': float(input_concavity_mean),
           'concave_points_mean': float(input_concave_points_mean),
           'symmetry_mean': float(input_symmetry_mean),
           'fractal_dimension_mean': float(input_fractal_dimension_mean),
           'radius_se': float(input_radius_se),
           'texture_se': float(input_texture_se),
           'perimeter_se': float(input_perimeter_se),
           'area_se': float(input_area_se),
           'smoothness_se': float(input_smoothness_se),
           'compactness_se': float(input_compactness_se),
           'concavity_se': float(input_concavity_se),
           'concave_points_se': float(input_concave_points_se),
           'symmetry_se': float(input_symmetry_se),
           'fractal_dimension_se': float(input_fractal_dimension_se),
           'radius_worst': float(input_radius_worst),
           'texture_worst': float(input_texture_worst),
           'perimeter_worst': float(input_perimeter_worst),
           'area_worst': float(input_area_worst),
           'smoothness_worst': float(input_smoothness_worst),
           'compactness_worst': float(input_compactness_worst),
           'concavity_worst': float(input_concavity_worst),
           'concave_points_worst': float(input_concave_points_worst),
           'symmetry_worst': float(input_symmetry_worst),
           'fractal_dimension_worst': float(input_fractal_dimension_worst)}

# Converted the dictionary into a one-row Pandas DataFrame, so it fitted what the scaler expected:
input_df = pd.DataFrame(input_d, index=[0])

# Loaded the saved scaler.
# NOTE: unpickling can execute arbitrary code; only load pickle files produced by this notebook
# or another trusted source.
scaler_filename = '../ML_model/rf_scaler.pkl'
with open(scaler_filename, 'rb') as file1:
    loaded_scaler = pickle5.load(file1)

# Transformed the DataFrame with the scaler:
X_transformed = loaded_scaler.transform(input_df)

# Loaded the saved model (same trust caveat as for the scaler):
model_filename = '../ML_model/rf_model.pkl'
with open(model_filename, 'rb') as file2:
    loaded_model = pickle5.load(file2)

# Made the prediction and confirmed our saved model and scaler work:
prediction = loaded_model.predict(X_transformed)
prediction

array([1])