Random Forest Binary Classification - Predicting Cancer

Import necessary libraries:

Pandas - Data Manipulation & Analysis

Sklearn - Machine Learning

Seaborn - Data Visualization

Matplotlib - Data Visualization

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

Read data set into dataframe labeled "medicalData"

In [None]:
# Colab-specific step: prompt for a file upload so the CSV read in the next
# cell ('breast-cancer.csv') is available to this session.
from google.colab import files
uploaded=files.upload()

In [None]:
# Load the uploaded CSV into the working DataFrame used by every later cell.
medicalData = pd.read_csv(filepath_or_buffer='breast-cancer.csv')

How does our data look?

In [None]:
# DataFrame.info() writes its summary (columns, non-null counts, dtypes)
# directly to stdout and returns None, so wrapping it in print() would add a
# stray "None" line after the summary.
medicalData.info()

Let's drop any duplicate rows in the dataset.

In [None]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence
# (the pandas default, spelled out here). The surviving rows keep their
# original index labels.
medicalData = medicalData.drop_duplicates(keep='first')

In [None]:
# Preview the first five rows after de-duplication.
print(medicalData.head(n=5))

Drop ID for proper analysis and model building

In [None]:
# Remove the 'id' column so it is not carried into the analysis or the model
# inputs. Raises KeyError if the column is not present.
medicalData = medicalData.drop(labels='id', axis='columns')

In [None]:
# Preview the first five rows to confirm the 'id' column is gone.
print(medicalData.head(n=5))

Let's recode the diagnosis column: M (malignant) becomes 1 and B (benign) becomes 0.

In [None]:
# Recode the target as integers: 'M' -> 1, 'B' -> 0.
# Any value that is neither 'M' nor 'B' becomes NaN, so this cell should be
# run only once on freshly loaded data.
medicalData = medicalData.assign(
    diagnosis=medicalData['diagnosis'].map({'M': 1, 'B': 0})
)

In [None]:
# Preview the first five rows to confirm the diagnosis column now holds 0/1.
print(medicalData.head(n=5))

In [None]:
# Sanity-check the recoding: show the distinct labels, then the column's dtype.
print(pd.unique(medicalData['diagnosis']))
print(medicalData.dtypes['diagnosis'])

In [None]:
# Re-inspect the frame after recoding the target. DataFrame.info() prints its
# own summary and returns None, so it is called directly rather than through
# print(), which would emit an extra "None" line.
medicalData.info()

Now, let's do some exploratory data analysis (EDA) on the dataset.

Manually counting the proportion of target cases in the whole data set

In [None]:
# Flag each row as 1 when its diagnosis is 1 (malignant) and 0 otherwise.
# Iterating over the column's values avoids label-based lookups such as
# medicalData['diagnosis'][i], which raise KeyError once drop_duplicates()
# has left gaps in the index.
count = [1 if label == 1 else 0 for label in medicalData['diagnosis']]

# Share of rows flagged 1 in the whole data set.
print(sum(count)/len(count))

Plotting distributions

In [None]:
# Overview: a grid of histograms drawn by DataFrame.hist on one large figure.
medicalData.hist(bins=30, edgecolor='black', figsize=(15, 15))
plt.suptitle("Histograms")
plt.show()

# Detail: one small figure per numeric column, histogram plus KDE overlay.
num_cols = medicalData.select_dtypes(include=['number']).columns
for col in num_cols:
    plt.figure(figsize=(4, 2))
    sns.histplot(medicalData[col], bins=30, kde=True)
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.title(f'Distribution of {col}')
    plt.show()

Now let's prepare our data for training and testing.

In [None]:
# List the column labels in order, so the positional slices used to build
# X and Y in the next cell can be checked against them.
print(medicalData.columns)

Let's create the X and Y sets.


X holds the predictor columns (one row per observation).

Y holds the target column (one value per observation).

In [None]:
# Target vector: the first column of the frame, as a NumPy array.
Y = medicalData.iloc[:, 0].to_numpy()

# Predictor matrix: the columns at positions 1 through 30 (the slice end, 31,
# is exclusive), as a NumPy array.
X = medicalData.iloc[:, 1:31].to_numpy()

Use train_test_split() to hold out 20% of the data as a test set (a single stratified train/test split, not k-fold cross-validation)

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3141,
    stratify=Y,
)

Fit the model using RandomForestClassifier() and the training set

In [None]:
# Fit a random forest with default hyperparameters on the training split.
# The seed matches the one used for train_test_split so that the feature
# importances and metrics below are reproducible from run to run; without it
# every execution grows a different forest.
rf_class = RandomForestClassifier(random_state=3141)
rf_class.fit(X_train, Y_train)

Finding variable importance

In [None]:
# Predictor names as a one-column frame.
# NOTE(review): assumed to be in the same order as the columns selected for X
# (positions 1..30 of medicalData) — confirm against medicalData.columns.
# pandas is imported as `pd` in this notebook; the bare name `pandas` is never
# bound, so referring to it would raise NameError.
var_names = pd.DataFrame(['radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'])

# One importance score per predictor, taken from the fitted forest.
loss_reduction = pd.DataFrame(rf_class.feature_importances_, columns = ['loss_reduction'])

# Pair each name with its score by row position, then rank highest first.
var_importance = pd.concat([var_names, loss_reduction], axis = 1)
var_importance = var_importance.sort_values('loss_reduction', axis = 0, ascending= False)
print(var_importance)

Now let's generate some metrics to check the performance of this basic model.

First we generate accuracy

In [None]:
# Predict labels for the held-out test rows.
Y_pred = rf_class.predict(X_test)

# Wrap truth and predictions as single-column frames and line them up side by
# side. pandas is imported as `pd` in this notebook; the bare name `pandas` is
# never bound, so referring to it would raise NameError.
Y_test = pd.DataFrame(Y_test, columns = ['Cancer'])
Y_pred = pd.DataFrame(Y_pred, columns = ['Prediction'])
df = pd.concat([Y_test, Y_pred], axis = 1)

# Accuracy by hand: number of matching rows divided by the number of rows.
count = (df['Cancer'] == df['Prediction']).sum()
print(count/len(df))

Now let's produce a confusion matrix and visualize it using a heatmap.

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate actual labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(cm)

In [None]:
# Draw the confusion matrix as an annotated heatmap; fmt="d" renders each
# cell's count as an integer.
sns.heatmap(
    data=cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=['Pred 0', 'Pred 1'],
    yticklabels=['True 0', 'True 1'],
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

Lastly, let's generate the entire set of metrics that can be used to evaluate and compare different models.

In [None]:
# Library-computed scores on the test split.
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)

# Hand-computed rates from the four cells of the 2x2 confusion matrix,
# flattened row by row: true negatives, false positives, false negatives,
# true positives.
tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)

# Report: the recall line prints the hand-computed sensitivity.
print(f"Accuracy: {accuracy:.4f}")
print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1-score: {f1:.4f}")