In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
data = pd.read_csv('./Prostate_Cancer.csv')

print(data.shape)

data.head()

(100, 10)


Unnamed: 0,id,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,1,M,23,12,151,954,0.143,0.278,0.242,0.079
1,2,B,9,13,133,1326,0.143,0.079,0.181,0.057
2,3,M,21,27,130,1203,0.125,0.16,0.207,0.06
3,4,M,14,16,78,386,0.07,0.284,0.26,0.097
4,5,M,9,19,135,1297,0.141,0.133,0.181,0.059


In [2]:
# Importing the train_test_split function from sklearn
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=122)
print('Training data: ', train.shape)
print('Test data: ', test.shape)
# Splitting the data into train and test
Xtrain = train.drop(columns=['diagnosis_result'], axis=1)
ytrain = train['diagnosis_result']

Xtest = test.drop(columns=['diagnosis_result'], axis=1)
ytest = test['diagnosis_result']

Training data:  (80, 10)
Test data:  (20, 10)


In [3]:
# 2nd: Random Forest Classification
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

forest = RandomForestClassifier(random_state=122)

param_grid = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search_forest = GridSearchCV(
    forest, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search_forest.fit(Xtrain, ytrain)

print('Best Parameters: ', grid_search_forest.best_params_)
print('Best Accuracy: ', grid_search_forest.best_score_)

# here best_forest is the best random forest model
best_forest = grid_search_forest.best_estimator_
print(best_forest)
# test accuracy
# score method calls the predict method and then compares the predicted values with the actual values
test_accuracy_forest = best_forest.score(Xtest, ytest)
print('Test Accuracy: ', test_accuracy_forest)

Best Parameters:  {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy:  0.8375
RandomForestClassifier(max_depth=10, min_samples_leaf=2, random_state=122)
Test Accuracy:  0.9


In [4]:
# 4th Models - Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# create the MinMaxScaler object
scaler = MinMaxScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)


# create a logistic regression Classifier
logreg = LogisticRegression(random_state=122, max_iter=5000)

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

grid_search_logreg = GridSearchCV(
    logreg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search_logreg.fit(Xtrain_scaled, ytrain)

print('Best Parameters: ', grid_search_logreg.best_params_)
print('Best Accuracy: ', grid_search_logreg.best_score_)

# get the best model
best_logreg = grid_search_logreg.best_estimator_

# evaluate the best model on the test set
test_accuracy = best_logreg.score(Xtest_scaled, ytest)
print('Test Accuracy: ', test_accuracy)

Best Parameters:  {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Best Accuracy:  0.8625
Test Accuracy:  0.95


In [13]:
# Combine predictions using a hybrid approach
# import accuracy_score from sklearn
from sklearn.metrics import accuracy_score
hybrid_predictions = []
for i in range(len(Xtest)):
    best_forest_pred = best_forest.predict(Xtest)[i]
    best_logreg_pred = best_logreg.predict(Xtest_scaled)[i]
    if best_forest_pred == best_logreg_pred:
        hybrid_predictions.append(best_forest_pred)
    else:
        hybrid_predictions.append(best_forest_pred)
        
hybrid_accuracy = accuracy_score(ytest, hybrid_predictions)
# the accuracy should not be rounded off
hybrid_accuracy = round(hybrid_accuracy, 3)
print("Hybrid Accuracy:", hybrid_accuracy)

print('Hybrid Accuracy: ', hybrid_accuracy)

Hybrid Accuracy: 0.9
Hybrid Accuracy:  0.9
