In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
from collections import Counter

# DiCE imports
import dice_ml
from dice_ml.utils import helpers  

In [51]:
dataset = pd.read_csv('risk_factors_cervical_cancer.csv')
dataset = dataset.drop(['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis', 'Hinselmann', 'Schiller',
       'Citology'],axis=1)
dataset = dataset.replace('?',np.NaN)
dataset = dataset.dropna(how='any')
LE = LabelEncoder()
dataset['Biopsy'] = LE.fit_transform(dataset['Biopsy'])
display(dataset.head())
print(dataset['Biopsy'].value_counts())

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0,1,0,1,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
5,42,3.0,23.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0


0    623
1     45
Name: Biopsy, dtype: int64


In [52]:
target = dataset["Biopsy"]
X = dataset.drop(['Biopsy'],axis=1)
    
ros = SMOTE()
# resampling X, y
dataset, target = ros.fit_resample(X, target)
# new class distribution 
print(Counter(target))
print(type(dataset))

dataset = pd.concat([dataset,target],axis=1)

for col in dataset.columns:
    dataset[col] = dataset[col].astype(str).astype(float)

train_dataset, test_dataset, y_train, y_test = train_test_split(dataset,
                                                                target,
                                                                test_size=0.2,
                                                                random_state=0,
                                                                stratify=target)
x_train = train_dataset.drop('Biopsy', axis=1)
x_test = test_dataset.drop('Biopsy', axis=1)

Counter({0: 623, 1: 623})
<class 'pandas.core.frame.DataFrame'>


In [53]:
dataset.columns

Index(['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD',
       'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis',
       'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
       'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
       'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
       'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
       'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Biopsy'],
      dtype='object')

In [54]:
d = dice_ml.Data(dataframe=train_dataset, continuous_features=['Age','First sexual intercourse','Smokes (years)','Smokes (packs/year)','Hormonal Contraceptives (years)','IUD (years)'], outcome_name='Biopsy')

In [55]:
numerical = ['Age','First sexual intercourse','Smokes (years)','Smokes (packs/year)','Hormonal Contraceptives (years)','IUD (years)']
categorical = x_train.columns.difference(numerical)

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.

clf_rf = Pipeline(steps=[('preprocessor', transformations),
                      ('clf', RandomForestClassifier(random_state=42))])

clf_svm = Pipeline(steps=[('preprocessor', transformations),
                      ('clf', SVC(random_state=42))])

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
                   'clf__max_features': ['auto', 'sqrt'],
                   'clf__n_estimators': [int(x) for x in np.linspace(start = 128, stop = 384, num = 32)],
                   'clf__min_samples_split': [2, 5, 10],
                   'clf__max_depth': [int(x) for x in np.linspace(start = 5, stop = 8, num = 1)]}]

grid_params_svm = [{'clf__kernel': ['poly', 'rbf','sigmoid'], 
        'clf__C': [0.1, 1, 10, 100, 1000],
        'clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001]}]

jobs = -1

RF = GridSearchCV(estimator=clf_rf,
            param_grid=grid_params_rf,
            scoring='roc_auc',
            cv=10, 
            n_jobs=jobs)


SVM = GridSearchCV(estimator=clf_svm,
            param_grid=grid_params_svm,
            scoring='roc_auc',
            cv=10,
            n_jobs=jobs)

grids = [RF,SVM]

In [56]:
# Creating a dict for our reference
grid_dict = {0: 'Random Forest', 
        1: 'Support Vector Machine'}

best_rf = {'score': 0, 'best_model':0}
best_svm = {'score': 0, 'best_model':0}

# Fit the grid search objects
print('Performing model optimizations...')

for idx, model in enumerate(grids):
    
    print('\nEstimator: %s' % grid_dict[idx])
    model.fit(x_train, y_train)
    print('Best params are : %s' % model.best_params_)
    
    # Best training data accuracy
    print('Best training accuracy: %.3f' % model.best_score_)
    
    # Predict on test data with best params
    y_pred = model.predict(x_test)
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    print('Precision score: {}'.format(precision_score(y_test, y_pred)))
    print('Recall score: {}'.format(recall_score(y_test, y_pred)))
    print('F1 score: {}'.format(f1_score(y_test, y_pred)))
    print('AUC-ROC score: {}'.format(roc_auc_score(y_test, y_pred)))
    
    # Track best (highest test accuracy) model
    if idx == 0:
        if roc_auc_score(y_test, y_pred) > best_rf['score']:
            best_rf['score'] = roc_auc_score(y_test, y_pred)
            best_rf['best_model'] = model.best_params_ 
    
    elif idx == 1:
        if roc_auc_score(y_test, y_pred) > best_svm['score']:
            best_svm['score'] = roc_auc_score(y_test, y_pred)
            best_svm['best_model'] = model.best_params_
            
# save dict to file
import json

with open('best_cervicalcancer_rf_params.txt', 'w') as file:
     file.write(json.dumps(best_rf)) # use `json.loads` to do the reverse
        
with open('best_cervicalcancer_svm_params.txt', 'w') as file:
     file.write(json.dumps(best_svm)) # use `json.loads` to do the reverse

Performing model optimizations...

Estimator: Random Forest
Best params are : {'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__max_features': 'auto', 'clf__min_samples_split': 5, 'clf__n_estimators': 243}
Best training accuracy: 0.940
Test set accuracy score for best params: 0.856 
Precision score: 0.9405940594059405
Recall score: 0.76
F1 score: 0.8407079646017699
AUC-ROC score: 0.856

Estimator: Support Vector Machine
Best params are : {'clf__C': 1, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}
Best training accuracy: 0.970
Test set accuracy score for best params: 0.928 
Precision score: 1.0
Recall score: 0.856
F1 score: 0.9224137931034483
AUC-ROC score: 0.9279999999999999
