In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import miceforest as mf


# Read 'Hystero-Clean_NA.csv', discard the 'Patient Pain Score' column, and
# keep only the rows in which 'PatientPainNew' (the label used later) is present.
data = (
    pd.read_csv('Hystero-Clean_NA.csv')
    .drop(columns=['Patient Pain Score'])
    .dropna(subset=['PatientPainNew'])
)

# Per-column count of the missing values that remain after the row filter.
data.isnull().sum()

Postmenopausal                                                                                                                                       0
History of Gyn Exams                                                                                                                                 0
Presurgical Analgesia                                                                                                                                0
Cervix preparation                                                                                                                                   0
Number of Births                                                                                                                                   156
Previous pregnancy complications SPSS                                                                                                                0
Gynae Indication                                                                              

In [2]:
# Recode the binary 'Yes'/'No' answers as 1/0.
#
# Series.map turns every value that is absent from the mapping into NaN, so a
# column holding anything other than 'Yes'/'No' loses those entries without
# raising. The loop below therefore records, per column, the non-missing values
# that could not be recoded (value -> row count); that dict is the cell output,
# so any loss is visible instead of silent. The recoded data written back to
# `data` is exactly what a plain .map() call produces.
#
# NOTE(review): the name of the last column advertises a 1/2/3 coding, which a
# 'Yes'/'No' mapping would not match -- confirm the intended mapping.
# NOTE(review): re-running this cell without reloading `data` maps the 1/0
# codes themselves to NaN; the report below will list them if that happens.
yes_no_codes = {'Yes': 1, 'No': 0}
yes_no_columns = [
    'Postmenopausal',
    'History of Gyn Exams',
    'Presurgical Analgesia',
    'Cervix preparation',
    'Previous pregnancy complications SPSS',
    'Have you had a hysteroscopy under general anaesthetic (while asleep) or in outpatients (awake) before today?RECODED [1:YES, 2:NO, 3:MISSING/NA]',
]

unmapped_values = {}
for column in yes_no_columns:
    original = data[column]
    recoded = original.map(yes_no_codes)
    # Entries that held a real value before the recode but are NaN afterwards.
    lost = original[original.notna() & recoded.isna()]
    if not lost.empty:
        unmapped_values[column] = lost.astype(str).value_counts().to_dict()
    data[column] = recoded

# Empty dict == every non-missing answer was recoded.
unmapped_values

## Features with missing values (number of missing)
* Number of Births                                                                                              156
* Gynae Indication                                                                                                8
* BMI                                                                                                            22
* Hysteroscopy Indication                                                                                        80
* Anaesthetic used                                                                                               38
* Diagnosis based on hysteroscopy                                                                                43
* Would you have hysteroscopy in outpatients again?                                                             248
* Was the comfort during hysteroscopy more or less than you expected?                                           258
* Were you satisfied with the overall experience during the hysteroscopy?                                       258
* Were you anxious / nervous about the hysteroscopy before you came today?                                      245
* Did the written hysteroscopy information you received before today make you feel anxious, nervous or scared?  252
* Doctor's Pain Assessment                                                                                        1

In [3]:
# Separate the predictors (X) from the label column (y). This is the
# feature/label separation only; the train/test split is done by a later cell.
target_column = 'PatientPainNew'
y = data[target_column]
X = data.drop(columns=[target_column])

In [4]:
# Fill the missing values with scikit-learn imputers.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Columns handled as categorical: gaps become the literal string "missing".
cat_features = [
    "Number of Births",
    "Gynae Indication",
    "Hysteroscopy Indication",
    "Anaesthetic used",
    "Diagnosis based on hysteroscopy",
    "Would you have hysteroscopy in outpatients again?",
    "Was the comfort during hysteroscopy more or less than you expected?",
    "Were you satisfied with the overall experience during the hysteroscopy?",
    "Were you anxious / nervous about the hysteroscopy before you came today?",
    "Did the written hysteroscopy information you received before today make you feel anxious, nervous or scared?",
    "Doctor's Pain Assessment",
]
# Columns handled as numeric: gaps become the column mean.
num_features = ["BMI"]

cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
num_imputer = SimpleImputer(strategy="mean")

# The transformer emits its output columns in the order the transformers are
# listed: the categorical block first, then the numeric block.
# NOTE(review): the imputer is fitted on every row of X, before the train/test
# split performed later, so the BMI mean also reflects rows that end up in the
# test set.
imputer = ColumnTransformer(
    transformers=[
        ("cat_imputer", cat_imputer, cat_features),
        ("num_imputer", num_imputer, num_features),
    ]
)
filled_X = imputer.fit_transform(X)

# Write the imputed values back into X, using the same column order the
# transformer produced.
imputed_columns = cat_features + num_features
X.loc[:, imputed_columns] = filled_X
X

Unnamed: 0,Postmenopausal,History of Gyn Exams,Presurgical Analgesia,Cervix preparation,Number of Births,Previous pregnancy complications SPSS,Gynae Indication,BMI,Cervix,Endometrial biopsy,...,Surgery Performed,Would you have hysteroscopy in outpatients again?,Was the comfort during hysteroscopy more or less than you expected?,Were you satisfied with the overall experience during the hysteroscopy?,Were you anxious / nervous about the hysteroscopy before you came today?,"Have you had a hysteroscopy under general anaesthetic (while asleep) or in outpatients (awake) before today?RECODED [1:YES, 2:NO, 3:MISSING/NA]","Did the written hysteroscopy information you received before today make you feel anxious, nervous or scared?",Doctor's Pain Assessment,Smear Pain Score,Venesect Pain Score
0,1,0.0,0.0,,3.0,0,1.0,48.10000,not_stenosed,successful,...,Hysteroscopic polypectomy,"Yes, definitely",More,Very satisfied,Very nervous,,missing,,no_pain,no_pain
1,1,0.0,1.0,,2.0,0,1.0,29.53925,not_stenosed,successful,...,Diagnostic hysteroscopy and biopsy,"Yes, definitely",The same,Satisfied,Slightly nervous,,Not at all nervous,Discomfort,mild,mild
2,0,0.0,0.0,,1.0,1,1.0,36.60000,not_stenosed,none,...,Hysteroscopic polypectomy,"Yes, definitely",More,Very satisfied,Not nervous,,Did not receive,,no_pain,mild
3,1,0.0,1.0,,2.0,0,1.0,35.60000,not_stenosed,successful,...,Diagnostic hysteroscopy and biopsy,"Yes, definitely",More,Very satisfied,Very nervous,,Very nervous,,mild,no_pain
4,0,1.0,1.0,,3.0,0,1.0,30.10000,stenosed,successful,...,Diagnostic hysteroscopy and biopsy,"Yes, definitely",More,Very satisfied,Very nervous,,Slightly nervous,,moderate,mild
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
710,0,0.0,0.0,,1.0,0,1.0,19.70000,not_stenosed,successful,...,Diagnostic hysteroscopy and biopsy,missing,missing,missing,missing,,missing,Discomfort,7,mild
711,0,0.0,1.0,,2.0,0,1.0,26.50000,not_stenosed,successful,...,Diagnostic hysteroscopy and biopsy,Not sure,Less,Very satisfied,Very nervous,,Slightly nervous,Discomfort,7,mild
712,0,1.0,0.0,,2.0,1,1.0,34.30000,not_stenosed,successful,...,Diagnostic hysteroscopy and biopsy,"Yes, probably",Less,Very satisfied,Very nervous,,Very nervous,Discomfort,mild,no_pain
713,0,0.0,1.0,,0.0,0,1.0,56.50000,not_stenosed,successful,...,Hysteroscopic polypectomy,"No, definitely",Less,Satisfied,Not nervous,,Not at all nervous,Moderate,moderate,mild


In [5]:
# Cast each imputed categorical column to str, one column at a time.
# `cat_features` (defined in the imputation cell) lists exactly these eleven
# columns in this order. After imputation a column can hold both its original
# values and the filler "missing"; presumably the cast is there so the encoder
# sees one type per column -- TODO confirm.
for column in cat_features:
    X[column] = X[column].astype(str)


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns to expand into one indicator column per distinct value.
categorical_features = [
    "Postmenopausal",
    "History of Gyn Exams",
    "Presurgical Analgesia",
    "Cervix preparation",
    "Number of Births",
    "Previous pregnancy complications SPSS",
    "Gynae Indication",
    "Cervix",
    "Endometrial biopsy",
    "Hysteroscopy Indication",
    "Anaesthetic used",
    "Diagnosis based on hysteroscopy",
    "Surgery Performed",
    "Would you have hysteroscopy in outpatients again?",
    "Was the comfort during hysteroscopy more or less than you expected?",
    "Were you satisfied with the overall experience during the hysteroscopy?",
    "Were you anxious / nervous about the hysteroscopy before you came today?",
    "Have you had a hysteroscopy under general anaesthetic (while asleep) or in outpatients (awake) before today?RECODED [1:YES, 2:NO, 3:MISSING/NA]",
    "Did the written hysteroscopy information you received before today make you feel anxious, nervous or scared?",
    "Doctor's Pain Assessment",
    "Smear Pain Score",
    "Venesect Pain Score",
]

# One-hot encode the listed columns; every column of X that is not listed is
# forwarded unchanged (remainder="passthrough").
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# The encoder is fitted on all rows of X; the train/test split comes afterwards.
transformered_X = transformer.fit_transform(X)
transformered_X

<715x96 sparse matrix of type '<class 'numpy.float64'>'
	with 16630 stored elements in Compressed Sparse Row format>

# Three main things we have to do:
1. Split the data into features and labels (usually 'X' and 'y')
2. Fill (also called imputing) or discard the missing values
3. Convert non-numerical values to numerical values (also called feature encoding)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded rows for testing; the fixed random_state makes
# the shuffle (and therefore the split) repeatable.
split_parts = train_test_split(transformered_X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Shapes of the four pieces produced by the split, in (train X, test X,
# train y, test y) order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((572, 96), (143, 96), (572,), (143,))

In [9]:
# Shapes of the feature frame and the label series before encoding/splitting.
tuple(obj.shape for obj in (X, y))

((715, 24), (715,))

## Make sure all the features are numerical

In [10]:
#dummies = pd.get_dummies(X[["Postmenopausal", "History of Gyn Exams", "Presurgical Analgesia","Cervix preparation", "Number of Births","Previous pregnancy complications SPSS", "Gynae Indication", "BMI", "Cervix", "Endometrial biopsy", "Hysteroscopy Indication", "Anaesthetic used", "Number of Polyps", "Diagnosis based on hysteroscopy", "Surgery Performed", "Would you have hysteroscopy in outpatients again?", "Was the comfort during hysteroscopy more or less than you expected?", "Were you satisfied with the overall experience during the hysteroscopy?", "Were you anxious / nervous about the hysteroscopy before you came today?","Have you had a hysteroscopy under general anaesthetic (while asleep) or in outpatients (awake) before today?RECODED [1:YES, 2:NO, 3:MISSING/NA]", "Did the written hysteroscopy information you received before today make you feel anxious, nervous or scared?", "Doctor's Pain Assessment", "Patient Pain Score", "Smear Pain Score", "Venesect Pain Score"]])
#dummies

## 1.2 What if there were missing values?
1. Fill them with some value (imputation)
2. Remove the samples with missing data altogether

https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [11]:
from sklearn.svm import LinearSVC

# Seed NumPy's global RNG before the estimator is built and fitted.
np.random.seed(42)

# Linear support-vector classifier with the iteration cap set to 10000;
# fit() returns the estimator itself, so `clf` is the fitted model.
clf = LinearSVC(max_iter=10000).fit(X_train, y_train)

# Mean accuracy on the held-out rows.
clf.score(X_test, y_test)



0.4825174825174825

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG before the forest is built and fitted.
np.random.seed(42)

# Replace the previous `clf` with a 100-tree random forest fitted on the
# training rows (fit() returns the estimator itself).
clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

# Mean accuracy on the held-out rows.
clf.score(X_test, y_test)


0.44755244755244755

In [17]:
# Display the one-hot-encoded feature matrix again (its repr reports the
# shape, dtype and storage format).
transformered_X

<715x96 sparse matrix of type '<class 'numpy.float64'>'
	with 16630 stored elements in Compressed Sparse Row format>

In [19]:
# Per-class probabilities for the first five held-out rows; one output column
# per class known to the fitted classifier.
first_test_rows = X_test[:5]
clf.predict_proba(first_test_rows)

array([[0.43, 0.09, 0.43, 0.05],
       [0.24, 0.56, 0.02, 0.18],
       [0.27, 0.23, 0.47, 0.03],
       [0.42, 0.4 , 0.03, 0.15],
       [0.26, 0.56, 0.  , 0.18]])

In [20]:
# Predicted class label for each of the first five held-out rows.
leading_test_rows = X_test[:5]
clf.predict(leading_test_rows)

array(['mild', 'moderate', 'no_pain', 'mild', 'moderate'], dtype=object)

In [21]:
# Evaluate the current `clf` with its own score() method: mean accuracy on the
# held-out rows.
holdout_accuracy = clf.score(X_test, y_test)
holdout_accuracy


0.44755244755244755

In [23]:
# Mean accuracy on the rows the model was fitted on. A value far above the
# held-out accuracy means the model has memorised the training rows rather than
# learned something that generalises.
training_accuracy = clf.score(X_train, y_train)
training_accuracy

1.0

In [25]:
# Cross-validated evaluation of `clf` on the full encoded data set.
from sklearn.model_selection import cross_val_score

# No `scoring` argument is passed, so each fold is scored with the estimator's
# own default score. cv=50 asks for 50 folds, so every fold is small relative
# to the data set -- NOTE(review): confirm that so many folds is intended.
fold_scores = cross_val_score(clf, transformered_X, y, cv=50)
fold_scores

array([0.6       , 0.6       , 0.46666667, 0.26666667, 0.4       ,
       0.4       , 0.4       , 0.33333333, 0.46666667, 0.53333333,
       0.53333333, 0.46666667, 0.53333333, 0.4       , 0.46666667,
       0.5       , 0.21428571, 0.42857143, 0.28571429, 0.5       ,
       0.71428571, 0.64285714, 0.42857143, 0.71428571, 0.5       ,
       0.28571429, 0.35714286, 0.35714286, 0.42857143, 0.28571429,
       0.71428571, 0.57142857, 0.42857143, 0.28571429, 0.42857143,
       0.5       , 0.42857143, 0.35714286, 0.5       , 0.57142857,
       0.57142857, 0.42857143, 0.42857143, 0.57142857, 0.42857143,
       0.35714286, 0.5       , 0.42857143, 0.42857143, 0.35714286])

In [27]:
# Area under the ROC curve
# NOTE(review): roc_curve is imported here but not called in this cell.
# scikit-learn documents roc_curve as a binary-classification function, while
# predict_proba on this classifier returns one column per class -- confirm how
# the multi-class labels are meant to be reduced before it is used.
from sklearn.metrics import roc_curve

# Per-class probability estimates for every held-out row (rows follow X_test).
y_probs = clf.predict_proba(X_test)


array([[0.43, 0.09, 0.43, 0.05],
       [0.24, 0.56, 0.02, 0.18],
       [0.27, 0.23, 0.47, 0.03],
       [0.42, 0.4 , 0.03, 0.15],
       [0.26, 0.56, 0.  , 0.18],
       [0.57, 0.23, 0.12, 0.08],
       [0.62, 0.26, 0.06, 0.06],
       [0.34, 0.45, 0.03, 0.18],
       [0.26, 0.62, 0.04, 0.08],
       [0.4 , 0.39, 0.02, 0.19],
       [0.48, 0.28, 0.2 , 0.04],
       [0.15, 0.69, 0.  , 0.16],
       [0.22, 0.3 , 0.  , 0.48],
       [0.5 , 0.33, 0.11, 0.06],
       [0.29, 0.54, 0.03, 0.14],
       [0.22, 0.54, 0.04, 0.2 ],
       [0.3 , 0.56, 0.06, 0.08],
       [0.24, 0.6 , 0.04, 0.12],
       [0.33, 0.45, 0.04, 0.18],
       [0.52, 0.36, 0.04, 0.08],
       [0.56, 0.24, 0.05, 0.15],
       [0.39, 0.41, 0.06, 0.14],
       [0.37, 0.28, 0.02, 0.33],
       [0.4 , 0.29, 0.1 , 0.21],
       [0.39, 0.27, 0.07, 0.27],
       [0.48, 0.2 , 0.23, 0.09],
       [0.35, 0.47, 0.04, 0.14],
       [0.32, 0.59, 0.03, 0.06],
       [0.68, 0.25, 0.  , 0.07],
       [0.14, 0.27, 0.  , 0.59],
       [0.