In [1]:
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer

In [2]:
df = pd.read_csv("ColonInColumns.csv")

In [3]:
df.text

0       Hospital: Random NHS Foundation Trust\nHospita...
1       Hospital: Random NHS Foundation Trust\nHospita...
2       Hospital: Random NHS Foundation Trust\r\nHospi...
3       Hospital: Random NHS Foundation Trust\r\nHospi...
4       Hospital: Random NHS Foundation Trust\r\nHospi...
                              ...                        
2100    Hospital: Random NHS Foundation Trust\r\nHospi...
2101    Hospital: Random NHS Foundation Trust\r\nHospi...
2102    Hospital: Random NHS Foundation Trust\r\nHospi...
2103    Hospital: Random NHS Foundation Trust\r\nHospi...
2104    Hospital: Random NHS Foundation Trust\r\nHospi...
Name: text, Length: 2105, dtype: object

In [4]:

df.text[0]


'Hospital: Random NHS Foundation Trust\nHospital Number: H2890235\nPatient Name:  al-Bilal, Widdad\nGeneral Practitioner: Dr. Mondragon, Amber\nDate of procedure:  2002-11-03\nEndoscopist: Dr. Monroe, Danjai\nSecond endoscopist: Dr. Hernandez, Valerie\nMedications: Fentanyl  12.5mcg\nMidazolam  5mg\nInstrument:  FC7\nExtent of Exam:  Descending Colon\nIndications: IBD Surveillance\nProcedure Performed: Colonoscopy\nFindings:  He reports taking a week of ibuprofen for a tooth infection stopping them last tuesday.,SIGMOID : Moderate diverticulsosis.,Tortuous sigmoid colon.,Biopsies taken from TI adn R&L colon.,Otherwise normal.,There were 2 subepithelial lipomas in the caecum and ascending colon .,1 aphthous ulcer only, not CD type, in ileum.,Multiple hyperplastic appearing polyps at rectosigmoid largest 6mm - one representative polyp taken.,Normal colonic mucosa to the caecum.\n'

In [5]:
def regex_hosp(string):
    hospital_reg = r"\.*Hospital:.*"
    return re.findall(hospital_reg, string)[0]
df["foundation_trust"] = df['text'].apply(regex_hosp)

In [6]:
def regex_hosp_num(string):
    hospital_reg = r"\.*Hospital Number.*"
    return re.findall(hospital_reg, string)[0]
df["hospital_num"] = df['text'].apply(regex_hosp_num)

In [7]:
def regex_patient_name(string):
    hospital_reg = r"\.*Patient Name:.*"
    line = re.findall(hospital_reg, string)[0]
    line_string =  line.split(":")[1]
    retrn_string = line_string.strip()
    if retrn_string[-1:] == "\r":
        return retrn_string[:-1]
    else:
        return retrn_string
df["patient_name"] = df['text'].apply(regex_patient_name)

In [8]:
def regex_GP(string):
    hospital_reg = r"\.*General Practitioner:.*"
    retrn_string = re.findall(hospital_reg, string)[0]
    if retrn_string[-1:] == "\r":
        return retrn_string[:-1]
    else:
        return retrn_string
df["gp"] = df['text'].apply(regex_GP)

In [9]:
def regex_procedure_date(string):
    hospital_reg = r"\.*Date of procedure:.*"
    line =  re.findall(hospital_reg, string)[0]
    retrn_string =  line.split(":")[1]
    if retrn_string[-1:] == "\r":
        return retrn_string[:-1]
    else:
        return retrn_string
df["procedure_date"] = df['text'].apply(regex_procedure_date)

In [10]:
def regex_endoscopist(string):
    hospital_reg = r"\.*Endoscopist:.*"
    retrn_string = re.findall(hospital_reg, string)[0]
    if retrn_string[-1:] == "\r":
        return retrn_string[:-1]
    else:
        return retrn_string
df["endoscopist"] = df['text'].apply(regex_endoscopist)

In [11]:
def regex_2nd_endoscopist(string):
    hospital_reg = r"\.*Second endoscopist:.*"
    retrn_string= re.findall(hospital_reg, string)[0]
    if retrn_string[-1:] == "\r":
        return retrn_string
    else:
        return retrn_string
df["second_endoscopist"] = df['text'].apply(regex_2nd_endoscopist)

In [12]:
def regex_medication(string):
    hospital_reg = r"\d*.\dmcg"
    retrn_string= re.findall(hospital_reg, string)[0]
    if retrn_string[-1:] == "\r":
        return float(retrn_string[:-4])
    else:
        return float(retrn_string[:-3])
df["medications_fentynl"] = df['text'].apply(regex_medication)

In [13]:
def regex_midazolam(string):
    hospital_reg = r"\.*Midazolam.*"
    line = re.findall(hospital_reg, string)[0]
    retrn_string =  line.split()[1]
    if retrn_string[-1:] == "\r":
        return int(retrn_string[:-3])
    else:
        return int(retrn_string[:-2])
df["midazolam"] = df['text'].apply(regex_midazolam)

In [14]:
def regex_instrument(string):
    hospital_reg = r"\.*Instrument.*"
    line = re.findall(hospital_reg, string)[0]
    retrn_string = line.split(":")[1]
    if retrn_string[-1:] == "\r":
        return retrn_string[:-1]
    else:
        return retrn_string
df["instrument"] = df['text'].apply(regex_instrument)

In [15]:
def regex_extent(string):
    hospital_reg = r"\.*Extent of Exam:.*"
    line = re.findall(hospital_reg, string)[0]
    retrn_string = line.split(":")[1]
    if retrn_string[-1:] == "\r":
        return retrn_string[:-1]
    else:
        return retrn_string
df["extent_of_exam"] = df['text'].apply(regex_extent)

In [16]:
def regex_indications(string):
    hospital_reg = r"\.*Indications:.*"
    line = re.findall(hospital_reg, string)[0]
    retrn_string= line.split(":")[1]
    if retrn_string[-1:] == "\r":
        return retrn_string[:-1]
    else:
        return retrn_string
df["indications"] = df['text'].apply(regex_indications)

In [17]:
def regex_procedure(string):
    hospital_reg = r"\.*Procedure Performed:.*"
    line = re.findall(hospital_reg, string)[0]
    retrn_string = line.split(":")[1]
#     return retrn_string[-1:]
    if retrn_string[-1:] == "\r":
        return retrn_string[:-1]
    else:
        return retrn_string
     


In [18]:
regex_procedure(df.text[3])

' Colonoscopy'

In [19]:
df["procedure_performed"] = df['text'].apply(regex_procedure)

In [20]:
def regex_findings(string):
    hospital_reg = r"\.*Findings:.*"
    line = re.findall(hospital_reg, string)[0][10:]
    return line
df["findings"] = df['text'].apply(regex_findings)

In [21]:
regex_procedure(df.text[3])

' Colonoscopy'

In [22]:
df.shape

(2105, 16)

In [23]:
df.head()

Unnamed: 0,Num,text,foundation_trust,hospital_num,patient_name,gp,procedure_date,endoscopist,second_endoscopist,medications_fentynl,midazolam,instrument,extent_of_exam,indications,procedure_performed,findings
0,1,Hospital: Random NHS Foundation Trust\nHospita...,Hospital: Random NHS Foundation Trust,Hospital Number: H2890235,"al-Bilal, Widdad","General Practitioner: Dr. Mondragon, Amber",2002-11-03,"Endoscopist: Dr. Monroe, Danjai","Second endoscopist: Dr. Hernandez, Valerie",12.5,5,FC7,Descending Colon,IBD Surveillance,Colonoscopy,He reports taking a week of ibuprofen for a t...
1,2,Hospital: Random NHS Foundation Trust\nHospita...,Hospital: Random NHS Foundation Trust,Hospital Number: U3931964,"Martin, Kimberlyn","General Practitioner: Dr. Hasenack, Kyilyn",2009-04-19,"Endoscopist: Dr. Hasling, Emily","Second endoscopist: Dr. Guillory, Ashley",25.0,5,FC1,Descending Colon,Fe deficiency anaemia,Colonoscopy,"Mild erythema in rectum only.,ASCENDING COLON..."
2,3,Hospital: Random NHS Foundation Trust\r\nHospi...,Hospital: Random NHS Foundation Trust\r,Hospital Number: L5789714\r,"Chaney, Alexis","General Practitioner: Dr. Schlekeway, Larissa",2009-12-22,"Endoscopist: Dr. Phillips, Harlie","Second endoscopist: Dr. Fultz, Courtney\r",50.0,3,FC3,Recum,Chronic abdominal pain,Colonoscopy,solid and liquid stool throughout limitingcol...
3,4,Hospital: Random NHS Foundation Trust\r\nHospi...,Hospital: Random NHS Foundation Trust\r,Hospital Number: S4873283\r,"Gozeh, Alexandra","General Practitioner: Dr. Allen, Jasmine",2001-11-25,"Endoscopist: Dr. al-Saade, Waajida","Second endoscopist: Dr. Fultz, Courtney\r",50.0,3,FC3,Recum,Planned polypectomy,Colonoscopy,Difficult procedure due to looping and patien...
4,5,Hospital: Random NHS Foundation Trust\r\nHospi...,Hospital: Random NHS Foundation Trust\r,Hospital Number: D6009686\r,"Leon, Nataya","General Practitioner: Dr. Cotie, Jasmine",2011-11-25,"Endoscopist: Dr. Confer, Mouneek","Second endoscopist: Dr. Guillory, Ashley\r",150.0,2,FC4,Descending Colon,Fe deficiency anaemia,Colonoscopy,"Random biopsies taken.,Very limited views.,No..."


In [24]:
## having a look at value counts ##

In [25]:
df.medications_fentynl.value_counts()

125.0    344
50.0     322
100.0    311
150.0    293
75.0     285
12.5     278
25.0     272
Name: medications_fentynl, dtype: int64

In [26]:
df.midazolam.value_counts()

7    335
1    315
2    310
4    298
3    297
5    282
6    268
Name: midazolam, dtype: int64

In [27]:
df.extent_of_exam.value_counts()

  Recum                315
  Sigmoid              313
  Transverse Colon     311
  Ascending Colon      296
  Descending Colon     295
  Caecum               294
  Failed intubation    281
Name: extent_of_exam, dtype: int64

In [28]:
df.procedure_performed.value_counts()

 Colonoscopy    2105
Name: procedure_performed, dtype: int64

In [29]:
df.indications.value_counts()

 Family History CRC         217
 Fe deficiency anaemia      204
 Planned polypectomy        184
 Diarrrhoea                 183
 Chronic abdominal pain     182
 Therapeutic- Dilatation    170
 PR Bleeding                167
 Weight loss                163
 Other-                     163
 IBD Surveillance           158
 Nausea and/or Vomiting     158
 Abnormal Imaging           156
Name: indications, dtype: int64

In [30]:
df.endoscopist.value_counts()

Endoscopist: Dr. Phillips, Harlie          235
Endoscopist: Dr. Monroe, Danjai            232
Endoscopist: Dr. Hasling, Emily            220
Endoscopist: Dr. Rahat, Janisa             218
Endoscopist: Dr. al-Saade, Waajida         217
Endoscopist: Dr. Confer, Mouneek           217
Endoscopist: Dr. al-Huda, Ummu Kulthoom    204
Endoscopist: Dr. Steinbach, Lindsey        191
Endoscopist: Dr. Gebregzabheir, Kiara      190
Endoscopist: Dr. Rubio, Jessica            181
Name: endoscopist, dtype: int64

In [31]:
df.patient_name.value_counts()

Muniz, Domonique        3
Tapia, Randi            3
Patterson, Alyssa       2
Gay, Jamya              2
al-Daoud, Afeefa        2
                       ..
Valles Torres, Nubia    1
Lee, Y Nhi              1
Weiner, Brooklynn       1
Horsman, Holly          1
el-Noor, Mahdhoodha     1
Name: patient_name, Length: 1999, dtype: int64

In [32]:
df.loc[df['patient_name'] == "Muniz, Domonique"]

Unnamed: 0,Num,text,foundation_trust,hospital_num,patient_name,gp,procedure_date,endoscopist,second_endoscopist,medications_fentynl,midazolam,instrument,extent_of_exam,indications,procedure_performed,findings
430,431,Hospital: Random NHS Foundation Trust\r\nHospi...,Hospital: Random NHS Foundation Trust\r,Hospital Number: T1640766\r,"Muniz, Domonique","General Practitioner: Dr. al-Raad, Shameema",2015-05-02,"Endoscopist: Dr. Gebregzabheir, Kiara","Second endoscopist: Dr. Orozco, Amber\r",100.0,6,FC6,Sigmoid,Nausea and/or Vomiting,Colonoscopy,Procedure limited to the sigmoid due to poor ...
431,432,Hospital: Random NHS Foundation Trust\r\nHospi...,Hospital: Random NHS Foundation Trust\r,Hospital Number: T1640766\r,"Muniz, Domonique","General Practitioner: Dr. al-Raad, Shameema",2015-05-02,"Endoscopist: Dr. Gebregzabheir, Kiara","Second endoscopist: Dr. Orozco, Amber\r",100.0,6,FC6,Sigmoid,Nausea and/or Vomiting,Colonoscopy,Procedure limited to the sigmoid due to poor ...
432,433,Hospital: Random NHS Foundation Trust\r\nHospi...,Hospital: Random NHS Foundation Trust\r,Hospital Number: T1640766\r,"Muniz, Domonique","General Practitioner: Dr. al-Raad, Shameema",2015-05-02,"Endoscopist: Dr. Gebregzabheir, Kiara","Second endoscopist: Dr. Orozco, Amber\r",100.0,6,FC6,Sigmoid,Nausea and/or Vomiting,Colonoscopy,Procedure limited to the sigmoid due to poor ...


In [33]:
df.findings.shape

(2105,)

In [34]:
#df.findings.value_counts().sort_values(ascending=False)

In [35]:
df_doc_drugs = df.groupby(df.endoscopist).mean()

In [36]:
df_doc_drugs.head()

Unnamed: 0_level_0,Num,medications_fentynl,midazolam
endoscopist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Endoscopist: Dr. Confer, Mouneek",1103.769585,81.278802,3.898618
"Endoscopist: Dr. Gebregzabheir, Kiara",985.426316,71.907895,4.094737
"Endoscopist: Dr. Hasling, Emily",1041.390909,72.102273,3.85
"Endoscopist: Dr. Monroe, Danjai",1071.573276,85.668103,4.150862
"Endoscopist: Dr. Phillips, Harlie",1082.140426,76.43617,3.87234


In [37]:
df_indications = df.groupby(df.indications).mean()

In [38]:
df_indications.head()

Unnamed: 0_level_0,Num,medications_fentynl,midazolam
indications,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abnormal Imaging,1048.339744,73.477564,3.762821
Chronic abdominal pain,1154.049451,77.678571,3.895604
Diarrrhoea,1000.885246,75.34153,3.918033
Family History CRC,1027.714286,78.456221,4.0553
Fe deficiency anaemia,1090.166667,87.438725,4.093137


In [39]:
df_extent_exam = df.groupby(df.extent_of_exam).mean()

In [40]:
df_extent_exam.head()

Unnamed: 0_level_0,Num,medications_fentynl,midazolam
extent_of_exam,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ascending Colon,1046.483108,77.111486,4.125
Caecum,1042.57483,82.397959,4.122449
Descending Colon,1042.888136,79.70339,4.077966
Failed intubation,993.088968,79.982206,3.836299
Recum,1043.08254,77.619048,4.057143


In [43]:
df.findings[800]

' Single pseudopolyp with a necrotic looking head - removed with hot snare, some ooze from base, two resolution clips applied .,Clip applied with good effect due to ooze.,No gross abnormality seen but small polyps might have been missed.,Floppy and looping left colon which I suspect is contributing to his symptoms.,x2 angiodysplasia seen with small overlying clots.,ASCENDING COLON : and CAECUM: multiple ulcers involving the caecum and caecum/ascending junction, with cobblestoning and distortion of anatomy.,Diverticulosis in the sigmoid to the mid transverse colon - inverted diverticulum in sigmoid.,Left colon was looping and twisting.,Views upto distal sigmoid poor but no large lesions seen.,Mild and Patchy Inflammation- Proctitis.\r'