In [5]:
import pandas as pd

# Relative path to the comments export; resolved against the kernel's working directory.
file_path = 'dataset.csv'

# Read the CSV file into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which usually means the
# file was written together with its index. Passing index_col=0 would likely avoid that
# extra column -- confirm nothing downstream relies on it before changing.
df = pd.read_csv(file_path)

# Display the first five rows as a quick sanity check of the load.
df.head()

Unnamed: 0,Id,Hospital,Type,Comment,CleanedComment,ExperienceDate,ExperienceDateString,Valence,Unit,Code,CommentLength
0,106,Almonte General Hospital,Inpatient,"Compassionate care, such professional staff fr...",compassionate professional staff emergency adm...,2021-02-01 00:00:00+00:00,2/1/2021,Positive,,,170
1,430,Almonte General Hospital,Inpatient,I was very pleased with the care & concern giv...,please given thankful health absolutely commen...,2020-06-01 00:00:00+00:00,6/1/2020,Positive,,,154
2,650,Almonte General Hospital,Inpatient,Thankfully I was admitted into the last surplu...,admit surplus wsa private washroom button fabu...,2017-08-01 00:00:00+00:00,8/1/2017,Positive,,,306
3,715,Almonte General Hospital,Inpatient,The nurses and doctors made giving birth durin...,staff staff birth pandemic possible given circ...,2020-05-01 00:00:00+00:00,5/1/2020,Positive,,,680
4,719,Almonte General Hospital,Inpatient,The nurses at the hospital were amazing! Could...,staff thankful,2018-06-01 00:00:00+00:00,6/1/2018,Positive,,,87


In [6]:
import re

def clean_type(value):
    """Normalise a raw ``Type`` cell into a lowercase lookup key.

    The value is lower-cased, every character other than ``a-z``, ``0-9`` and
    the space is removed, runs of spaces are collapsed to one, and surrounding
    spaces are trimmed.

    Parameters
    ----------
    value : object
        Raw cell value. Missing values (anything ``pd.isna`` reports as
        missing) yield ``None``; non-string scalars are converted with ``str``.

    Returns
    -------
    str or None
        The normalised key, or ``None`` when the input is missing or nothing
        remains after cleaning.
    """
    if pd.isna(value):
        return None
    # Coerce first so a stray numeric cell does not raise AttributeError.
    text = str(value).lower()
    text = re.sub(r'[^a-z0-9 ]', '', text)
    # Trim only AFTER punctuation is gone: removing a leading or trailing
    # symbol (e.g. 'General .') can expose a space that an up-front strip()
    # would never see, and the padded key would then miss every lookup.
    text = re.sub(r'\s+', ' ', text).strip()
    return text if text else None

In [7]:
# Canonical label paired with every raw spelling that should resolve to it.
# Groups are listed in lookup-table order; 'General' appears twice so that the
# resulting dict keeps the same key order as a flat literal would.
# NOTE(review): spellings containing punctuation or a double space
# ('general.', 'general,', 'med/surg', 'service alert.', 'service. alert',
# 'service  alert', 'service .alert') cannot be produced by clean_type, which
# strips non-alphanumerics and collapses whitespace -- they only matter if
# this table is also consulted with uncleaned values.
_TYPE_VARIANT_GROUPS = (
    ('General', (
        'general',
        'general general',
        'i general',
        'general.',
        'general,',
        'generol',
        'generpl',
        'genero',
    )),
    ('Inpatient', ('inpatient', 'inpatientic')),
    ('Emergency', ('emergency', 'emergency department', 'emergency care', 'ed')),
    ('Pediatric', ('edpec', 'ppec')),
    ('Day Surgery', ('day surgery',)),
    ('Rehabilitation', ('rehab', 'rehabilitation')),
    ('Medical/Surgical', ('medical', 'surgical', 'medsurg', 'med/surg')),
    ('Maternity', ('mat', 'maternity')),
    ('Psychiatric', ('cpes',)),
    ('Outpatient', ('ambulatory', 'outpatient')),
    ('General', ('contact requested',)),
    ('Service Alert', (
        'service alert',
        'service alert.',
        'service. alert',
        'service  alert',
        'service .alert',
        'serv0 alert',
        'setvice alert',
        'servite alert',
    )),
)

# Flat spelling -> canonical label lookup used when normalising the Type column.
type_mapping = {
    spelling: label
    for label, spellings in _TYPE_VARIANT_GROUPS
    for spelling in spellings
}

In [8]:
def map_processed_type(raw_type):
    """Translate a raw ``Type`` value into its canonical category label.

    The value is normalised with ``clean_type`` and looked up in
    ``type_mapping``. A key with no entry falls back to its title-cased
    form; a value that cleans to nothing yields ``None``.
    """
    key = clean_type(raw_type)
    if key is not None:
        fallback = key.title()
        return type_mapping.get(key, fallback)
    return None

In [9]:
# Derive the canonical category for every row from the raw 'Type' column.
df['ProcessedType'] = df['Type'].map(map_processed_type)

In [10]:
# Preview the first rows to check the 'ProcessedType' column added to df.
df.head()

Unnamed: 0,Id,Hospital,Type,Comment,CleanedComment,ExperienceDate,ExperienceDateString,Valence,Unit,Code,CommentLength,ProcessedType
0,106,Almonte General Hospital,Inpatient,"Compassionate care, such professional staff fr...",compassionate professional staff emergency adm...,2021-02-01 00:00:00+00:00,2/1/2021,Positive,,,170,Inpatient
1,430,Almonte General Hospital,Inpatient,I was very pleased with the care & concern giv...,please given thankful health absolutely commen...,2020-06-01 00:00:00+00:00,6/1/2020,Positive,,,154,Inpatient
2,650,Almonte General Hospital,Inpatient,Thankfully I was admitted into the last surplu...,admit surplus wsa private washroom button fabu...,2017-08-01 00:00:00+00:00,8/1/2017,Positive,,,306,Inpatient
3,715,Almonte General Hospital,Inpatient,The nurses and doctors made giving birth durin...,staff staff birth pandemic possible given circ...,2020-05-01 00:00:00+00:00,5/1/2020,Positive,,,680,Inpatient
4,719,Almonte General Hospital,Inpatient,The nurses at the hospital were amazing! Could...,staff thankful,2018-06-01 00:00:00+00:00,6/1/2018,Positive,,,87,Inpatient


In [11]:
# Parse the timestamp strings into timezone-aware datetimes. utc=True keeps the
# column a single UTC datetime dtype even if some rows carry a different
# offset; without it, mixed offsets do not produce a datetime column and the
# `.dt` accessors below fail. For the all-'+00:00' values shown in the preview
# the parsed result is unchanged.
df['ExperienceDate'] = pd.to_datetime(df['ExperienceDate'], utc=True)

# Calendar components, taken from the UTC timestamp.
df['Year'] = df['ExperienceDate'].dt.year
df['Month'] = df['ExperienceDate'].dt.month
df['Day'] = df['ExperienceDate'].dt.day
df.head()

Unnamed: 0,Id,Hospital,Type,Comment,CleanedComment,ExperienceDate,ExperienceDateString,Valence,Unit,Code,CommentLength,ProcessedType,Year,Month,Day
0,106,Almonte General Hospital,Inpatient,"Compassionate care, such professional staff fr...",compassionate professional staff emergency adm...,2021-02-01 00:00:00+00:00,2/1/2021,Positive,,,170,Inpatient,2021,2,1
1,430,Almonte General Hospital,Inpatient,I was very pleased with the care & concern giv...,please given thankful health absolutely commen...,2020-06-01 00:00:00+00:00,6/1/2020,Positive,,,154,Inpatient,2020,6,1
2,650,Almonte General Hospital,Inpatient,Thankfully I was admitted into the last surplu...,admit surplus wsa private washroom button fabu...,2017-08-01 00:00:00+00:00,8/1/2017,Positive,,,306,Inpatient,2017,8,1
3,715,Almonte General Hospital,Inpatient,The nurses and doctors made giving birth durin...,staff staff birth pandemic possible given circ...,2020-05-01 00:00:00+00:00,5/1/2020,Positive,,,680,Inpatient,2020,5,1
4,719,Almonte General Hospital,Inpatient,The nurses at the hospital were amazing! Could...,staff thankful,2018-06-01 00:00:00+00:00,6/1/2018,Positive,,,87,Inpatient,2018,6,1


In [1]:
!pip install scipy
!pip install scispacy



In [2]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bc5cdr_md-0.5.3.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bc5cdr_md-0.5.3.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bc5cdr_md-0.5.3.tar.gz (119.8 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting spacy<3.7.0,>=3.6.1 (from en-ner-bc5cdr-md==0.5.3)
  Using cached spacy-3.6.1-cp39-cp39-win_amd64.whl.metadata (26 kB)
Collecting thinc<8.2.0,>=8.1.8 (from spacy<3.7.0,>=3.6.1->en-ner-bc5cdr-md==0.5.3)
  Using cached thinc-8.1.12-cp39-cp39-win_amd64.whl.metadata (15 kB)
Using cached spacy-3.6.1-cp39-cp39-win_amd64.whl (12.1 MB)
Using cached thinc-8.1.12-cp39-cp39-win_amd64.whl (1.5 MB)
Installing collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.2.5
    Uninstalling thinc-8.2.5:
      Successfully uninstalled thinc-8.2.5
  Attempting uninstall: spacy
    Found existing installation: spacy 3.7.5


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-sci-scibert 0.5.1 requires spacy<3.5.0,>=3.4.1, but you have spacy 3.6.1 which is incompatible.
scispacy 0.5.5 requires spacy<3.8.0,>=3.7.0, but you have spacy 3.6.1 which is incompatible.
