In [None]:
import re

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

pd.options.future.infer_string = True

In [None]:
def to_snake_case_columns(df):
    """Rename the columns of ``df`` to snake_case, in place, and return ``df``.

    Each label is converted to text, every run of characters other than ASCII
    letters and digits is collapsed to a single underscore, leading/trailing
    underscores are stripped, and the result is lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (not a copy).
    """
    # str() lets non-string labels (e.g. integer headers) through; re.sub
    # raises TypeError on anything that is not str/bytes.
    df.columns = [
        re.sub(r'[^0-9a-zA-Z]+', '_', str(col)).strip('_').lower()
        for col in df.columns
    ]
    return df

In [None]:
# Force these three columns to be read as strings.
text_columns = {
    'Attendance_Category': 'str',
    'Treatment_Function_Code': 'str',
    'Palliative_Care_Description': 'str',
}
df = pd.read_csv('../data/raw/sat_hack_data.csv', dtype=text_columns)

# Normalise the headers to snake_case before any column is referenced by name.
df = to_snake_case_columns(df)

# Replace the literal 'X' with NaN so the column can be cast to float.
attendance_numeric = df['attendance_category'].replace('X', np.nan).astype(float)
df['attendance_category'] = attendance_numeric

In [None]:
# Preview the first five rows (head's default) of the loaded frame.
df.head(n=5)

In [None]:
# Print pandas' summary of the frame (info()).
df.info()

In [None]:
# Display the column labels of df as they stand at this point.
df.columns

In [None]:
# Binary label: 1 when activity_within_12m_of_first_attend is at least 3, otherwise 0.
# (A NaN count compares False and is therefore labelled 0.)
meets_threshold = df["activity_within_12m_of_first_attend"] >= 3
df["frequent_attender"] = meets_threshold.astype("int64")

In [None]:
# Show the raw count next to the derived label for the first ten rows.
label_check_cols = ['activity_within_12m_of_first_attend', 'frequent_attender']
df[label_check_cols].head(10)

In [None]:
# Columns carried into the train/test split, in output order.
# 'activity_within_12m_of_first_attend' is not in this list; 'frequent_attender' is the last entry.
cols = [
    'nhs_number',
    'organisation_code_provider',
    'organisation_code_commissioner',
    'age_at_arrival',
    'lsoa_11',
    'index_of_multiple_deprivation',
    'index_of_multiple_deprivation_description',
    'stated_gender',
    'ethnic_category',
    'accommodation_status_desc',
    'lsoa_site_of_treatment_distance',
    'arrival_datetime',
    'arrival_mode_desc',
    'attendance_category',
    'departure_time_since_arrival',
    'treatment_function_code',
    'discharge_status_desc',
    'destination_desc',
    'acuity_desc',
    'acuity_code_approved',
    'long_term_condition_asthma_flag',
    'long_term_condition_cancer_flag',
    'long_term_condition_heart_failure_flag',
    'long_term_condition_diabetes_flag',
    'long_term_condition_renal_flag',
    'long_term_condition_copd_flag',
    'long_term_condition_dementia_flag',
    'long_term_condition_count_number',
    'gp_practice_code',
    'gp_practice',
    'patient_status',
    'care_home_status',
    'care_home_name',
    'living_alone',
    'palliative_care_flag',
    'palliative_care_description',
    'acutely_unwell_flag',
    'disability_speech_flag',
    'disability_hearing_flag',
    'disability_sight_flag',
    'disability_learning_disability_flag',
    'disability_count_number',
    'segmentation_bridges_to_health',
    'segmentation_bridges_to_health_description',
    'all_long_term_condition_count_number',
    'all_long_term_condition_count',
    'all_long_term_conditions',
    'patient_registration_status',
    'frequent_attender',
]

In [None]:
# 70/30 split of the selected columns, stratified on the label and seeded for repeatability.
train_df, test_df = train_test_split(
    df[cols],
    test_size=0.3,
    random_state=42,
    stratify=df['frequent_attender'],
)

In [None]:
# Mean of the 0/1 label in the training split, to 4 decimal places.
# Outer quotes are double so the f-string also parses on Python versions before 3.12.
print(f"{train_df['frequent_attender'].mean():.4f}")

In [None]:
# Mean of the 0/1 label in the test split, to 4 decimal places.
# Outer quotes are double so the f-string also parses on Python versions before 3.12.
print(f"{test_df['frequent_attender'].mean():.4f}")

In [None]:
# Display the column labels of the training split.
train_df.columns

In [None]:
# Write the training split to ../data, the directory test.csv is written to and
# the location this notebook later reads train.csv back from.
train_df.to_csv('../data/train.csv', index=False)

In [None]:
# Missing-value count per column of the full frame, smallest first.
df.isnull().sum().sort_values(ascending=True)

In [None]:
# Remove these three columns from test_df.
dropped_from_test = ['palliative_care_description', 'accommodation_status_desc', 'treatment_function_code']
test_df = test_df.drop(columns=dropped_from_test)

In [None]:
# Keep only the test rows that have no missing value in any column.
test_df = test_df.dropna(axis=0, how='any')

In [None]:
# Display (rows, columns) of test_df.
test_df.shape

In [None]:
# Write the test split to ../data/test.csv without the index column.
test_df.to_csv('../data/test.csv', index=False)

In [None]:
# Load ../data/train.csv and display it (presumably a check of the exported file — confirm).
pd.read_csv('../data/train.csv')