# SAT Machine Learning Hackathon Team $\Sigma \Omega$

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import train_test_split

# Opt in to pandas' future string-dtype inference before any data is read.
pd.options.future.infer_string = True

# Load the training set; the three listed columns are parsed as strings.
df = pd.read_csv(
    'data/train.csv',
    dtype=dict.fromkeys(
        [
            'attendance_category',
            'treatment_function_code',
            'palliative_care_description',
        ],
        'str',
    ),
)

# Discard the columns that are not carried forward into the analysis below.
df = df.drop(
    [
        'nhs_number',
        'organisation_code_provider',
        'organisation_code_commissioner',
        'lsoa_11',
        'index_of_multiple_deprivation_description',
        'accommodation_status_desc',
        'treatment_function_code',
        'acuity_code_approved',
        'gp_practice',
        # High-cardinality categorical variable. Taken out for now; to be
        # brought back for CatBoost.
        'gp_practice_code',
        'palliative_care_description',
        'care_home_name',
        # Leaks data, i.e. if someone has died, can they reattend?
        'patient_status',
        'all_long_term_conditions',
        'segmentation_bridges_to_health_description',
        'patient_registration_status',
        'all_long_term_condition_count',
        'attendance_category',
    ],
    axis='columns',
)
# Normalise stated_gender to string codes.
# Unknown is meant to be 'X'; 9 is "unable to classify as one or the other".
#
# With pd.options.future.infer_string enabled, astype(str) keeps missing values
# as missing instead of turning them into the text 'nan'. Replacing 'nan' alone
# would therefore leave them missing, and the dropna() below would silently
# discard every unknown-gender row. fillna('X') covers that case, and the
# replace('nan', 'X') is kept for pandas versions that stringify NaN.
df['stated_gender'] = (
    df['stated_gender']
    .astype(str)
    # Codes read as floats look like '1.0'; strip only a trailing '.0' so that
    # no other part of the value can be altered.
    .str.removesuffix('.0')
    .fillna('X')
    .replace('nan', 'X')
)

# Parse arrival timestamps (day-first format) and derive time-of-day features.
df['arrival_datetime'] = pd.to_datetime(df['arrival_datetime'], dayfirst=True)
df['arrival_time'] = df['arrival_datetime'].dt.time
# 1 when the arrival hour is before 08:00 or from 18:00 onwards, else 0.
df['arrival_outside_of_core_gp_hours'] = (
    df['arrival_datetime']
    .dt.hour
    .pipe(lambda hour: (hour < 8) | (hour >= 18))
    .astype(int)
)

# Drop every row that still has a missing value in any remaining column.
df = df.dropna()

In [None]:

# Age bands for the count plot. Intervals are closed on the left (right=False),
# so e.g. '18-29' covers 18 <= age < 30. The last edge is infinite so that the
# '80+' band really is open-ended; with a final edge of 100, ages of 100 or
# more would fall outside every bin and become NaN.
age_bins = [0, 18, 30, 45, 65, 80, float('inf')]
age_labels = ['0-17', '18-29', '30-44', '45-64', '65-79', '80+']
# NOTE(review): age_bin is stored on df, so it is also carried into X when the
# features are split off from df later on -- confirm that is intended.
df['age_bin'] = pd.cut(
    df['age_at_arrival'],
    bins=age_bins,
    labels=age_labels,
    right=False,
)

# Attendance counts per age band, coloured by the frequent_attender flag.
sns.countplot(data=df, x='age_bin', hue='frequent_attender', dodge=False)
plt.ylabel('Count')
plt.show()

In [None]:
import matplotlib.ticker as mticker

# Share of frequent / non-frequent attenders within each long-term-condition
# count: one row per count, one column per frequent_attender value.
fraction_df = (
    df.groupby('long_term_condition_count_number')['frequent_attender']
    .value_counts(normalize=True)
    .unstack()
)

# Stacked bars; DataFrame.plot returns the Axes it drew on, so use it directly.
ax = fraction_df.plot(
    kind='bar',
    stacked=True,
    figsize=(10, 6),
    color=["#0062FF", '#DD8452'],
)

# Show the normalised fractions (0-1) as whole-number percentages.
ax.yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1, decimals=0))
# A single legend call: a second one would simply replace the first.
# NOTE(review): the 'No'/'Yes' labels assume the columns are ordered
# negative-then-positive -- confirm against fraction_df.columns.
ax.legend(title='Frequent Attender', labels=['No', 'Yes'])

# Lay out last so the final tick labels and legend are taken into account.
plt.tight_layout()
plt.show()

In [None]:
# Histogram (50 bins) of departure_time_since_arrival, limited to values <= 600.
sns.histplot(
    x=df['departure_time_since_arrival'].loc[lambda s: s <= 600],
    bins=50,
)

In [None]:
import matplotlib.ticker as mticker

# Share of each frequent_attender value within every destination_desc group.
fraction_df2 = (
    df.groupby('destination_desc')['frequent_attender']
    .value_counts(normalize=True)
    .unstack()
)

# Horizontal stacked bars, one per destination.
ax = fraction_df2.plot(kind='barh', stacked=True, figsize=(10, 7))

# Render the normalised fractions (0-1) on the x axis as whole percentages.
ax.xaxis.set_major_formatter(mticker.PercentFormatter(xmax=1, decimals=0))
plt.show()

In [None]:
# Separate the features from the frequent_attender target.
X = df.drop('frequent_attender', axis='columns')
y = df['frequent_attender']
X_features = X.columns

# Hold out 30% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)
 

In [None]:
def is_over_70(row):
    """Return 1 if ``row['age_at_arrival']`` is greater than 69, otherwise 0."""
    return 1 if row['age_at_arrival'] > 69 else 0
 
# Vectorised form of applying is_over_70 to every row: 1 when age_at_arrival
# is greater than 69, else 0. A single column comparison avoids building a
# Series per row, as DataFrame.apply(axis=1) does.
X_train['over_70'] = (X_train['age_at_arrival'] > 69).astype(int)

In [None]:
# 1 when the arrival falls on a day whose dayofweek value is 5 or above
# (pandas numbers Monday as 0, so that is Saturday and Sunday), else 0.
X_train["is_weekend"] = X_train['arrival_datetime'].dt.dayofweek.ge(5).astype(int)

In [None]:
def is_4hr_wait(row):
    """Return 1 if ``row['departure_time_since_arrival']`` exceeds 240, otherwise 0."""
    return 1 if row['departure_time_since_arrival'] > 240 else 0
   
# Vectorised form of applying is_4hr_wait to every row: 1 when
# departure_time_since_arrival is greater than 240, else 0. A single column
# comparison avoids building a Series per row, as DataFrame.apply(axis=1) does.
X_train['four_hr_Wait'] = (X_train['departure_time_since_arrival'] > 240).astype(int)

In [None]:
# Append one integer indicator column per stated_gender value, named
# 'gender_code_<value>'; the original stated_gender column is kept.
X_train = pd.concat(
    [
        X_train,
        pd.get_dummies(
            data=X_train['stated_gender'],
            prefix='gender_code',
            dtype=int,
        ),
    ],
    axis='columns',
)