In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the raw dataset; the frames used by the later cells are derived from `full_df`.
dataset_file = 'JS_NCCH.csv'
full_df = pd.read_csv(dataset_file)

sns.set_style('whitegrid')

# Recode these column-specific codes as NaN so they are counted as missing
# below (presumably "not known" markers -- confirm against the data dictionary).
codes_to_blank = {
    'MAT_SMOKING_NC': 9,
    'LABOUR_ONSET_NC': 9,
    'CHILD_ETHNIC_GRP_NC': 'Z',
}
for column_name, code in codes_to_blank.items():
    full_df[column_name] = full_df[column_name].replace(code, np.nan)

# Percentage of missing values per column, restricted to columns that have
# at least one missing value and ordered from most to least missing.
percent_missing = full_df.isna().mean().mul(100)
missing_ratio = percent_missing[percent_missing.gt(0)].sort_values(ascending=False)

def visualise_missingness(missing_ratio):
    """Print the missingness percentages and draw them as a horizontal bar chart.

    Parameters
    ----------
    missing_ratio : pandas.Series-like
        Values are plotted on the x axis and the index on the y axis
        (the function reads ``.values`` and ``.index``).
    """
    print(missing_ratio)

    variable_names = missing_ratio.index
    percentages = missing_ratio.values

    plt.figure(figsize=(10, 6))
    sns.barplot(x=percentages, y=variable_names)
    plt.ylabel('Variable')
    plt.xlabel('Percentage missing')
    plt.title('Missingness Ratio per variable')
    plt.show()

visualise_missingness(missing_ratio)

# Columns treated as continuous and as categorical by the plotting helpers.
num_cols = ['BIRTH_WEIGHT_NC', 'GEST_AGE_NC', 'MAT_AGE_NC', 'APGAR_1_NC', 'APGAR_2_NC', 'PREV_STILLBIRTH_NC']
cat_cols = ['BIRTH_WEIGHT_CAT_NC', 'GEST_AGE_CAT_NC', 'CHILD_ETHNIC_GRP_NC', 'LHB_CD_BIRTH_NC', 'LABOUR_ONSET_NC', 'BREASTFEED_8_WKS_FLG_NC', 'MAT_SMOKING_NC', 'BIRTH_ORDER_NC', 'PREV_STILLBIRTH_NC']

# Keep rows whose GEST_AGE_NC lies in [15, 60] and whose BIRTH_WEIGHT_NC lies
# in [0.5, 11] (bounds inclusive). Rows where either value is NaN fail the
# comparison and are dropped as well. The result is rounded to 2 decimals.
gest_age_in_range = full_df['GEST_AGE_NC'].between(15, 60)
birth_weight_in_range = full_df['BIRTH_WEIGHT_NC'].between(0.5, 11)
full_df = full_df.loc[gest_age_in_range & birth_weight_in_range].round(2)

# Complete-case frame: no missing value in any numeric or categorical column.
df = full_df.dropna(subset=(num_cols + cat_cols))

In [None]:

def visualise_continuous_distributions():
    """Print summary statistics and plot a histogram for every column in ``num_cols``.

    For each column the non-missing values of the notebook-level ``full_df``
    are used to compute min, max, mean and standard deviation. The same
    values are drawn as a 30-bin histogram with a KDE curve, with dashed
    vertical lines at the mean and at mean +/- one standard deviation.
    A dict mapping column name to its mean/std text is printed at the end.
    """
    summary = {}
    for col in num_cols:
        # One series feeds both the statistics and the histogram, so the
        # reference lines always describe the data that is actually plotted.
        values = full_df[col].dropna()
        mean = values.mean()
        std = values.std()
        mini = values.min()
        maxi = values.max()
        print(f'Min: {mini}; Max: {maxi}')
        summary[col] = f'Mean: {mean}, with std of {std}'
        print(summary[col])

        plt.figure(figsize=(12, 4))
        sns.histplot(values, kde=True, bins=30, color='skyblue')
        plt.axvline(mean, color='r', linestyle='--', label=f'Mean: {mean:.2f}')
        plt.axvline(mean + std, color='g', linestyle='--', label=f'Mean + std: {mean + std:.2f}')
        plt.axvline(mean - std, color='g', linestyle='--', label=f'Mean - std: {mean - std:.2f}')
        plt.title(f'Distribution of {col}')
        # Without a legend the labels attached to the reference lines are never shown.
        plt.legend()
        plt.show()
    print(summary)

def visualise_categorical_counts():
    """Print value distributions and draw a count plot for every column in ``cat_cols``.

    Works on a copy of the notebook-level ``full_df``. For each column the
    percentage share of every value is stored in a summary dict, except for
    ``'LSOA_CD_BIRTH_NC'``, where raw counts above 100 are stored instead.
    The summary dict is printed once all plots have been drawn.
    """
    summary = {}
    new_df = full_df.copy()  # the recode below must not leak into full_df
    for col in cat_cols:
        if col == 'MAT_SMOKING_NC':
            # Treat the code 9 as missing so it is excluded from both the
            # percentages and the plot.
            new_df[col] = new_df[col].replace(9.0, np.nan)

        if col == 'LSOA_CD_BIRTH_NC':
            # Raw counts, keeping only values that occur more than 100 times.
            counts = new_df[col].value_counts()
            summary[col] = counts[counts > 100]
        else:
            summary[col] = new_df[col].value_counts(normalize=True) * 100

        plt.figure(figsize=(10, 4))
        # `subset` is given as a list: a bare string label is only accepted
        # by newer pandas releases.
        sns.countplot(y=col, data=new_df.dropna(subset=[col]))
        plt.title(f'Distribution of {col}')
        plt.show()
    print(summary)


def visualise_continuous_correlations():
    """Draw an annotated heatmap of the correlations between the ``num_cols`` columns.

    Only rows of the notebook-level ``full_df`` that have no missing value in
    any of the ``num_cols`` columns contribute to the correlation matrix.
    """
    complete_cases = full_df[num_cols].dropna(how='any')
    correlation_matrix = complete_cases.corr()

    plt.figure(figsize=(10, 4))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()

def visualise_cat_vs_cont(cat_var, cont_var):
    """Summarise and plot a continuous column split by a categorical column.

    Rows of the notebook-level ``full_df`` missing either column are dropped.
    The per-group mean, row count and share of rows are printed, followed by
    a violin plot of ``cont_var`` for each value of ``cat_var``.

    Parameters
    ----------
    cat_var : str
        Name of the grouping column.
    cont_var : str
        Name of the column whose distribution is shown.
    """
    complete_cases = full_df.dropna(subset=[cat_var, cont_var], how='any')
    n_rows = complete_cases[cat_var].count()

    per_group = (
        complete_cases.groupby(cat_var)[cont_var]
        .agg(['mean', 'count'])
        .assign(proportion=lambda stats: stats['count'] / n_rows)
    )
    print(per_group)

    plt.figure(figsize=(8, 16))
    sns.violinplot(x=cat_var, y=cont_var, data=complete_cases)
    plt.title(f'Distribution of {cont_var} by {cat_var}')
    plt.show()

def contingency_table(var1, var2):
    """Draw a heatmap of the cross-tabulated counts of two columns of ``full_df``.

    Rows missing either column are dropped before counting. ``var1`` values
    form the rows of the table and ``var2`` values form its columns.

    Parameters
    ----------
    var1 : str
        Column shown on the y axis.
    var2 : str
        Column shown on the x axis.
    """
    paired = full_df.dropna(subset=[var1, var2])
    counts = pd.crosstab(index=paired[var1], columns=paired[var2])

    plt.figure(figsize=(8, 16))
    sns.heatmap(counts, annot=True, cmap='viridis', fmt='g')
    plt.ylabel(var1)
    plt.xlabel(var2)
    plt.title('Contingency Heatmap')
    plt.show()



In [None]:

def visualise_clusters(X_pca, clusters):
    """Scatter the first two columns of ``X_pca``, coloured by cluster label.

    Parameters
    ----------
    X_pca : 2-D array
        Indexed as ``X_pca[:, 0]`` and ``X_pca[:, 1]``; any further columns
        are ignored.
    clusters : sequence
        Passed to seaborn as the ``hue`` of each point.
    """
    first_component = X_pca[:, 0]
    second_component = X_pca[:, 1]

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=first_component, y=second_component, hue=clusters, palette='viridis')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.title('Data points in PCA reduced space')
    plt.show()

def visualise_var_across_clusters(var, data):
    """Print the per-cluster mean of ``var`` and draw its distribution per cluster.

    Parameters
    ----------
    var : str
        Column of ``data`` to summarise and plot.
    data : pandas.DataFrame
        Frame containing ``var`` and a ``'cluster'`` column.
    """
    # Use the frame that was passed in rather than a notebook-level global,
    # so the printed means always describe the same data as the plot and the
    # function does not depend on cell execution order.
    means = data.groupby('cluster')[var].mean()
    print('Means for each cluster: ', means)
    plt.figure(figsize=(8, 6))
    sns.violinplot(x='cluster', y=var, data=data)
    plt.title(f'{var} distribution per cluster')
    plt.show()

def explained_variance(pca):
    """Plot cumulative explained variance against the number of components.

    Parameters
    ----------
    pca : object
        Anything exposing ``explained_variance_ratio_``; its cumulative sum
        is plotted against component counts starting at 1.
    """
    ratios = pca.explained_variance_ratio_
    cumulative = np.cumsum(ratios)
    component_counts = range(1, len(ratios) + 1)

    plt.figure(figsize=(8, 6))
    plt.plot(component_counts, cumulative)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.show()


In [None]:

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Settings for the PCA + k-means pipeline below.
RANDOM_STATE = 42      # fixed seed so cluster labels are the same on every run
N_PCA_COMPONENTS = 7
N_CLUSTERS = 5

target = ['MAT_SMOKING_NC', 'BIRTH_WEIGHT_NC', 'GEST_AGE_NC', 'MAT_AGE_NC', 'APGAR_1_NC', 'BIRTH_ORDER_NC', 'PREV_STILLBIRTH_NC', 'CHILD_ETHNIC_GRP_NC', 'CHILD_SEX_NC']

# Complete cases of the target columns, excluding rows that carry the codes
# 9 (MAT_SMOKING_NC / LABOUR_ONSET_NC) or 'Z' (CHILD_ETHNIC_GRP_NC).
df_cleaned = df[target].dropna().loc[df['MAT_SMOKING_NC'] != 9].loc[df['LABOUR_ONSET_NC'] != 9]
df_cleaned = df_cleaned[df_cleaned['CHILD_ETHNIC_GRP_NC'] != 'Z']

# Text columns are replaced by their integer category codes. Everything is
# then cast to float64 rather than int64: an integer cast would truncate
# fractional measurements before scaling (BIRTH_WEIGHT_NC is filtered against
# a 0.5 lower bound earlier, so it presumably carries fractional values).
df_cleaned = df_cleaned.apply(
    lambda col: col.astype('category').cat.codes if col.dtype == 'object' or col.dtype == 'string' else col
).astype('float64')

X = StandardScaler().fit_transform(df_cleaned)

# Full PCA first, only to inspect the cumulative explained-variance curve.
pca1 = PCA().fit(X)
explained_variance(pca1)

pca = PCA(n_components=N_PCA_COMPONENTS, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X)
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')

# k-means starts from random centroids; seeding it (and fixing n_init) keeps
# the cluster assignment reproducible under Restart & Run All.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(X_pca)

# Loadings of every original column on each retained principal component.
components = pd.DataFrame(pca.components_, columns=df_cleaned.columns)
plt.figure(figsize=(10, 6))
sns.heatmap(components, cmap='viridis', annot=True)
plt.title('PCA component loadings')
plt.show()

df_cleaned['cluster'] = clusters

for var in target:
    visualise_var_across_clusters(var, df_cleaned)