In [232]:
# load required packages 

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os 
import seaborn as sns

#for maps
import folium
from folium.plugins import HeatMap

#for imputation
import sklearn
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the compiled metadata table and preview its first rows
metadata_csv = "public_dataset/metadata_compiled.csv"
metadata = pd.read_csv(metadata_csv)
metadata.head()

In [None]:
# EDA: age distribution, followed by category counts for the main categorical fields

age_values = metadata['age'].dropna()
sns.histplot(age_values, bins=30, kde=True, color='skyblue')
plt.title('Histogram of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')

# (column, palette) pairs; each count plot is drawn on its own 8x6 figure
count_plot_specs = [
    ('gender', 'viridis'),
    ('respiratory_condition', 'coolwarm'),
    ('fever_muscle_pain', 'coolwarm'),
    ('status', 'coolwarm'),
    ('cough_type_4', 'coolwarm'),
]

for column_name, palette_name in count_plot_specs:
    plt.figure(figsize=(8, 6))
    sns.countplot(x=column_name, data=metadata, palette=palette_name)





In [None]:
# reshape the physician annotations from wide format (columns suffixed _1.._4) to long format
## this code excludes individuals who don't have physician-annotated coughs
### individuals who have been assessed by more than one physician get one row per physician, identified by physician_id

def reshape_metadata(metadata):
    """Turn the wide physician annotations into one row per (recording, physician).

    For each input row and each physician slot 1-4, a record is emitted only
    when that slot's ``quality_<i>`` value is not missing. The record carries
    the recording-level fields unchanged, the slot's annotation fields with the
    ``_<i>`` suffix stripped, and the slot number as ``physician_id``.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Wide table holding the recording-level columns plus the ``_1`` .. ``_4``
        suffixed physician columns read below.

    Returns
    -------
    pandas.DataFrame
        Long-format table in which every remaining missing value has been
        replaced by the string 'unknown'. Empty when no slot is annotated.
    """
    # Columns copied as-is from the recording
    shared_fields = (
        'uuid', 'datetime', 'cough_detected', 'SNR', 'latitude', 'longitude',
        'age', 'gender', 'respiratory_condition', 'fever_muscle_pain', 'status',
    )
    # Columns that exist once per physician slot (stored as '<field>_<slot>')
    physician_fields = (
        'quality', 'cough_type', 'dyspnea', 'wheezing', 'stridor',
        'choking', 'congestion', 'nothing', 'diagnosis', 'severity',
    )

    records = []
    for _, participant in metadata.iterrows():
        for physician in (1, 2, 3, 4):
            # A slot without a quality rating counts as "not annotated"
            if pd.isna(participant[f'quality_{physician}']):
                continue

            record = {field: participant[field] for field in shared_fields}
            record.update(
                {field: participant[f'{field}_{physician}'] for field in physician_fields}
            )
            record['physician_id'] = physician
            records.append(record)

    long_form = pd.DataFrame(records)
    long_form.fillna('unknown', inplace=True)
    return long_form

# Build the long-format table and preview its first rows and its dimensions
metadata_new = reshape_metadata(metadata)
print(metadata_new.head(), metadata_new.shape, sep="\n")


In [None]:
# Coerce each column to the dtype the later steps expect

# Numeric fields: anything that cannot be parsed as a number becomes NaN
for numeric_col in ('age', 'latitude', 'longitude'):
    metadata_new[numeric_col] = pd.to_numeric(metadata_new[numeric_col], errors='coerce')

# Parse the timestamps (unparseable -> NaT) and keep only the calendar date
parsed_datetime = pd.to_datetime(metadata_new['datetime'], errors='coerce')
metadata_new['datetime'] = parsed_datetime.dt.date

# The physician id and the per-physician flag columns are held as generic objects
for object_col in ('physician_id', 'dyspnea', 'wheezing', 'stridor',
                   'choking', 'congestion', 'nothing'):
    metadata_new[object_col] = metadata_new[object_col].astype('object')

metadata_new.dtypes


In [None]:
# Inspect the table again: dimensions first, then the head to check the date format
n_rows, n_cols = metadata_new.shape
print((n_rows, n_cols))
metadata_new.head()


In [None]:
# Heat map of recording locations, restricted to rows that have both latitude and
# longitude. Purely descriptive; not used in the analysis.

metadata_map = metadata_new.dropna(subset=['latitude', 'longitude'])

# Centre the view on the mean coordinate of the located recordings
map_center = [metadata_map['latitude'].mean(), metadata_map['longitude'].mean()]

# Named cough_map rather than `map` so the Python builtin map() is not shadowed
# for the rest of the notebook.
cough_map = folium.Map(location=map_center, zoom_start=3)

heat_data = metadata_map[['latitude', 'longitude']].values.tolist()
HeatMap(heat_data, radius=10, blur=15, max_zoom=1).add_to(cough_map)

# End the cell on the map object so the notebook renders the map itself
# instead of the value returned by add_to().
cough_map

In [None]:
# Missing-data analysis. Longitude and latitude are removed before the analysis, so
# they do not need imputing; age will be imputed.

missing_percentage = metadata_new.isnull().mean() * 100
missing_sorted = missing_percentage.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 6))
missing_sorted.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title("Missing Values Percentage by Column")
ax.set_ylabel("Percentage of Missing Data")
plt.show()


In [None]:
# Recode variables ahead of one-hot encoding and imputation.

## Replace the 'unknown' placeholder with NaN so these values are counted as missing
## and can be imputed. 'nothing' is included with the other physician flag columns:
## it is later recoded as a 0/1 flag, and a leftover 'unknown' string would not be numeric.
unknown_to_nan_cols = [
    'respiratory_condition', 'fever_muscle_pain', 'gender', 'cough_type',
    'status', 'quality', 'dyspnea', 'wheezing', 'stridor', 'choking',
    'congestion', 'nothing', 'diagnosis', 'severity', 'physician_id',
]

for col in unknown_to_nan_cols:
    metadata_new[col] = metadata_new[col].replace('unknown', np.nan)

# Redo the missingness plot now that the placeholders are real NaNs
missing_percentage = metadata_new.isnull().mean() * 100

plt.figure(figsize=(12, 6))
missing_percentage.sort_values(ascending=False).plot(kind='bar', color='skyblue')
plt.title("Missing Values Percentage by Column")
plt.ylabel("Percentage of Missing Data")
plt.show()

In [None]:
# Variable recoding continued

# Gender: remove participants recorded as 'other', then recode male = 0 / female = 1
# as a nullable integer column.
# The filter must run before the recode: once the labels are replaced by integers
# there is no 'other' string left to match. isin() is used so that rows with a
# missing gender are kept (they are imputed later). The index is reset so that
# later column-wise concatenations against freshly built frames stay row-aligned.
metadata_new = metadata_new[~metadata_new['gender'].isin(['other'])].reset_index(drop=True)
metadata_new['gender'] = metadata_new['gender'].replace({'male': 0, 'female': 1}).astype('Int64')

# Rename the 'nothing' column
metadata_new = metadata_new.rename(columns={'nothing': 'nothing_specific'})

# True/False variables become 1/0
binary_columns = ['respiratory_condition', 'fever_muscle_pain', 'dyspnea', 'wheezing',
                  'stridor', 'choking', 'congestion', 'nothing_specific']

for col in binary_columns:
    metadata_new[col] = metadata_new[col].replace({False: 0, True: 1})

# Convert cough detected, respiratory condition and fever/muscle pain to nullable integers.
# Columns are addressed by their exact names, so a misspelt name raises a KeyError
# instead of being skipped silently.
# NOTE(review): astype('Int64') raises if a column holds non-integer floats --
# confirm that cough_detected only contains whole numbers.
columns_convert = ['cough_detected', 'respiratory_condition', 'fever_muscle_pain']

for col in columns_convert:
    metadata_new[col] = metadata_new[col].astype('Int64')

print(metadata_new.dtypes)
print(metadata_new.head())



In [None]:
# One-hot encode the multi-level categorical variables

encode_cols = ['physician_id', 'severity', 'diagnosis', 'status', 'quality', 'cough_type']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_values = encoder.fit_transform(metadata_new[encode_cols])

# Give the encoded frame metadata_new's index: concat(axis=1) aligns on index labels,
# so a fresh 0..n-1 index would pair the wrong rows (and add NaN rows) whenever
# metadata_new's index is not a plain range.
encoded_df = pd.DataFrame(
    encoded_values,
    columns=encoder.get_feature_names_out(encode_cols),
    index=metadata_new.index,
)
metadata_new_encoded = pd.concat([metadata_new, encoded_df], axis=1)

# Remove the original (un-encoded) columns from the dataset
metadata_new_encoded = metadata_new_encoded.drop(columns=encode_cols)

# Convert float variables to int where the values allow it. No errors='ignore':
# that option is deprecated in pandas, and a value that cannot be parsed should
# surface here rather than inside the imputer.
cols_to_convert = ['cough_detected', 'respiratory_condition', 'fever_muscle_pain']
metadata_new_encoded[cols_to_convert] = metadata_new_encoded[cols_to_convert].apply(
    pd.to_numeric, downcast='integer'
)

# Remove longitude and latitude before imputing, as they are not used in the analysis.
# Dropped by name so a change in column order cannot remove the wrong columns.
metadata_new_encoded = metadata_new_encoded.drop(columns=['latitude', 'longitude'])

print(metadata_new_encoded.dtypes)
print(metadata_new_encoded.head())

# Plot missingness to see if it worked.
# NOTE(review): scikit-learn's OneHotEncoder treats NaN as a category of its own, so a
# missing label is expected to appear as a 1 in a '<column>_nan' indicator column rather
# than as an all-zero row; the indicator columns then contain no NaN for the imputer to
# fill. Confirm this is the intended handling of missing labels.
missing_percentage = metadata_new_encoded.isnull().mean() * 100

plt.figure(figsize=(12, 6))
missing_percentage.sort_values(ascending=False).plot(kind='bar', color='skyblue')
plt.title("Missing Values Percentage by Column")
plt.ylabel("Percentage of Missing Data")
plt.show()




In [None]:
# Standardise the numeric variables, then fill the remaining gaps with KNN imputation
numeric_cols = ['age', 'SNR']

scaler = StandardScaler()
metadata_new_encoded[numeric_cols] = scaler.fit_transform(metadata_new_encoded[numeric_cols])

# Make sure all columns have been converted to numeric
print(metadata_new_encoded.dtypes)

# Set the non-numeric identifier columns aside; they are added back after imputation
id_cols = ['uuid', 'datetime']
uuid_and_datetime = metadata_new_encoded[id_cols]
metadata_without_uuid_datetime = metadata_new_encoded.drop(columns=id_cols)

# NOTE(review): KNNImputer fills a gap with the mean of the neighbours' values, so
# imputed 0/1 columns can end up fractional -- round afterwards if strict binary
# values are needed.
knn_imputer = KNNImputer(n_neighbors=5)
imputed_data = knn_imputer.fit_transform(metadata_without_uuid_datetime)

# Reuse the input's index: concat(axis=1) aligns on index labels, so the imputed
# values must carry the same labels as the identifier columns they are joined to.
imputed_df = pd.DataFrame(
    imputed_data,
    columns=metadata_without_uuid_datetime.columns,
    index=metadata_without_uuid_datetime.index,
)
metadata_preprocessed = pd.concat([uuid_and_datetime, imputed_df], axis=1)

print(metadata_preprocessed.head())



In [None]:
# Save the preprocessed metadata, indexed by uuid
print(metadata_preprocessed.shape)

# Only move uuid into the index when it is not there already, so this cell can be
# re-run without a KeyError on the (by then missing) 'uuid' column.
if metadata_preprocessed.index.name != 'uuid':
    metadata_preprocessed = metadata_preprocessed.set_index('uuid')

# NOTE(review): absolute, machine-specific output path -- consider a path relative
# to the notebook (as used for the input CSV) so others can run this cell.
metadata_preprocessed.to_csv('/Users/danrose/MSc/ML/ML_CW/metadata_preprocessed.csv', index=True)



## DO EXPLORATORY DATA ANALYSIS HERE

