In [None]:
# Loading libraries and packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the ACLR export into a DataFrame and preview its first rows
csv_path = 'aclr.csv'
df = pd.read_csv(csv_path)
df.head()

# DATA CLEANING

In [None]:
# Columns in which every value is missing: they hold no data and are candidates for dropping
fully_empty = df.isnull().all()
df.columns[fully_empty]

In [None]:
# Drop two columns by name (presumably the all-null columns reported by the
# previous check; confirm against that cell's output).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['redcap_repeat_instrument', 'tss_tegner'], errors='ignore')

In [None]:
# Summarise each column's dtype and non-null count
df.info()

In [None]:
# Number of missing entries in each column
df.isna().sum()

In [None]:
# Share of missing entries per column, as a percentage of all rows
n_rows = len(df)
missing_percent = df.isnull().sum() / n_rows * 100
# Show the ten columns with the LOWEST missing percentage.
# NOTE(review): if the goal is to find the most incomplete columns, this
# should probably be nlargest(10) - confirm intent.
print(missing_percent.nsmallest(10))

# Inspecting unique values to decide which NaNs should be relabelled as "Not Reported"

In [None]:
# Print the distinct values of the sex, age-group and graft columns, one array per column
columns_to_inspect = (
    "sex_dashboard",
    'visit_sex',
    'age_group_dashboard_use',
    'graft_dashboard2',
    'visit_graft',
)
for column_name in columns_to_inspect:
    print(df[column_name].unique())

In [None]:
# Distinct values found in the redcap_event_name column
event_names = df['redcap_event_name'].unique()
print(event_names)

In [None]:
# Distinct values found in the strength_testing_complete column
strength_status_values = df['strength_testing_complete'].unique()
print(strength_status_values)

In [None]:
# Distinct values of the two meniscus columns, printed one after the other
for meniscus_column in ('med_meniscus', 'lat_meniscus'):
    print(df[meniscus_column].unique())

1. Gender and age groups: impute the missing values (NaN) as "Not Reported".

In [None]:
# Replace missing age-group / sex entries with an explicit 'Not Reported' label
# so they show up as their own category instead of being silently dropped.
not_reported_columns = ['age_group_dashboard_use', 'sex_dashboard', 'visit_sex']
for column_name in not_reported_columns:
    df[column_name] = df[column_name].fillna('Not Reported')

In [None]:
# Visual aid: horizontal bar chart of how many values are missing in each column
missing_vals = df.isnull().sum()
# keep only columns that actually have gaps, ordered from fewest to most missing
missing_vals = missing_vals[missing_vals > 0].sort_values()

missing_fig, missing_ax = plt.subplots(figsize=(14, 10))
# horizontal bars keep the column names readable
missing_vals.plot(kind='barh', color='lightcoral', ax=missing_ax)
missing_ax.set_title('Missing Values Visualized', fontsize=20)  # large title on purpose
missing_ax.set_xlabel('# of Missing Values ', fontsize=10)
missing_ax.set_ylabel('Columns', fontsize=10)
plt.setp(missing_ax.get_xticklabels(), fontsize=8)
plt.setp(missing_ax.get_yticklabels(), fontsize=8)
# flip the y-axis so the column with the fewest missing values ends up at the top
missing_ax.invert_yaxis()
plt.show()

In [None]:
# Horizontal boxplots of the DataFrame's columns on one shared axis:
# a quick way to see each distribution and spot outliers.
box_fig, box_ax = plt.subplots(figsize=(12, 12))
sns.boxplot(data=df, orient="h", ax=box_ax)
box_ax.set_title("Outlier present")
plt.show()

In [None]:
# Split the columns by dtype: object (string) columns vs. float64/int64 numeric columns
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Impute categorical columns with their most frequent value.
# mode() returns an empty frame when there are no object columns (or none of
# them has any value), and .iloc[0] would then raise IndexError - so guard it.
categorical_modes = df[categorical_columns].mode()
if not categorical_modes.empty:
    df[categorical_columns] = df[categorical_columns].fillna(categorical_modes.iloc[0])

# Impute numeric columns with their median.
# NOTE(review): every later histogram is drawn on this imputed data, so expect
# an artificial spike at each column's median wherever values were missing.
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].median())

In [None]:
# Re-count missing entries per column after the imputation step
df.isna().sum()

In [None]:
# Count of records per graft type, with bars split by age group
graft_fig, graft_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='graft_dashboard2', hue='age_group_dashboard_use', ax=graft_ax)
graft_ax.set_title("Distribution by Age Group")
plt.show()

In [None]:
# Count of records per age group, with bars split by sex (visit_sex)
age_fig, age_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='age_group_dashboard_use', hue='visit_sex', ax=age_ax)
age_ax.set_title("Distribution of Age by Sex")
plt.show()

In [None]:
# Distinct values of tss_dashboard (time since surgery)
tss_values = df['tss_dashboard'].unique()
print(tss_values)

In [None]:
# Stacked histogram of KOOS pain scores, coloured by time since surgery.
plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='koos_pain', hue='tss_dashboard', multiple='stack')  # or 'dodge'
plt.title("Distribution of KOOS Pain Scores by Time Since Surgery")
plt.xlabel("KOOS Pain Score")
plt.ylabel("Count")
# seaborn already draws a legend for the hue levels. The previous
# plt.legend(df['tss_dashboard']) call replaced it with labels taken row by row
# from the column, so the colours were mislabelled; it has been removed.
plt.show()


In [None]:
# Stacked histograms of the strength-testing measures, coloured by time since
# surgery (tss_dashboard), laid out on a 3x3 grid.
strength_columns = [
    'acl_sh', 'acl_th', 'acl_ch',
    'acl_ext_mvic_90', 'acl_ext_mvic_60', 'acl_flex_mvic_60',
    'acl_ext_isok_60', 'acl_flex_isok_60',
]

fig, axs = plt.subplots(3, 3, figsize=(20, 15))
panels = axs.ravel()  # row-major order: [0,0], [0,1], [0,2], [1,0], ...

last_index = len(strength_columns) - 1
for index, column in enumerate(strength_columns):
    # Only the final panel carries the hue legend; the hue mapping is the same
    # in every panel, so one legend is enough.
    sns.histplot(
        data=df,
        x=column,
        hue='tss_dashboard',
        multiple='stack',
        ax=panels[index],
        legend=(index == last_index),
    )

# Title kept on the first panel only, as in the original cell
axs[0, 0].set_title("ACL Strength")

# Eight measures on a nine-slot grid: hide the unused axes so it does not
# render as an empty frame.
for unused_ax in panels[len(strength_columns):]:
    unused_ax.set_visible(False)

plt.show()