# Predicting Total Knee Arthroplasty (TKA)

In [10]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stat
import re

In [3]:
# Load the spreadsheet into a DataFrame for exploration.
# NOTE(review): the file is named "Testing Data Set- Clean.xlsx" although this
# cell was originally labelled as loading the training data — confirm which split this is.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_excel("/workspaces/ML-Engineering/Predicting_Total_Knee_Arthroplasty(TKA)/Data/Testing Data Set- Clean.xlsx")

### Data understanding

In [5]:
# Preview the first rows to see the column names and value formats
df.head()

Unnamed: 0,age,right,"Demographics.Gender (0=Female, 1=Male)",Procedure_TKR.BMI,Procedure_TKR.Weight_kg,Procedure_TKR.Height_cm,Femoral Model,Femur implanted,Tibial Model,Tibia Implanted
0,59.0,1,0.0,30.13,95.3,177.8,SN Legion Size 6,66.0,Genesis II size 5,74.0
1,42.0,0,1.0,32.49,100.2,173.2,SN Legion Size 7,70.0,Genesis II size 6,77.0
2,71.0,0,0.0,34.0,98.4,170.2,Stryker Triathlon Femoral 5,65.0,Stryker Triathlon Tibial 5,74.0
3,83.0,0,0.0,24.27,62.1,160.0,SN Legion Size 4,59.0,Genesis II size 2,64.0
4,56.0,1,1.0,49.51,161.0,180.3,Stryker Triathlon Femoral 8,75.0,Stryker Triathlon Tibial 7,80.0


In [6]:
# Summarise each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 513 entries, 0 to 512
Data columns (total 10 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   age                                     495 non-null    float64
 1   right                                   496 non-null    object 
 2   Demographics.Gender (0=Female, 1=Male)  496 non-null    float64
 3   Procedure_TKR.BMI                       490 non-null    float64
 4   Procedure_TKR.Weight_kg                 491 non-null    float64
 5   Procedure_TKR.Height_cm                 491 non-null    float64
 6   Femoral Model                           491 non-null    object 
 7   Femur implanted                         491 non-null    float64
 8   Tibial Model                            491 non-null    object 
 9   Tibia Implanted                         488 non-null    float64
dtypes: float64(7), object(3)
memory usage: 40.2+ KB


In [9]:
# List the column labels currently on df
df.columns

Index(['age', 'right', 'Demographics.Gender (0=Female, 1=Male)',
       'Procedure_TKR.BMI', 'Procedure_TKR.Weight_kg',
       'Procedure_TKR.Height_cm', 'Femoral Model', 'Femur implanted',
       'Tibial Model', 'Tibia Implanted'],
      dtype='object')

In [16]:
# Count the missing values in each column
df.isna().sum()

age                18
right              17
gender             17
bmi                23
weight_kg          22
height_cm          22
femoral_model      22
femur_implanted    22
tibial_model       22
tibia_implanted    25
dtype: int64

In [17]:
# Show every row that has at least one missing value
df[df.isna().any(axis="columns")]

Unnamed: 0,age,right,gender,bmi,weight_kg,height_cm,femoral_model,femur_implanted,tibial_model,tibia_implanted
37,,,,,,,,,,
111,,,,,,,,,,
122,,,,,,,,,,
161,,0.0,0.0,31.539,80.7,160.0,Zimmer NexGen CR-Flex GSF size D,56.4,Nexgen Tibia Size 4,66.0
162,,1.0,0.0,32.779,83.9,160.0,SN Legion size 5,63.0,Genesis II size 3,68.0
225,,,,,,,,,,
226,,,,,,,,,,
227,,,,,,,,,,
272,,,,,,,,,,
286,,,,,,,,,,


In [20]:
# Drop rows that are mostly empty: `thresh` is the minimum number of non-null
# values a row needs in order to be kept, here (number of columns - 5).
# NOTE: this mutates `df` in place, so outputs displayed by earlier cells no
# longer reflect its current contents.
df.dropna(thresh=(len(df.columns)-5), inplace=True)
# Re-display the rows that still contain at least one missing value
df[df.isnull().any(axis=1)]

Unnamed: 0,age,right,gender,bmi,weight_kg,height_cm,femoral_model,femur_implanted,tibial_model,tibia_implanted
161,,0,0.0,31.539,80.7,160.0,Zimmer NexGen CR-Flex GSF size D,56.4,Nexgen Tibia Size 4,66.0
162,,1,0.0,32.779,83.9,160.0,SN Legion size 5,63.0,Genesis II size 3,68.0
409,57.0,1,0.0,35.537,75.8,146.0,DJO EMPOWR size 3,54.0,EMPOWR 3-,
453,71.0,1,0.0,,62.2,152.4,Vanguard XP 62.5,62.5,67 cemented I-beam tibia,67.0
482,68.0,0,0.0,48.501,105.2,147.3,DJO EMPOWR 3,54.0,EMPOWR 3-,
487,59.0,1,0.0,39.542,117.9,172.7,DJO EMPOWR 7,64.0,EMPOWR size 7,


### Data wrangling

In [12]:
def _clean_column_name(col):
    """Return a lowercase, underscore-separated version of one column label."""
    # Excel headers can be numbers or dates; work on their text form so the
    # string operations below never fail on a non-string label.
    name = str(col)
    # Drop parenthetical notes such as " (0=Female, 1=Male)" *before* splitting
    # on '.', so a dot inside the parentheses cannot truncate the name.
    name = re.sub(r'\s*\(.*?\)', '', name)
    # Keep only the part after the last dot, e.g. "Procedure_TKR.BMI" -> "BMI".
    name = name.rsplit('.', 1)[-1]
    # Trim stray padding so it does not become leading/trailing underscores,
    # then standardise: spaces -> underscores, lowercase.
    return name.strip().replace(' ', '_').lower()


def wrangle(path):
    """Read an Excel file and standardise its column names.

    Each column label is cleaned by removing any parenthetical note, keeping
    only the text after the last '.', trimming surrounding whitespace,
    replacing spaces with underscores and lowercasing, e.g.
    "Demographics.Gender (0=Female, 1=Male)" -> "gender".

    Parameters
    ----------
    path : str or path-like
        Location of the Excel file; passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with cleaned column names. Row contents are not
        modified.

    Notes
    -----
    Distinct headers can map to the same cleaned name (for example two
    headers that share the text after their last dot); no de-duplication is
    performed here.
    """
    # read data
    df = pd.read_excel(path)

    # standardize column names
    df.columns = [_clean_column_name(col) for col in df.columns]

    return df
    

In [13]:
# Load the spreadsheet through wrangle() so the column names are standardised,
# then preview the result.
# NOTE(review): the result is named train_df but the file is
# "Testing Data Set- Clean.xlsx" — confirm which split this is.
path = "/workspaces/ML-Engineering/Predicting_Total_Knee_Arthroplasty(TKA)/Data/Testing Data Set- Clean.xlsx"
train_df = wrangle(path)
train_df.head()

Unnamed: 0,age,right,gender,bmi,weight_kg,height_cm,femoral_model,femur_implanted,tibial_model,tibia_implanted
0,59.0,1,0.0,30.13,95.3,177.8,SN Legion Size 6,66.0,Genesis II size 5,74.0
1,42.0,0,1.0,32.49,100.2,173.2,SN Legion Size 7,70.0,Genesis II size 6,77.0
2,71.0,0,0.0,34.0,98.4,170.2,Stryker Triathlon Femoral 5,65.0,Stryker Triathlon Tibial 5,74.0
3,83.0,0,0.0,24.27,62.1,160.0,SN Legion Size 4,59.0,Genesis II size 2,64.0
4,56.0,1,1.0,49.51,161.0,180.3,Stryker Triathlon Femoral 8,75.0,Stryker Triathlon Tibial 7,80.0
