<a href="https://colab.research.google.com/github/LBncl/AmazonMLInterviewQuestion/blob/main/NAFLD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NAFLD PROJECT

## Imports

In [286]:
# Imports
import missingno as msno
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Data Cleaning

In [350]:
# Load the master spreadsheet into a DataFrame
df_raw = pd.read_excel('master_with_nordic_and_multiBM.xlsx')

# Warn when the sheet produced no rows or no columns
if 0 in df_raw.shape:
    print('DataFrame is empty!')

In [433]:
# Columns kept for the analysis: 35 candidate features, then the outcome
# ('response_3b') and finally the event type used to pick baseline visits.
selected_columns = [
    'CPH_EV_AGE_CALC',
    'TBL.PATIENT.INFO..PI_BL_GENDER',
    'CPH_EV_CI_BMI_CALC',
    'TBL.ALL.EVENTS..AE_SF_ALCO_XS',
    'insulin_resistance',
    'hypertensive',
    'waist_to_hip_ratio',
    'idf_metabolic_syndrome',
    'eGFR',
    'dyslipidaemia',
    'fibroscan_stiffness_reliable',
    'TBL.ALL.EVENTS..AE_BR_ALT_iuL',
    'TBL.ALL.EVENTS..AE_BR_AST_iuL',
    'TBL.ALL.EVENTS..AE_BR_GGT_iuL',
    'TBL.ALL.EVENTS..AE_BR_FERR_ugL',
    'TBL.ALL.EVENTS..AE_BR_PLT_109L',
    'TBL.ALL.EVENTS..AE_BR_CREAT_umolL_CALC',
    'TBL.ALL.EVENTS..AE_BR_STG_mmolL_CALC',
    'TBL.ALL.EVENTS..AE_BR_ALBU_gL_CALC',
    'TBL.ALL.EVENTS..AE_BR_BILI_umolL_CALC',
    'TBL.ALL.EVENTS..AE_BR_IGA',
    'TBL.ALL.EVENTS..AE_CD_OSA',
    'LIT_NB_CK18_M30',
    'LIT_NB_CK18_M65',
    'LIT_NB_PRO_C3',
    'LIT_NB_PRO_C6',
    'LIT_NB_ELF',
    'FIB4',
    'NFS',
    'APRI',
    'ADAPT',
    'FIBC3',
    'ABC3D',
    'BARD',
    'AST_ALT_Ratio',
    'response_3b',
    'CPH_EV_EVENT_TYPE',
]

# Subset data frame (labels missing from df_raw are silently skipped by filter)
main_df = df_raw.filter(items=selected_columns)

In [None]:
# Summary statistics for the selected columns
main_df.describe()

In [None]:
# Column dtypes of the selected subset
main_df.dtypes

In [None]:
# missingno bar chart of main_df (used here to inspect how complete each column is)
msno.bar(main_df)

In [434]:
# Drop exact duplicate rows, reporting the frame shape before and after
shape_before = main_df.shape
print(shape_before)
main_df = main_df.drop_duplicates()
shape_after = main_df.shape
print(shape_after)

(14236, 37)
(13772, 37)


In [435]:
# Convert binary (0/1) indicator columns to boolean.
# Work on a copy so the conversions below do not also rewrite main_df.
features_raw = main_df.copy()

# astype(bool) maps NaN to True, so rows without a response are removed before
# the cast; otherwise a missing outcome would silently become a positive label.
features_raw = features_raw.dropna(subset=['response_3b'])

# Only the first 36 columns are candidates; the trailing event-type column is excluded.
for column in features_raw.columns[:36]:
  values = features_raw[column]
  # Columns with missing entries stay numeric so that NaN survives until the
  # imputation step instead of being cast to True.
  if values.isna().any():
    continue
  # Require exactly the two values 0 and 1: a continuous column that merely
  # spans [0, 1] is not an indicator.
  if values.nunique() == 2 and values.isin([0, 1]).all():
    features_raw[column] = values.astype(bool)

In [None]:
# Column dtypes of features_raw at this point
features_raw.dtypes

In [None]:
# Display features_raw
features_raw

In [442]:
# Keep only the rows whose event type is 'Baseline'
print(main_df.shape)
event_type = features_raw['CPH_EV_EVENT_TYPE'].astype(str)
features_raw['CPH_EV_EVENT_TYPE'] = event_type
features_raw = features_raw[event_type == 'Baseline']
print(features_raw.shape)

(13772, 37)
(8991, 37)


In [None]:
# Display features_raw after the baseline filter
features_raw

In [443]:
# Keep the first 36 columns, which drops the trailing event-type column
kept_columns = features_raw.columns[:36]
features_raw = features_raw[kept_columns]

In [444]:
# Remove rows that have no response value, reporting shapes and the missing count
print(features_raw.shape)
response_missing = features_raw['response_3b'].isna()
print(response_missing.sum())
features_raw = features_raw[~response_missing]
print(features_raw.shape)

(8991, 36)
0
(8991, 36)


In [445]:
# Take an independent copy: features_raw is a filtered slice, and later cells
# overwrite columns of features_baseline in place.
features_baseline = features_raw.copy()

In [None]:
# One box plot per column, arranged on a 9x4 grid of subplots
features_baseline.plot(kind="box", subplots=True, layout=(9,4), figsize=(50,50))

In [446]:
# Column dtypes of the baseline frame
features_baseline.dtypes

CPH_EV_AGE_CALC                           float64
TBL.PATIENT.INFO..PI_BL_GENDER               bool
CPH_EV_CI_BMI_CALC                        float64
TBL.ALL.EVENTS..AE_SF_ALCO_XS                bool
insulin_resistance                           bool
hypertensive                                 bool
waist_to_hip_ratio                        float64
idf_metabolic_syndrome                       bool
eGFR                                      float64
dyslipidaemia                                bool
fibroscan_stiffness_reliable              float64
TBL.ALL.EVENTS..AE_BR_ALT_iuL             float64
TBL.ALL.EVENTS..AE_BR_AST_iuL             float64
TBL.ALL.EVENTS..AE_BR_GGT_iuL             float64
TBL.ALL.EVENTS..AE_BR_FERR_ugL            float64
TBL.ALL.EVENTS..AE_BR_PLT_109L            float64
TBL.ALL.EVENTS..AE_BR_CREAT_umolL_CALC    float64
TBL.ALL.EVENTS..AE_BR_STG_mmolL_CALC      float64
TBL.ALL.EVENTS..AE_BR_ALBU_gL_CALC        float64
TBL.ALL.EVENTS..AE_BR_BILI_umolL_CALC     float64


In [447]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Select the float64 columns; other dtypes (e.g. bool) are left unscaled
cols_normalise = features_baseline.select_dtypes(include=[np.float64])

# Standardise the selected columns and write them back under the same labels
scale = StandardScaler()
scaled_values = scale.fit_transform(cols_normalise)
features_baseline[cols_normalise.columns] = scaled_values

In [None]:
# Display the baseline frame after scaling
features_baseline

## Data Imputation Strategies

In [313]:
# Imports for imputation functions
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression

In [15]:
# Single imputation with mean to replace not a number (NaNs)
def nan2mean(fdf):
    """Return a new frame in which each NaN is replaced by its column mean.

    Parameters
    ----------
    fdf : pandas.DataFrame
        Frame to impute; missing entries are expected as ``np.nan``.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the column labels and row index of ``fdf``.
    """
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed = imp.fit_transform(fdf)
    # Keep the caller's index: a fresh RangeIndex would misalign the result with
    # the source frame whenever the two are later combined by label.
    return pd.DataFrame(imputed, columns=list(fdf.columns), index=fdf.index)

In [16]:
# Single imputation with median to replace not a number (NaNs)
def nan2median(fdf):
    """Return a new frame in which each NaN is replaced by its column median.

    Parameters
    ----------
    fdf : pandas.DataFrame
        Frame to impute; missing entries are expected as ``np.nan``.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the column labels and row index of ``fdf``.
    """
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imputed = imp.fit_transform(fdf)
    # Keep the caller's index: a fresh RangeIndex would misalign the result with
    # the source frame whenever the two are later combined by label.
    return pd.DataFrame(imputed, columns=list(fdf.columns), index=fdf.index)

In [17]:
# Single imputation with most_frequent to replace not a number (NaNs)
def nan2most_frequent(fdf):
    """Return a new frame in which each NaN is replaced by its column mode.

    Parameters
    ----------
    fdf : pandas.DataFrame
        Frame to impute; missing entries are expected as ``np.nan``.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the column labels and row index of ``fdf``.
        The caller in this notebook converts the result back with
        ``pd.to_numeric``.
    """
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imputed = imp.fit_transform(fdf)
    # Keep the caller's index: a fresh RangeIndex would misalign the result with
    # the source frame whenever the two are later combined by label.
    return pd.DataFrame(imputed, columns=list(fdf.columns), index=fdf.index)

In [18]:
# Imputation by chained equations, using linear regression as the estimator
def nan2mice(fdf):
    """Return a new frame with NaNs filled by scikit-learn's IterativeImputer.

    Each column with missing values is regressed on the others with a
    ``LinearRegression`` estimator, visiting columns left to right
    (``imputation_order='roman'``) for at most 50 rounds. ``random_state=0``
    makes the result repeatable.

    Parameters
    ----------
    fdf : pandas.DataFrame
        Frame to impute; missing entries are expected as ``np.nan``.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the column labels and row index of ``fdf``.
    """
    lr = LinearRegression()
    imp = IterativeImputer(estimator=lr, missing_values=np.nan, max_iter=50,
                           imputation_order='roman', random_state=0)
    imputed = imp.fit_transform(fdf)
    # Keep the caller's index: a fresh RangeIndex would misalign the result with
    # the source frame whenever the two are later combined by label.
    return pd.DataFrame(imputed, columns=list(fdf.columns), index=fdf.index)

In [19]:
# Imputation by K nearest neighbours
def nan2knn(fdf):
    """Return a new frame with NaNs filled from the 2 nearest neighbours.

    Neighbours are weighted by inverse distance (``weights="distance"``).

    Parameters
    ----------
    fdf : pandas.DataFrame
        Frame to impute; missing entries are expected as ``np.nan``.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the column labels and row index of ``fdf``.
    """
    imp = KNNImputer(n_neighbors=2, weights="distance")
    imputed = imp.fit_transform(fdf)
    # Keep the caller's index: a fresh RangeIndex would misalign the result with
    # the source frame whenever the two are later combined by label.
    return pd.DataFrame(imputed, columns=list(fdf.columns), index=fdf.index)

## Model evaluation

In [449]:
# Build one imputed copy of the baseline frame per strategy
# (evaluated in the order KNN, mean, median, MICE)
featuresKNN, featuresMean, featuresMedian, featuresMICE = (
    impute(features_baseline)
    for impute in (nan2knn, nan2mean, nan2median, nan2mice)
)
# The most-frequent result is converted back to numeric dtypes column by column
featuresMostFrq = nan2most_frequent(features_baseline).apply(pd.to_numeric)

# Parallel lists: the imputed frames and the labels used when reporting on them
imputed_features_names = ['featuresKNN','featuresMean','featuresMedian','featuresMICE','featuresMostFrq']
imputed_features = [featuresKNN, featuresMean, featuresMedian, featuresMICE, featuresMostFrq]



In [None]:
# Display the KNN-imputed frame
featuresKNN

In [None]:
column_name = "CPH_EV_AGE_CALC"

d = {'raw': features_baseline[column_name],
     'KNN Imputation': featuresKNN[column_name],
     'Mean Imputation': featuresMean[column_name],
     'Median Imputation': featuresMedian[column_name],
     'MICE Imputation': featuresMICE[column_name],
     'MostFrq Imputation': featuresMostFrq[column_name]}
# Re-index every series positionally before combining them. The raw frame and
# the imputed frames may carry different row labels, and the DataFrame
# constructor aligns on labels, which would pad the columns with spurious NaNs.
d = {label: series.reset_index(drop=True) for label, series in d.items()}
fdata = pd.DataFrame(data=d)

# (column in fdata, panel title, line colour) in left-to-right panel order;
# None keeps the default colour for the raw data.
panels = [
    ('raw', 'Raw', None),
    ('Mean Imputation', 'Mean Imputed', 'red'),
    ('Median Imputation', 'Median Imputed', 'red'),
    ('MICE Imputation', 'MICE Imputed', 'red'),
    ('MostFrq Imputation', 'MostFrq Imputed', 'red'),
    ('KNN Imputation', 'KNN Imputed', 'red'),
]

# One density plot per panel
fig, axes = plt.subplots(1, len(panels))
fig.suptitle('Raw vs Imputed Density Plots')
for ax, (key, title, colour) in zip(axes, panels):
    style = {} if colour is None else {'color': colour}
    fdata[key].plot.density(ax=ax, figsize=(15, 7), **style)
    ax.set_title(title)
    ax.set_xlabel(column_name)

## Outlier Identification

# Feature Selection

In [None]:
pip install featurewiz

In [None]:
# automatic feature selection by using featurewiz package
from featurewiz import featurewiz
# NOTE(review): the frames built above carry the outcome in a column named
# 'response_3b'; no visible cell adds a 'response' column before this point -- confirm.
target = 'response'

# Run on the KNN-imputed frame; of the two returned values only
# features_selected is used by the visible cells below.
features_selected, train = featurewiz(featuresKNN, target, corr_limit=0.7, verbose=2, sep=",", header=0,test_data="", feature_engg="", category_encoders="")

In [None]:
from sklearn.model_selection import train_test_split

# function to generate test/train split
def train_test (fdf):
  """Split a frame into train and test parts (70/30, fixed seed).

  The first 35 columns are used as features and the last column as the target.
  Returns the tuple (X_train, X_test, y_train, y_test).
  """
  features = fdf.iloc[:, 0:35]
  target = fdf.iloc[:, -1]
  parts = train_test_split(features, target, test_size=0.30, random_state=0)
  return tuple(parts)

In [None]:
# Filter all data frames based upon best features
featuresKNN = featuresKNN[features_selected]
featuresMean = featuresMean[features_selected]
featuresMedian = featuresMedian[features_selected]
featuresMICE = featuresMICE[features_selected]
featuresMostFrq = featuresMostFrq[features_selected]

# Rebuild the list so that it refers to the filtered frames
imputed_features = [featuresKNN,featuresMean,featuresMedian,featuresMICE,featuresMostFrq]

# Add response variable to all imputed datasets
# NOTE(review): `response` is not assigned in any cell visible above this one; on a
# fresh kernel this loop would raise NameError -- confirm where it is meant to come from.
for i in range(len(imputed_features)):
  df = imputed_features[i]
  df['response'] = response

In [None]:
# Display the KNN-imputed frame after feature filtering
featuresKNN

# SMOTE

In [132]:
# check version number of the installed imbalanced-learn package
import imblearn
print(imblearn.__version__)

0.8.1


In [133]:
import collections
# Print the number of rows per response class for each imputed data set
for df in imputed_features:
  counter = collections.Counter()
  counter.update(df['response'])
  print(counter)

Counter({1.0: 3403, 0.0: 1305})
Counter({1.0: 3403, 0.0: 1305})
Counter({1.0: 3403, 0.0: 1305})
Counter({1.0: 3403, 0.0: 1305})
Counter({1.0: 3403, 0.0: 1305})


In [40]:
# transform the dataset
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

# Over-sample with SMOTE, then randomly under-sample. Both steps are stochastic,
# so they are seeded to make the resampled data repeatable between runs.
over = SMOTE(random_state=0)
under = RandomUnderSampler(random_state=0)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Features are the first 35 columns, the target is the last column
X , y = pipeline.fit_resample(featuresMICE.iloc[:, 0:35] , featuresMICE.iloc[: , -1])

In [41]:
# summarize the new class distribution (rows per class in the resampled target y)
counter = collections.Counter(y)
print(counter)

Counter({0.0: 3403, 1.0: 3403})


### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler

# Logistic Regression model
# Implement the model
logreg = LogisticRegression()

# hyperparameters, penalty values chosen to work with all solvers
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# SMOTE over-sampling followed by random under-sampling, both seeded.
# The samplers are chained with the classifier in one imblearn Pipeline so that,
# during cross-validation, only the training part of each fold is resampled.
# Resampling the whole training set up front would place synthetic samples in
# the validation folds and inflate the scores.
over = SMOTE(sampling_strategy=0.4, random_state=1)
under = RandomUnderSampler(sampling_strategy=1, random_state=1)
steps = [('o', over), ('u', under), ('model', logreg)]
pipeline = Pipeline(steps=steps)

# define grid search; parameters are addressed through the 'model' pipeline step
grid = {'model__solver': solvers, 'model__penalty': penalty, 'model__C': c_values}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=pipeline, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

for i in range(len(imputed_features)):
  df = imputed_features[i]
  X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:35], df['response_3b'], test_size=0.2, random_state=42)

  # The pipeline resamples only while fitting; prediction uses X_test untouched
  grid_result = grid_search.fit(X_train, y_train)
  grid_predict = grid_result.predict(X_test)

  # summarize results
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_), imputed_features_names[i])
  print(confusion_matrix(y_test, grid_predict))
  print(classification_report(y_test, grid_predict))

### SVM

In [None]:
# fit a svm on an imbalanced classification dataset
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC

# Support vector classifier with the 'scale' gamma heuristic
model = SVC(gamma='scale')
# 10-fold stratified cross-validation, repeated 3 times with a fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

for df in imputed_features:
  features = df.iloc[:, 0:35]
  target = df['response_3b']
  X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)
  # Cross-validated ROC AUC on the training part only
  scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
  # Report the average over all folds and repeats
  print('Mean ROC AUC: %.3f' % mean(scores))



In [None]:
# define model
model = SVC(gamma='scale')
# define grid: candidate class weights, from favouring class 0 to favouring class 1
balance = [{0:100,1:1}, {0:10,1:1}, {0:1,1:1}, {0:1,1:10}, {0:1,1:100}]
param_grid = dict(class_weight=balance)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='roc_auc')

for df in imputed_features:
  X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:35], df['response'], test_size=0.2, random_state=0)
  # execute the grid search
  grid_result = grid.fit(X_train, y_train)
  # report the best configuration
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  # report all configurations
  means = grid_result.cv_results_['mean_test_score']
  stds = grid_result.cv_results_['std_test_score']
  params = grid_result.cv_results_['params']
  # The loop variable is deliberately not called `mean`: that name is bound to
  # numpy.mean by an earlier import and would otherwise be overwritten here.
  for mean_score, stdev, param in zip(means, stds, params):
      print("%f (%f) with: %r" % (mean_score, stdev, param))

### XGBoost 

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

model = XGBClassifier()

# SMOTE over-sampling followed by random under-sampling, both seeded.
# The samplers are chained with the classifier in one imblearn Pipeline so that,
# during cross-validation, only the training part of each fold is resampled.
# Resampling the whole training set up front would place synthetic samples in
# the validation folds and inflate the scores.
# NOTE(review): SMOTE rejects a sampling_strategy below the minority/majority
# ratio already present; the class counts printed earlier in this notebook
# (1305 vs 3403, about 0.38) suggest 0.35 may be too low -- confirm.
over = SMOTE(sampling_strategy=0.35, random_state=1)
under = RandomUnderSampler(sampling_strategy=1, random_state=1)
steps = [('o', over), ('u', under), ('model', model)]
pipeline = Pipeline(steps=steps)

# Hyperparameters, addressed through the 'model' pipeline step
hyperparameter_grid = {
    'model__n_estimators': [100, 500, 900, 1100, 1500],
    'model__max_depth': [2, 3, 5, 10, 15],
    'model__learning_rate': [0.05, 0.1, 0.15, 0.20],
    'model__min_child_weight': [1, 2, 3, 4]
    }

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=pipeline, param_grid=hyperparameter_grid, n_jobs=-1, cv=cv, scoring='roc_auc',error_score=0)

for df in imputed_features:
  X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:35], df['response'], test_size=0.2, random_state=0)

  # The pipeline resamples only while fitting; prediction uses X_test untouched
  grid_result = grid_search.fit(X_train, y_train)
  grid_predict = grid_search.predict(X_test)

  # summarize results
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  print(confusion_matrix(y_test, grid_predict))
  print(classification_report(y_test, grid_predict))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

def model_RF(fdf, response):
  """Fit a random forest classifier and report its AUC and classification report.

  Parameters
  ----------
  fdf : pandas.DataFrame
      Feature frame; its first 35 columns are used as predictors. It is not
      modified.
  response : str
      Name of the column of the notebook-level ``main_df`` to use as the target.
      The target is joined to ``fdf`` on the row index. The AUC computation
      assumes a binary target.

  Returns
  -------
  numpy.ndarray
      Predicted class labels for the 20% hold-out split.
  """
  # A classifier is required: the regressor variant has no predict_proba, and
  # classification_report needs discrete predictions.
  from sklearn.ensemble import RandomForestClassifier

  # Work on a copy so the caller's frame does not gain a 'y' column
  X = fdf.copy()
  # Pull the correct response variable from the dataframe
  X['y'] = main_df[response]
  # Remove all rows with missing observations
  X = X.dropna()
  # Split data into test and train; the target is the appended last column
  X_train, X_test, y_train, y_test = train_test_split(X.iloc[:, 0:35], X.iloc[: , -1].astype(int), test_size=0.20, random_state=0)
  # Implement the model
  classifier = RandomForestClassifier(n_estimators = 100, random_state = 0)
  classifier.fit(X_train, y_train)
  y_pred = classifier.predict(X_test)

  # use model to predict probability that given y value is 1
  y_pred_proba = classifier.predict_proba(X_test)[:, 1]

  # calculate AUC of model
  auc = metrics.roc_auc_score(y_test, y_pred_proba)

  # print AUC score
  print(auc)

  # Per-class precision, recall and F-score
  report = classification_report(y_test, y_pred)
  print(report)

  return y_pred

## Model Imports

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics

# Pipeline

## Imputation and Target Variable Selection

In [None]:
# NOTE(review): no definition of compare_denisty is visible in this notebook; on a
# fresh kernel this call would raise NameError -- confirm where it is defined.
compare_denisty("CPH_EV_AGE_CALC",featuresKNN)

# New workflow

## Split

In [None]:
# Restore the event type from the raw data, keep only baseline rows,
# then drop the last column of the result
main_df['CPH_EV_EVENT_TYPE'] = df_raw['CPH_EV_EVENT_TYPE']
baseline_mask = main_df['CPH_EV_EVENT_TYPE'] == 'Baseline'
main_df_base = main_df[baseline_mask].iloc[:, :-1]

In [None]:
# Convert binary (0/1) indicator columns to boolean.
# main_df_base is a filtered slice, so take a copy before assigning into it.
main_df_base = main_df_base.copy()

for column in main_df_base:
  values = main_df_base[column]
  # astype(bool) maps NaN to True; columns with missing entries therefore stay
  # numeric so that NaN survives until the imputation step.
  if values.isna().any():
    continue
  # Require exactly the two values 0 and 1: a continuous column that merely
  # spans [0, 1] is not an indicator.
  if values.nunique() == 2 and values.isin([0, 1]).all():
    main_df_base[column] = values.astype(bool)

In [None]:
# Column dtypes of the baseline subset
main_df_base.dtypes

In [None]:
# Hold out 33% of the baseline rows; the first 35 columns are the features.
# NOTE(review): 'response_3a' is not among the columns selected into main_df in the
# subsetting cell (only 'response_3b' is) -- confirm the intended target column.
X_train, X_test, y_train, y_test = train_test_split(main_df_base.iloc[:, 0:35], main_df_base['response_3a'], test_size=0.33, random_state=42)

## Imputing

In [None]:
# Fit the chained-equation imputer on the training data only and reuse the
# fitted model for the test data. Fitting a second imputer on the test set
# would make the two sets inconsistent and let test information shape the
# preprocessing.
# The settings mirror those used by nan2mice.
mice_imputer = IterativeImputer(estimator=LinearRegression(), missing_values=np.nan, max_iter=50,
                                imputation_order='roman', random_state=0)
X_train_imputed = pd.DataFrame(mice_imputer.fit_transform(X_train),
                               columns=list(X_train.columns), index=X_train.index)
X_test_imputed = pd.DataFrame(mice_imputer.transform(X_test),
                              columns=list(X_test.columns), index=X_test.index)

## Scaling 

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler

rs = RobustScaler()

# Columns to scale are chosen on the training frame; the test frame uses the
# same columns (the original selected them from the training frame twice, so
# the "scaled test set" was actually training data).
to_scale_train = X_train_imputed.select_dtypes(include=[np.float64])
to_scale_test = X_test_imputed[to_scale_train.columns]

# Fit the scaler on the training data only, then apply the same centring and
# scaling to the test data.
X_train_imputed_scaled = rs.fit_transform(to_scale_train)
X_test_imputed_scaled = rs.transform(to_scale_test)

In [None]:
# The scaled arrays become frames with a fresh 0..n-1 index, while y_train and
# y_test still carry the original row labels. The targets are therefore attached
# by position (as plain arrays); a label-aligned assignment would pair rows with
# the wrong targets or fill them with NaN.
x_df_train = pd.DataFrame(X_train_imputed_scaled)
x_df_train[35] = np.asarray(y_train)
x_df_train = x_df_train.dropna()

x_df_test = pd.DataFrame(X_test_imputed_scaled)
x_df_test[35] = np.asarray(y_test)
x_df_test = x_df_test.dropna()

# New Imputation

In [45]:
# Empty frame with one column per feature; filled by the imputation cell below
stub_columns = [
    'CPH_EV_AGE_CALC',
    'TBL.PATIENT.INFO..PI_BL_GENDER',
    'CPH_EV_CI_BMI_CALC',
    'TBL.ALL.EVENTS..AE_SF_ALCO_XS',
    'insulin_resistance',
    'hypertensive',
    'waist_to_hip_ratio',
    'idf_metabolic_syndrome',
    'eGFR',
    'dyslipidaemia',
    'fibroscan_stiffness_reliable',
    'TBL.ALL.EVENTS..AE_BR_ALT_iuL',
    'TBL.ALL.EVENTS..AE_BR_AST_iuL',
    'TBL.ALL.EVENTS..AE_BR_GGT_iuL',
    'TBL.ALL.EVENTS..AE_BR_FERR_ugL',
    'TBL.ALL.EVENTS..AE_BR_PLT_109L',
    'TBL.ALL.EVENTS..AE_BR_CREAT_umolL_CALC',
    'TBL.ALL.EVENTS..AE_BR_STG_mmolL_CALC',
    'TBL.ALL.EVENTS..AE_BR_ALBU_gL_CALC',
    'TBL.ALL.EVENTS..AE_BR_BILI_umolL_CALC',
    'TBL.ALL.EVENTS..AE_BR_IGA',
    'TBL.ALL.EVENTS..AE_CD_OSA',
    'LIT_NB_CK18_M30',
    'LIT_NB_CK18_M65',
    'LIT_NB_PRO_C3',
    'LIT_NB_PRO_C6',
    'LIT_NB_ELF',
    'FIB4',
    'NFS',
    'APRI',
    'ADAPT',
    'FIBC3',
    'ABC3D',
    'BARD',
    'AST_ALT_Ratio',
]
stub_df = pd.DataFrame(columns=stub_columns)

In [None]:
# Display stub_df
stub_df

In [None]:
# Display stub_df (same expression as the previous cell)
stub_df

In [None]:
# Imports
import datetime

# Longest absolute gap between the baseline visit and another visit of the same
# patient for that visit to be allowed to donate a missing value.
MAX_IMPUTE_GAP = datetime.timedelta(days=200)

# Used for analytics: number of baseline values that were filled in
imputes = 0

# Add date, patient identifier and event type (aligned on the row index)
main_df['CPH_ADM_BIOPSY_EVENT_DATE'] = df_raw['CPH_ADM_BIOPSY_EVENT_DATE']
main_df['SPIC'] = df_raw['SPIC']
main_df['CPH_EV_EVENT_TYPE'] = df_raw['CPH_EV_EVENT_TYPE']
# Drop observations with no date
main_df = main_df[main_df['CPH_ADM_BIOPSY_EVENT_DATE'].notna()]
# Group data by patient ID
main_df_grouped = main_df.groupby('SPIC')

# The first 35 columns are the features eligible for imputation
feature_columns = list(main_df.columns[:35])
# Baseline rows are collected here and combined once after the loop
baseline_records = []

for group_name, df_group in main_df_grouped:
  # Remove duplicate rows and order the patient's visits by date
  df_group = df_group.drop_duplicates().sort_values(by=['CPH_ADM_BIOPSY_EVENT_DATE'])
  # Select the baseline observations
  baseline_rows = df_group.loc[df_group['CPH_EV_EVENT_TYPE'] == "Baseline"]
  # Without a baseline there is nothing to impute into
  if baseline_rows.empty:
    print('No baseline for paitent: ' + str(group_name))
    continue
  # If a patient has several baseline rows, the earliest one is used
  baseline = baseline_rows.iloc[0].copy()
  # Every other visit of the same patient is a potential donor
  others = df_group.drop(index=baseline.name)
  # Absolute time difference between each other visit and the baseline
  gap = (others['CPH_ADM_BIOPSY_EVENT_DATE'] - baseline['CPH_ADM_BIOPSY_EVENT_DATE']).abs()
  # Keep only visits close enough in time to the baseline
  donors = others.loc[gap < MAX_IMPUTE_GAP]
  if not others.empty and donors.empty:
    print('Time difference to large for paitent: ' + str(group_name))
  for column in feature_columns:
    # Only missing baseline values are candidates for imputation
    if not pd.isna(baseline[column]):
      continue
    # Skip donors that are themselves missing; take the first observed value
    # in date order
    candidates = donors[column].dropna()
    if candidates.empty:
      continue
    new_val = candidates.iloc[0]
    # Write the value into the baseline row itself
    baseline[column] = new_val
    imputes = imputes + 1
    print('Imputing: ' + column + ' With: ' + str(new_val))
  baseline_records.append(baseline)

if baseline_records:
  # Build the result once; growing a frame row by row inside the loop is
  # quadratic. infer_objects restores column dtypes lost in the row-wise Series.
  imputed_baselines = pd.DataFrame(baseline_records)
  stub_df = pd.concat([stub_df, imputed_baselines], ignore_index=True).infer_objects()
  # Remove duplicates (e.g. when this cell is run more than once)
  stub_df = stub_df.drop_duplicates()

In [None]:
# Display stub_df with duplicate rows removed (the result is not assigned)
stub_df.drop_duplicates()