# Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from os.path import isfile

from tqdm.auto import tqdm

pd.options.display.float_format = '{:.3f}'.format

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from sklearn.model_selection import KFold
from sklearn.utils import shuffle

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, r2_score
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler

# Functions

In [None]:
def extract_data(df_in, feature, model):
    """
    :param df_in: The df holding both the rows where feature is known and where it is missing
    :param feature: The column to impute
    :param model: Name of model to use (lin, tree, knn); for 'knn' the input is standardised

    Splits df_in into training data (rows where feature is known) and the data that
    still has to be imputed (rows where feature is missing).

    return: X, y, X_missing and the index of the rows X_missing was built from
    """
    # Rows where the feature is KNOWN; ~observed selects the rows that need imputation
    observed = ~df_in[feature].isnull()

    # Which columns to use for training: everything except the identifier/visit
    # columns and the target itself, restricted to columns without any missing value
    cols = df_in.drop(['subjid', 'visit', 'studyid', 'visdy', 'visstat', 'hdcat', 'seq'] + [feature], axis=1)\
                .dropna(axis='columns').columns

    # The labels
    y = df_in.loc[observed, feature].values

    mhx_cols = ['hxtobcpd', 'hxtobyos', 'hxpacky', 'hxmarfrq', 'hxherfrq', 'hxcocfrq', 'hxclbfrq', 'hxampfrq',
                'hxritfrq', 'hxhalfrq', 'hxinhfrq', 'hxopifrq', 'hxpakfrq', 'hxbarfrq', 'hxtrqfrq']
    one_row_per_subject = (('age' in feature or feature in ['rtrddur', 'sxfam', 'sxsubj', 'hddiagn'])
                           or (feature in mhx_cols))

    if one_row_per_subject:
        # Keep one row per subject: the first row in which the feature is missing.
        # groupby().first() takes the first non-null value per column; the predictor
        # columns contain no NaN, so their values all come from that first row.
        df_missing = df_in.loc[~observed, :].reset_index().groupby('subjid').first().reset_index()
        # Restore the original row labels so the caller can write predictions back
        df_missing.index = df_missing['index']
        df_missing = df_missing.drop('index', axis=1)
    else:
        # Keep all visits
        df_missing = df_in.loc[~observed, :]

    X = df_in.loc[observed, cols].values
    X_missing = df_missing.loc[:, cols].values

    # Knn is distance based: standardise with a scaler fitted on all rows of df_in
    if model == 'knn':
        scaler = StandardScaler()
        scaler.fit(df_in.loc[:, cols].values)
        X = scaler.transform(X)
        X_missing = scaler.transform(X_missing)

    return X, y, X_missing, df_missing.index

In [None]:
def choose_model(model, problem):
    """
    :param model: Name of model to use: 'lin', 'tree'; any other value selects Knn
    :param problem: The prediction problem: 'reg', 'ordinal' or 'class'

    Creates a new, unfitted estimator. 'reg' and 'ordinal' get a regressor,
    'class' gets a classifier.

    return: the unfitted estimator
    raises ValueError: if problem is not one of 'reg', 'ordinal', 'class'
    """
    if problem not in ('reg', 'ordinal', 'class'):
        # Previously this fell through and failed with an UnboundLocalError on `reg`
        raise ValueError("Unknown problem '{}', expected 'reg', 'ordinal' or 'class'".format(problem))

    classify = problem == 'class'

    if model == 'lin':
        if classify:
            return LogisticRegression(n_jobs=-1, class_weight='balanced', random_state=42)
        return LinearRegression(n_jobs=-1)

    if model == 'tree':
        if classify:
            return RandomForestClassifier(100, n_jobs=-1, class_weight='balanced', random_state=42)
        return RandomForestRegressor(100, n_jobs=-1, random_state=42)

    # Every other model name falls back to Knn
    if classify:
        return KNeighborsClassifier(n_neighbors=5, weights='distance', n_jobs=-1)
    return KNeighborsRegressor(n_neighbors=5, weights='distance', n_jobs=-1)

In [None]:
def train_model(X, y, feature, model='knn', problem='reg', k=10):
    """
    :param X: The input data, one row per labelled sample
    :param y: The labels belonging to X
    :param feature: The column to impute (labels the statistics and selects how predictions are clipped)
    :param model: Name of model to use (lin, tree, knn)
    :param problem: The prediction problem (reg, ordinal, class)
    :param k: The number of K-folds for the cross validation

    Trains one model per fold using K-fold cross validation and evaluates every
    model on the training part and on the held-out part of its fold.

    return: list with the k fitted models, train statistics per fold, test statistics per fold.
            A statistics row is [feature, missing, model, problem, fold, score, mae, rmse].
            If fitting fails, an empty model list and two all-zero (k, 8) arrays are returned,
            so the caller can always unpack three values.
    """
    kfold_models = []
    train_stats = []
    test_stats = []

    # Loop invariants.
    # NOTE: `df` is the notebook-level dataframe, not an argument; the count is the
    # number of rows of that frame which are not part of X.
    n_missing = df.shape[0] - X.shape[0]
    label_mean = np.mean(y)

    # Create K-folds
    kf = KFold(n_splits=k, random_state=42, shuffle=True)
    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Choose model
        reg = choose_model(model, problem)

        # Train model on fold
        try:
            reg.fit(X_train, y_train)
        except Exception as exc:
            # Report what failed instead of swallowing the reason
            print('err', feature, model, repr(exc))
            return [], np.zeros((k, 8)), np.zeros((k, 8))

        # Add to list of models
        kfold_models.append(reg)

        # Get statistics of model
        meta = [feature, n_missing, model, problem, fold]
        train_stats.append(meta + evaluation(feature, label_mean, X_train, y_train, reg, problem))
        test_stats.append(meta + evaluation(feature, label_mean, X_test, y_test, reg, problem))

    return kfold_models, train_stats, test_stats

In [None]:
def clip_prediction(feature, pred):
    """
    :param feature: The column the prediction belongs to
    :param pred: The prediction(s): an array or a single number

    Rounds the prediction to whole numbers and, for features with a fixed scale,
    clips it to that scale. 'indepscl' is rounded to the nearest multiple of 5 instead.

    return: the rounded (and clipped) prediction as integers
    """
    # Plain Python numbers are wrapped in an array, except for these features
    not_wrapped = ['pf', 'rp', 'bp', 'gh', 'vt', 'sf', 're', 'mh', 'pcs', 'mcs']
    if isinstance(pred, (int, float)) and feature not in not_wrapped:
        pred = np.array(pred)

    # (features, lowest value, highest value)
    scales = (
        # HX drug abuse
        (['hxmarfrq', 'hxherfrq', 'hxcocfrq', 'hxclbfrq', 'hxampfrq', 'hxritfrq', 'hxhalfrq', 'hxinhfrq',
          'hxopifrq', 'hxpakfrq', 'hxbarfrq', 'hxtrqfrq'], 1, 3),
        # Drug abuse
        (['marfrq', 'herfrq', 'cocfrq', 'clbfrq', 'ampfrq', 'ritfrq', 'halfrq', 'inhfrq', 'opifrq',
          'pakfrq', 'barfrq', 'trqfrq'], 1, 3),
        # Motorscore 0-4
        (['ocularh', 'ocularv', 'sacinith', 'sacinitv', 'sacvelh', 'sacvelv', 'dysarth', 'tongue',
          'fingtapr', 'fingtapl', 'prosupr', 'prosupl', 'luria', 'rigarmr', 'rigarml', 'brady',
          'dysttrnk', 'dystrue', 'dystlue', 'dystrle', 'dystlle', 'chorface', 'chorbol', 'chortrnk',
          'chorrue', 'chorlue', 'chorrle', 'chorlle', 'gait', 'tandem', 'retropls', 'diagconf'], 0, 4),
        # TFCscore 0-3
        (['occupatn', 'finances', 'adl'], 0, 3),
        # TFCscore 0-2
        (['chores', 'carelevl'], 0, 2),
        # PBA 0-4
        (['pbas1sv', 'pbas1fr', 'pbas1wo', 'pbas2sv', 'pbas2fr', 'pbas2wo', 'pbas3sv', 'pbas3fr',
          'pbas3wo', 'pbas4sv', 'pbas4fr', 'pbas4wo', 'pbas5sv', 'pbas5fr', 'pbas5wo', 'pbas6sv',
          'pbas6fr', 'pbas6wo', 'pbas7sv', 'pbas7fr', 'pbas7wo', 'pbas8sv', 'pbas8fr', 'pbas8wo',
          'pbas9sv', 'pbas9fr', 'pbas9wo', 'pbas10sv', 'pbas10fr', 'pbas10wo', 'pbas11sv', 'pbas11fr',
          'pbas11wo'], 0, 4),
    )
    for names, lowest, highest in scales:
        if feature in names:
            return np.round(pred, 0).astype(int).clip(lowest, highest)

    if feature == 'indepscl':
        # Fascore: no negative values, transform to nearest 5 percentage
        return (5 * np.round(pred.clip(0) / 5, 0)).astype(int)

    # Everything else: plain ints
    return np.round(pred, 0).astype(int)

In [None]:
def evaluation(feature, m, X, y, reg, problem):
    """
    :param feature: which feature is predicted (selects how predictions are clipped)
    :param m: the mean of the labels, used as baseline prediction for the r2 score
    :param X: the input data
    :param y: the labeled data
    :param reg: the fitted model to predict values
    :param problem: the type of prediction: class, ordinal or reg

    Prediction is done and then the values are clipped and rounded to the correct format.
    Format depends on the feature that is predicted.

    return: [score, mae, rmse] where score is r2 (reg) or weighted f1 (class, ordinal).
            For ordinal problems mae and rmse are calculated over the wrong predictions only.
    raises ValueError: if problem is not one of 'reg', 'ordinal', 'class'
    """
    pred = clip_prediction(feature, reg.predict(X))
    mean = clip_prediction(feature, m)

    if problem == 'reg':
        # r2 relative to the clipped mean as baseline
        SS_tot = np.sum(np.power(y - mean, 2))
        SS_res = np.sum(np.power(y - pred, 2))
        score = 1 - (SS_res/SS_tot)
    elif problem == 'class' or problem == 'ordinal':
        score = f1_score(y, pred, average='weighted')
    else:
        # Previously this fell through and failed with an UnboundLocalError on `score`
        raise ValueError("Unknown problem '{}', expected 'reg', 'ordinal' or 'class'".format(problem))

    if problem == 'ordinal':
        # Only take wrong values
        dif = y - pred
        wrong = dif[dif != 0]
        if len(wrong) == 0:
            # Every prediction is correct. The mean of an empty array would be NaN
            # and rows with NaN are removed by the dropna() before best_models.
            mae = 0.0
            rmse = 0.0
        else:
            mae = np.mean(np.abs(wrong))
            rmse = np.sqrt(np.mean(np.power(wrong, 2)))
    else:
        mae = mean_absolute_error(y, pred)
        rmse = np.sqrt(mean_squared_error(y, pred))

    return [score, mae, rmse]

# Load data

In [None]:
# Read the dataset that still contains missing values; `df` is the notebook-level
# frame that later cells (and train_model) refer to
df = pd.read_csv('data/filtered_pre_and_manifest.csv')
df.head()

In [None]:
# Show all column names of the loaded dataset
df.columns.values

<b> Divide categorical and ordinal columns </b>

In [None]:
# Columns handled as classification problems ('class') by train_imputation_models
cat_cols = ['emplusl', 'emplany', 'volunt', 'fafinan', 'grocery', 'cash', 'supchild', 'drive', 'housewrk',
            'laundry', 'prepmeal', 'telephon', 'ownmeds', 'feedself', 'dress', 'bathe', 'pubtrans', 'walknbr',
            'walkfall', 'walkhelp', 'comb', 'trnchair', 'bed', 'toilet', 'carehome',
           ]

# Columns handled as ordinal problems ('ordinal') by train_imputation_models.
# Every column that is in neither list is handled as a regression problem ('reg').
ord_cols = ['hxmarfrq', 'hxherfrq', 'hxcocfrq', 'hxclbfrq', 'hxampfrq', 'hxritfrq', 'hxhalfrq', 'hxinhfrq',
            'hxopifrq', 'hxpakfrq', 'hxbarfrq', 'hxtrqfrq', # medical
            'marfrq', 'herfrq', 'cocfrq', 'clbfrq', 'ampfrq', 'ritfrq', 'halfrq', 'inhfrq', 'opifrq',
            'pakfrq', 'barfrq', 'trqfrq', # General 1
            'ocularh', 'ocularv', 'sacinith', 'sacinitv', 'sacvelh', 'sacvelv', 'dysarth', 'tongue', 'fingtapr',
            'fingtapl', 'prosupr', 'prosupl', 'luria', 'rigarmr', 'rigarml', 'brady', 'dysttrnk', 'dystrue',
            'dystlue', 'dystrle', 'dystlle', 'chorface', 'chorbol', 'chortrnk', 'chorrue', 'chorlue', 'chorrle',
            'chorlle', 'gait', 'tandem', 'retropls', 'diagconf', # Motor
            'occupatn', 'finances', 'chores', 'adl', 'carelevl', # TFC
            'indepscl', # Fascore
            'pbas1sv', 'pbas1fr', 'pbas1wo', 'pbas2sv',
            'pbas2fr', 'pbas2wo', 'pbas3sv', 'pbas3fr', 'pbas3wo', 'pbas4sv', 'pbas4fr', 'pbas4wo', 'pbas5sv',
            'pbas5fr', 'pbas5wo', 'pbas6sv', 'pbas6fr', 'pbas6wo', 'pbas7sv', 'pbas7fr', 'pbas7wo', 'pbas8sv',
            'pbas8fr', 'pbas8wo', 'pbas9sv', 'pbas9fr', 'pbas9wo', 'pbas10sv', 'pbas10fr', 'pbas10wo',
            'pbas11sv', 'pbas11fr', 'pbas11wo', # PBA
           ]

### Train & evaluate models

In [None]:
def train_imputation_models(df_in, ordered_features, old_results, impute):
    """
    :param df_in: The df with missing values
    :param ordered_features: The features to train models for, in the order they are processed
    :param old_results: Tuple (train results df, test results df) of an earlier run;
                        variables that occur in the train results are skipped
    :param impute: Whether the missing values in df_in are imputed with the best model

    Trains a lin, tree and knn model (10-fold cross validation each) per feature.
    If impute is set, the missing values of the feature are filled in with the mean
    prediction of the fold models of the best scoring model, before the next feature
    is processed. The chosen model per feature is written to tables/best_models.csv.

    Uses the notebook-level names cat_cols, ord_cols, test_best1 (impute only) and imp_mhx.

    return: (df_in, train_results, test_results) if impute, else (train_results, test_results)
    """
    # Order matters: the test results of a feature are appended in this order
    model_names = ['lin', 'tree', 'knn']
    # Number of folds train_model uses by default
    n_folds = 10

    known_variables = old_results[0].variable.values.reshape(-1)
    original_df = df_in.copy()

    old_new_model = []

    train_results = list(old_results[0].values)
    test_results = list(old_results[1].values)

    pbar = tqdm(total=len(ordered_features))
    for f in ordered_features:
        # Skip features which have already been imputed
        if f in known_variables:
            pbar.set_description("Skipping...")
            pbar.update(1)
            continue

        subset = df_in.copy()

        # Get correct problem
        if f in cat_cols:
            p = 'class'
        elif f in ord_cols:
            p = 'ordinal'
        else:
            p = 'reg'

        if f in ['alcunits', 'tobcpd', 'tobyos', 'packy', 'cafpd', 'sbh1n', 'sbh3n', 'sbh4n',
                 'marfrq', 'herfrq', 'cocfrq', 'clbfrq', 'ampfrq', 'ritfrq', 'halfrq', 'inhfrq', 'opifrq',
                 'pakfrq', 'barfrq', 'trqfrq', # General 1
                ]:
            # Ignore features where 0 means does not do it or nothing
            subset = subset.loc[(subset[f] > 0)  | (df_in[f].isnull())]

        # If the number of not missing rows is smaller than the number of k-fold, continue
        if (~subset[f].isnull()).sum() <= 10 or subset.shape[0] <= 10:
            pbar.set_description("Skipping...")
            pbar.update(1)
            continue

        # Train models
        imputation_models = {}
        for model in model_names:
            pbar.set_description("Training {} model".format(model))
            input_data, labels, _, _ = extract_data(subset.copy(), f, model)
            imputation_models[model], train_s, test_s = train_model(input_data, labels, f, model=model, problem=p)
            for fold in range(n_folds):
                train_results.append(train_s[fold])
                test_results.append(test_s[fold])

        # Impute using best model
        if impute:
            # Column 5 is the score; the last rows are the folds of lin, tree and knn
            recent = np.array(test_results[-len(model_names) * n_folds:])
            mean_score = np.mean(recent[:, 5].reshape(len(model_names), n_folds).astype(float), axis=1)
            m = np.argmax(mean_score)

            # Check if new model is better than previous model
            if mean_score[m] < test_best1.loc[test_best1['variable'] == f, 'score'].mean():
                # The round 1 model scored better: retrain it on the data without imputed values
                model = test_best1.loc[test_best1['variable'] == f, 'model'].unique()[0]
                print(f, model)
                input_data, labels, missing_data, missing_index = extract_data(original_df.copy(), f, model)
                imputation_models[model], _, _ = train_model(input_data, labels, f, model=model, problem=p)
                pbar.set_description("Imputing " + f + ", using older " + model + " model")
                old_new_model.append([f, model, 1])
            else:
                # Resolve the winning model BEFORE extracting the missing data. The data has
                # to be prepared the same way as during training (only knn is standardised);
                # using the leftover loop variable here always prepared it for knn.
                model = model_names[m]
                _, _, missing_data, missing_index = extract_data(subset.copy(), f, model)
                if model == 'lin':
                    pbar.set_description("Imputing " + f + ", using linear model")
                elif model == 'tree':
                    pbar.set_description("Imputing " + f + ", using tree model")
                else:
                    pbar.set_description("Impute " + f + ", using Knn")
                old_new_model.append([f, model, 2])

            # Mean prediction over the fold models
            pred = np.array([imputer.predict(missing_data) for imputer in imputation_models[model]]).mean(axis=0)
            pred = clip_prediction(f, pred)

            # NOTE(review): extract_data also keeps a single row per subject for 'hddiagn', but
            # 'hddiagn' is not in this list, so its other rows stay missing - confirm this is intended
            if 'age' in f or f in ['rtrddur', 'sxfam', 'sxsubj', 'hxtobcpd', 'hxtobyos', 'hxpacky',
                                   'hxmarfrq', 'hxherfrq', 'hxcocfrq', 'hxclbfrq', 'hxampfrq','hxritfrq',
                                   'hxhalfrq', 'hxinhfrq', 'hxopifrq', 'hxpakfrq', 'hxbarfrq', 'hxtrqfrq']:
                # Predicted for one row per subject, copy to the following rows of that subject
                df_in = imp_mhx(df_in.copy(), f, missing_index, pred)
            else:
                df_in.loc[missing_index,f] = pred
        # End loop
        pbar.update(1)
    pbar.close()

    # Save best model per round
    if impute:
        pd.DataFrame(old_new_model, columns=['variable', 'model', 'round']).to_csv('tables/best_models.csv', index=False)

    train_results = pd.DataFrame(train_results, columns=['variable', 'missing', 'model',
                                                         'problem', 'fold', 'score', 'mae', 'rmse'])
    test_results = pd.DataFrame(test_results, columns=['variable', 'missing', 'model',
                                                       'problem', 'fold', 'score', 'mae', 'rmse'])
    if impute:
        return df_in, train_results, test_results
    else:
        return train_results, test_results

# Imputation round 1 (evaluation)

<b> Load results round 1 </b>

In [None]:
# Reuse the results of an earlier run when they were saved, else start with empty tables
train_results1 = (
    pd.read_csv('tables/impute_train_1.csv')
    if isfile('tables/impute_train_1.csv')
    else pd.DataFrame([], columns=['variable', 'missing', 'model', 'problem', 'fold', 'score', 'mae', 'rmse'])
)

test_results1 = (
    pd.read_csv('tables/impute_test_1.csv')
    if isfile('tables/impute_test_1.csv')
    else pd.DataFrame([], columns=['variable', 'missing', 'model', 'problem', 'fold', 'score', 'mae', 'rmse'])
)

print(train_results1.shape)
print(test_results1.shape)

In [None]:
# Uncomment to reset results
# test_results1 = pd.DataFrame([], columns=['variable', 'missing', 'model', 'problem', 'fold', 'score', 'mae', 'rmse'])
# train_results1 = pd.DataFrame([], columns=['variable', 'missing', 'model', 'problem', 'fold', 'score', 'mae', 'rmse'])

<b> Get missing features and remove features which can be calculated by using other variables </b>

In [None]:
# Number of missing values per column (ascending), only for columns that have missing
# values. Columns that are recalculated from other columns later on are left out.
mis = (
    df.isnull()
      .sum()
      .sort_values(ascending=True)
      .replace(0, np.nan)
      .dropna()
      .drop(['motscore', 'fascore', 'tfcscore',
             'depscore', 'irascore', 'psyscore', 'aptscore', 'exfscore',
             'hxpacky', 'packy',
             # 'hddiagn',
            ], axis='index')
)
mis.head()

<b> Evaluate imputation models </b>

In [None]:
# Round 1 only evaluates the models (impute=False): nothing is filled in.
# Features are processed from the fewest to the most missing values (order of mis).
train_results1, test_results1 = train_imputation_models(df.copy(), mis.index,
                                                        (train_results1, test_results1),
                                                        impute=False
                                                       )

<b> Save results </b>

In [None]:
# Combine old and new results, if needed
# train_results1 = to_df(train_results1, new_train_results1, 'train_1')
# test_results1 = to_df(test_results1, new_test_results1, 'test_1')
# `if True` is a manual switch: set it to False to keep the saved CSV files untouched
if True:
    train_results1.to_csv('tables/impute_train_1.csv', index=False)
    test_results1.to_csv('tables/impute_test_1.csv', index=False)
else:
    print('Not saved')

<b> Get best models per variable </b>

In [None]:
def best_models(scores):
    """
    :param scores: Performance of the imputation models, one row per variable, model and fold,
                   with the columns variable, missing, model, problem, fold, score, mae, rmse

    Calculate the best models based on the mean r2/f1 score of the folds per feature

    return: the fold rows of the best model of every variable, ordered from the
            variable with the highest mean score to the one with the lowest
    """
    columns = ['variable', 'missing', 'model', 'problem', 'fold', 'score', 'mae', 'rmse']

    # Mean score per variable and model. Only the score is averaged: taking the mean
    # of the whole frame fails on the text column `problem` in recent pandas versions.
    means = scores.groupby(["variable", "model"])['score'].mean().reset_index()
    # Models per variable with highest mean score
    models = means.loc[means.groupby('variable')['score'].idxmax(), :]

    # Get values from all the best scores
    rows = []
    for variable, model in models.sort_values('score', ascending=False)[['variable', 'model']].values:
        # Match the exact (variable, model) pair
        selected = (scores['variable'] == variable) & (scores['model'] == model)
        rows.extend(scores.loc[selected, columns].values)

    out = pd.DataFrame(rows, columns=columns)
    out = out.astype({'variable': str, 'missing': int, 'model': str, 'problem': str, 'fold': int,
                      'score': float, 'mae': float, 'rmse': float})
    return out

In [None]:
# Rows with a missing value are left out before the best model per variable is selected.
# test_best1 is also read inside train_imputation_models when impute=True.
train_best1 = best_models(train_results1.dropna())
test_best1 = best_models(test_results1.dropna())

# Round 2 Impute

In [None]:
def imp_age(df_in, f, idx_mis, pred):
    """
    :param df_in: The df to impute (changed in place and returned)
    :param f: The age related column to impute
    :param idx_mis: Index labels of the rows the predictions belong to
    :param pred: The predictions, one per label in idx_mis

    Only the first visit is imputed
    After that recalculate the correct duration at the next visits
    In the next visit the patient might be 0 years older or 1+ years older

    Expects consecutive integer index labels with the visits of a subject in adjacent rows.

    return: df_in
    """
    # Add predictions to first visits for each subject
    df_in.loc[idx_mis,f] = pred
    for i in idx_mis:
        # Get the predicted age variable (x), subject (s) and age (a) in first visit
        x = df_in.loc[i,f]
        s = df_in.loc[i,'subjid']
        a = df_in.loc[i, 'age']
        visit = 1
        # Check if the next index exists (stops after the last row instead of raising
        # a KeyError) and is still the same subject
        while (i + visit) in df_in.index and s == df_in.loc[i + visit,'subjid']:
            # Take age from visit
            a_new = df_in.loc[i+visit, 'age']
            # Duration = prediction + (current age - first visit age)
            df_in.loc[i+visit,f] = x + (a_new - a)
            # Next index/visit
            visit += 1
    return df_in

In [None]:
def imp_mhx(df_in, f, idx_mis, pred):
    """
    :param df_in: The df to impute (changed in place and returned)
    :param f: The baseline column to impute
    :param idx_mis: Index labels of the rows the predictions belong to
    :param pred: The predictions, one per label in idx_mis

    Impute baseline variables.
    Only on the first visit a prediction is made
    Fill in the other visits using prediction at first visit

    Expects consecutive integer index labels with the visits of a subject in adjacent rows.

    return: df_in
    """
    df_in.loc[idx_mis,f] = pred
    for i in idx_mis:
        # Get prediction at first visit
        x = df_in.loc[i,f]
        # Get subject at first visit
        s = df_in.loc[i,'subjid']
        visit = 1
        # If the next row exists (stops after the last row instead of raising a KeyError)
        # and is still the same subject, impute using prediction at first visit
        while (i + visit) in df_in.index and s == df_in.loc[i + visit,'subjid']:
            df_in.loc[i+visit,f] = x
            visit += 1
    return df_in

<b> Load results round 2 </b>

In [None]:
# Reuse the results of an earlier run when they were saved, else start with empty tables
train_results2 = (
    pd.read_csv('tables/impute_train_2.csv')
    if isfile('tables/impute_train_2.csv')
    else pd.DataFrame([], columns=['variable', 'missing', 'model', 'problem', 'fold', 'score', 'mae', 'rmse'])
)

test_results2 = (
    pd.read_csv('tables/impute_test_2.csv')
    if isfile('tables/impute_test_2.csv')
    else pd.DataFrame([], columns=['variable', 'missing', 'model', 'problem', 'fold', 'score', 'mae', 'rmse'])
)

print(train_results2.shape)
print(test_results2.shape)

In [None]:
# Reset results. NOTE: these lines are active, so the results loaded in the previous
# cell are discarded. train_imputation_models skips every variable that occurs in the
# old results, so with cached results those variables would not be imputed.
train_results2 = pd.DataFrame([], columns=['variable', 'missing', 'model', 'problem', 'fold', 'score', 'mae', 'rmse'])
test_results2 = pd.DataFrame([], columns=['variable', 'missing', 'model', 'problem', 'fold', 'score', 'mae', 'rmse'])

<b> Impute in order of test_best1 </b>

In [None]:
# Round 2 trains again and fills in the missing values (impute=True). Variables are
# processed in the order of test_best1, i.e. best round 1 test score first.
new_df = df.copy()
new_df, train_results2, test_results2 = train_imputation_models(new_df, test_best1.variable.unique(),
                                                                (train_results2, test_results2),
                                                                impute=True,
                                                               )

<b> Save results </b>

In [None]:
# `if True` is a manual switch: set it to False to keep the saved CSV files untouched
if True:
    train_results2.to_csv('tables/impute_train_2.csv', index=False)
    test_results2.to_csv('tables/impute_test_2.csv', index=False)
else:
    print('Not saved')

<b> Get best model per variable </b>

In [None]:
# Best model per variable of round 2; rows with a missing value are left out first
train_best2 = best_models(train_results2.dropna())
test_best2 = best_models(test_results2.dropna())

<b> Columns with missing values </b>

In [None]:
# Columns with at least one missing value before imputation
df.loc[:,df.isnull().any()].columns

In [None]:
# Columns that still have at least one missing value after imputation
new_df.loc[:,new_df.isnull().any()].columns

<b> Recalculate variables with imputed values </b>

In [None]:
# The variables below were left out of the imputation; recalculate them from the
# (imputed) columns they are composed of.

# Pack years = packs per day (cigarettes per day * 0.05) * years of smoking
new_df['packy'] = ((new_df['tobcpd'] * 0.05) * new_df['tobyos']).round(1)
new_df['hxpacky'] = ((new_df['hxtobcpd'] * 0.05) * new_df['hxtobyos']).round(1)

# NOTE(review): 'diagconf' is summed together with the motor items - confirm it belongs in motscore
new_df['motscore'] = new_df[['ocularh', 'ocularv', 'sacinith', 'sacinitv', 'sacvelh', 'sacvelv', 'dysarth', 'tongue',
                             'fingtapr', 'fingtapl', 'prosupr', 'prosupl', 'luria', 'rigarmr', 'rigarml', 'brady',
                             'dysttrnk', 'dystrue', 'dystlue', 'dystrle', 'dystlle', 'chorface', 'chorbol', 'chortrnk',
                             'chorrue', 'chorlue', 'chorrle', 'chorlle', 'gait', 'tandem', 'retropls', 'diagconf']].sum(axis=1)

new_df['tfcscore'] = new_df[['occupatn', 'finances', 'chores', 'adl', 'carelevl']].sum(axis=1)

new_df['fascore'] = new_df[['emplusl', 'emplany', 'volunt', 'fafinan', 'grocery', 'cash', 'supchild', 'drive', 'housewrk',
                            'laundry', 'prepmeal', 'telephon', 'ownmeds', 'feedself', 'dress', 'bathe', 'pubtrans', 'walknbr',
                            'walkfall', 'walkhelp', 'comb', 'trnchair', 'bed', 'toilet', 'carehome']].sum(axis=1)

# PBA scores: sum of severity * frequency of the items of the score
new_df['depscore'] = ((new_df['pbas1sv'] * new_df['pbas1fr']) + (new_df['pbas2sv'] * new_df['pbas2fr']) +
                      (new_df['pbas3sv'] * new_df['pbas3fr']))

new_df['irascore'] = (new_df['pbas4sv'] * new_df['pbas4fr']) + (new_df['pbas5sv'] * new_df['pbas5fr'])

new_df['psyscore'] = (new_df['pbas9sv'] * new_df['pbas9fr']) + (new_df['pbas10sv'] * new_df['pbas10fr'])

new_df['aptscore'] = (new_df['pbas6sv'] * new_df['pbas6fr'])

new_df['exfscore'] = (new_df['pbas7sv'] * new_df['pbas7fr']) + (new_df['pbas8sv'] * new_df['pbas8fr'])

new_df['dbscore'] = (new_df['pbas11sv'] * new_df['pbas11fr'])

# Take mean, couldnt be imputed.
# Every column gets the mean of its OWN values (sbh3n and sbh4n used the mean of sbh1n),
# and only the missing values are filled in, so observed values are kept.
for _sbh_col in ['sbh1n', 'sbh3n', 'sbh4n']:
    new_df[_sbh_col] = new_df[_sbh_col].fillna(int(round(df[_sbh_col], 0).mean()))

<b> Remove remaining missing values </b>

In [None]:
# Number of columns that still contain at least one missing value
new_df.isnull().any().sum()

In [None]:
# Names of the columns that still contain missing values (these are dropped in the next cell)
new_df.loc[:,new_df.isnull().any()].columns

In [None]:
# Remove every column that still has a missing value
new_df.dropna(axis='columns', inplace=True)

<b> Save imputed dataset </b>

In [None]:
# Write the imputed dataset without the index column
new_df.to_csv('data/imputed_pre_and_manifest.csv', index=False)