In [1]:
import pandas as pd
from statsmodels.formula.api import glm
import numpy as np

# --------- Helper functions ---------

def gender_to_binary(gender):
    """
    Convert a gender label (male/female) to a numeric code for modeling.

    Encoding: male = 1, female = 2 (0 is deliberately not used, for
    modeling purposes).

    parameters: gender is a string of gender (male or female). Matching is
        case-insensitive and ignores surrounding whitespace, so "Male",
        "MALE", "male" and " Male " are all treated as male.
    returns: 1 if gender is a spelling of "male", otherwise 2. Any value
        that is not recognised as male (including non-string values such
        as None/NaN) falls through to 2.
    """
    # Normalise before comparing; a raw file may spell the label in any case.
    if isinstance(gender, str) and gender.strip().lower() == "male":
        return 1
    return 2

# --------- End of helper functions --------- 


# Input files, one per measurement type (paths are relative to the notebook).
a1c_fn, glucose_fn, bmi_fn = 'a1c.txt', 'glucose.txt', 'bmi.txt'

# Each file is tab-separated; load them in the same order as declared above.
df_a1c, df_glucose, df_bmi = [
    pd.read_csv(path, sep='\t') for path in (a1c_fn, glucose_fn, bmi_fn)
]

In [2]:
# Build one row per patient per age, carrying the a1c, glucose and bmi readings.

# Separator used inside merge_ID. Without one, the key is ambiguous:
# patient 1 at age 23 and patient 12 at age 3 would both yield "123" and
# rows belonging to different patients could be joined together.
MERGE_ID_SEP = '_'

for frame in (df_a1c, df_glucose, df_bmi):
    # Age at the time of the observation.
    frame['age'] = frame['year_of_obs'] - frame['birth_year']
    # Composite join key "<patient_ID>_<age>".
    frame['merge_ID'] = (
        frame['patient_ID'].astype('str') + MERGE_ID_SEP + frame['age'].astype('str')
    )

# --- a1c + glucose ---
# Overlapping column names get the _x (a1c side) / _y (glucose side) suffixes;
# the demographic columns are kept from the glucose side only.
merged_df = df_a1c.merge(df_glucose, on='merge_ID')
merged_df = merged_df.drop(
    columns=['year_of_obs_x', 'birth_year_x', 'gender_x', 'patient_ID_x', 'age_x']
)
# Positional rename: relies on the column order that remains after the drop
# (a1c value, merge_ID, glucose value, then the glucose-side demographics).
merged_df.columns = ['a1c', 'merge_ID', 'glucose', 'year_of_obs',
                     'birth_year', 'gender', 'patient_ID', 'age']

# --- + bmi ---
# Here _x is the frame built above and _y is the bmi file; only the bmi
# reading ('obs_value') is taken from the bmi side.
merged_df = merged_df.merge(df_bmi, on='merge_ID')
merged_df = merged_df[['a1c', 'glucose', 'obs_value', 'year_of_obs_x', 'birth_year_x',
                       'gender_x', 'patient_ID_x', 'age_x']]
# Rename by name (not position); rename() returns a fresh frame, so the
# column assignments below do not write into a slice of the merge result.
merged_df = merged_df.rename(columns={
    'obs_value': 'bmi',
    'year_of_obs_x': 'year_of_obs',
    'birth_year_x': 'birth_year',
    'gender_x': 'gender',
    'patient_ID_x': 'patient_ID',
    'age_x': 'age',
})

# Numeric gender code (1 = male, 2 = female) and the age x gender interaction term.
merged_df['gender'] = merged_df['gender'].apply(gender_to_binary)
merged_df['age_gender'] = merged_df['age'] * merged_df['gender']

merged_df.head()

Unnamed: 0,a1c,glucose,bmi,year_of_obs,birth_year,gender,patient_ID,age,age_gender
0,6.255829,77.264588,27.990475,2010,1948,1,181,62,62
1,5.977413,74.575771,27.990475,2011,1948,1,181,63,63
2,5.827048,91.568176,27.990475,2012,1948,1,181,64,64
3,6.332716,99.539456,27.990475,2013,1948,1,181,65,65
4,6.331426,98.075706,27.990475,2014,1948,1,181,66,66


In [3]:
# Model 1: regress each lab value on age, gender and the age x gender
# interaction. No family is passed to glm, and the printed summary reports
# a Gaussian family with an identity link.
a1c_model = glm(formula='a1c ~ age + gender + age_gender', data=merged_df)
glucose_model = glm(formula='glucose ~ age + gender + age_gender', data=merged_df)

# Fit and report a1c first, followed by blank lines to separate the two tables.
a1c_result = a1c_model.fit()
print(a1c_result.summary(), '\n\n\n')

# Then glucose.
glucose_result = glucose_model.fit()
print(glucose_result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    a1c   No. Observations:                10824
Model:                            GLM   Df Residuals:                    10820
Model Family:                Gaussian   Df Model:                            3
Link Function:               identity   Scale:                         0.46297
Method:                          IRLS   Log-Likelihood:                -11189.
Date:                Sun, 29 Nov 2020   Deviance:                       5009.3
Time:                        02:00:39   Pearson chi2:                 5.01e+03
No. Iterations:                     3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      8.1560      0.189     43.098      0.0

In [4]:
# Add each patient's previous a1c and glucose reading as lag features
# (bmi is not lagged here).

# Patient IDs in order of first appearance, captured before sorting.
lst_patient = merged_df['patient_ID'].unique()

# Sort chronologically within each patient. Sorting on patient_ID alone is
# not enough: the default single-column sort is not stable, so a patient's
# rows could end up in arbitrary year order and "previous" would be wrong.
merged_df.sort_values(by=['patient_ID', 'year_of_obs'], inplace=True)

# shift(1) inside each patient group yields the prior observation's value;
# the first observation of every patient has no predecessor and stays NaN.
by_patient = merged_df.groupby('patient_ID', sort=False)
merged_df['prev_a1c'] = by_patient['a1c'].shift(1)
merged_df['prev_glucose'] = by_patient['glucose'].shift(1)
    

In [6]:
# Model 2: regress each lab value on age, gender and the patient's previous
# reading of the same lab. The printed summary reports fewer observations
# than for the earlier models, which would be the rows whose previous
# reading is NaN being left out of the fit.
a1c_model_2 = glm(formula='a1c ~ age + gender + prev_a1c', data=merged_df)
glucose_model_2 = glm(formula='glucose ~ age + gender + prev_glucose', data=merged_df)

# Fit and report a1c first, followed by blank lines to separate the two tables.
a1c_result_2 = a1c_model_2.fit()
print(a1c_result_2.summary(), '\n\n\n')

# Then glucose.
glucose_result_2 = glucose_model_2.fit()
print(glucose_result_2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    a1c   No. Observations:                10769
Model:                            GLM   Df Residuals:                    10765
Model Family:                Gaussian   Df Model:                            3
Link Function:               identity   Scale:                        0.015533
Method:                          IRLS   Log-Likelihood:                 7146.9
Date:                Sun, 29 Nov 2020   Deviance:                       167.21
Time:                        02:02:21   Pearson chi2:                     167.
No. Iterations:                     3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0808      0.013      6.208      0.0