In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the combined CHIS table that every later cell derives its data from.
combinedchis = pd.read_csv(filepath_or_buffer="data/combined_chis.csv")

## Separate input and output variables

In [3]:
# The inputs are every column except the final one (the final column is the target).
last_position = combinedchis.shape[1] - 1
X = combinedchis.iloc[:, :last_position]
X

Unnamed: 0,AA5C,AB1,AB100,AB112,AB113,AB115,AB117,AB118,AB119,AB127,...,AH141,SREDUC,AJ153V2_13,AJ154BV2_8,AJ154BV2_9,INS64_S,AK20_P1,AJ174_8,AJ174_9,AJ194_18
0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,...,,,,,,,,,,
1,-1.0,5.0,-1.0,-1.0,-1.0,2.0,2.0,2.0,-1.0,1.0,...,,,,,,,,,,
2,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,...,,,,,,,,,,
3,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,,,,,,,,,,
4,-1.0,2.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189118,-1.0,2.0,,-1.0,,,,,,,...,-1.0,3.0,-1.0,-1.0,-1.0,5.0,4.0,-1.0,-1.0,-1.0
189119,-1.0,1.0,,-1.0,,,,,,,...,-1.0,4.0,-1.0,2.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0
189120,-1.0,2.0,,-1.0,,,,,,,...,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
189121,-1.0,3.0,,-1.0,,,,,,,...,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [4]:
# The output variable is the final column of the combined table.
y = combinedchis.iloc[:, combinedchis.shape[1] - 1]
y

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
189118    0.0
189119    0.0
189120    0.0
189121    0.0
189122    0.0
Name: T2D, Length: 189123, dtype: float64

## Split training and test data

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

## Fill NaN values and remove features

In [6]:
# Columns excluded from modelling: the record id plus every feature that
# already implies a diabetes diagnosis.
diabetes_features = [
    "PUF1Y_ID",
    "AB111",
    "AB23_P1",
    "AB114_P1",
    "AJ82",
    "AB22V2",
    "DIABETES",
    "AB81",
    "AB113",
    "AB112",
    "AB51_P1",
    "AB110_P",
    "AB109",
    "AB24",
    "DIAMED",
    "AB25",
]
# Drop the same columns from both splits so their layouts stay aligned.
X_train = X_train.drop(columns=diabetes_features)
X_test = X_test.drop(columns=diabetes_features)
X_train.shape

(132386, 768)

In [7]:
# Treat a column as continuous when its training data holds at least 21
# distinct values (Series.unique() keeps NaN, so a missing value counts as
# one of them). The id column is skipped explicitly.
numerical_features = [
    column
    for column in X_train
    if column != 'PUF1Y_ID' and X_train[column].unique().size >= 21
]
print(numerical_features)
print(len(numerical_features))

['AC11', 'AD39W', 'AD41W', 'AD42W', 'AE2', 'AE3', 'AE7', 'AE_FRIES', 'AE_FRUIT', 'AE_SODA', 'AE_VEGI', 'DSTRSYR', 'AK10_P', 'AK10A_P', 'AK22_P', 'HEIGHM_P', 'HGHTI_P', 'HGHTM_P', 'WEIGHK_P', 'WGHTK_P', 'WGHTP_P', 'WT18K_P', 'WT18P_P', 'BMI_P', 'POVGWD_P1', 'DSTRS_P1', 'AD38W', 'AB26_P1', 'AC46', 'DISTRESS', 'AK7_P1', 'DIABCK_P1', 'AC47', 'AC59', 'AC87', 'AC52_P1', 'AC31', 'POVLL2_P1V2', 'AC111', 'TIMEAD_P1V2', 'AH128', 'AH5', 'AE5', 'AH132', 'AH136', 'TCURPLAN', 'AH129', 'AJ93', 'AH44B', 'AJ115_P1']
50


In [8]:
# These two columns passed the distinct-value threshold above but are
# categorical, so take them back out of the numerical list.
for misclassified in ('TIMEAD_P1V2', 'AJ115_P1'):
    numerical_features.remove(misclassified)
print(numerical_features)
print(len(numerical_features))

['AC11', 'AD39W', 'AD41W', 'AD42W', 'AE2', 'AE3', 'AE7', 'AE_FRIES', 'AE_FRUIT', 'AE_SODA', 'AE_VEGI', 'DSTRSYR', 'AK10_P', 'AK10A_P', 'AK22_P', 'HEIGHM_P', 'HGHTI_P', 'HGHTM_P', 'WEIGHK_P', 'WGHTK_P', 'WGHTP_P', 'WT18K_P', 'WT18P_P', 'BMI_P', 'POVGWD_P1', 'DSTRS_P1', 'AD38W', 'AB26_P1', 'AC46', 'DISTRESS', 'AK7_P1', 'DIABCK_P1', 'AC47', 'AC59', 'AC87', 'AC52_P1', 'AC31', 'POVLL2_P1V2', 'AC111', 'AH128', 'AH5', 'AE5', 'AH132', 'AH136', 'TCURPLAN', 'AH129', 'AJ93', 'AH44B']
48


In [9]:
# Every remaining column that is not in the numerical list is handled as categorical.
categorical_features = X_train.columns.drop(numerical_features)

In [10]:
# NOTE: disabled draft. Everything in this cell is commented out, so nothing
# here executes. It sketches a ColumnTransformer-based imputation (most
# frequent value for categorical columns, mean for numerical ones); the
# active imputation is performed by the cells that follow.
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer

# all_features = X_train.columns
# # create a pipeline that applies the transformations to the dataset
# pipeline = ColumnTransformer(
#     [
#         ("categorical", SimpleImputer(strategy="most_frequent"), categorical_features),
#         ("numerical", SimpleImputer(strategy="mean"), numerical_features)
#     ],
#     remainder="passthrough"
# )
# # apply transformations to both training and test
# # X_train = pd.DataFrame(pipeline.fit_transform(X_train), columns=pipeline.get_feature_names_out())
# # X_test = pd.DataFrame(pipeline.transform(X_test))

In [11]:
# Replace missing values in each numerical column of the training split with
# that column's own training mean; columns outside the mapping are untouched.
train_fill_values = {
    feature: X_train[feature].mean() for feature in numerical_features
}
X_train = X_train.fillna(value=train_fill_values)

In [12]:
# Fill missing numerical values in the test split with the means taken from
# the TRAINING split, so no statistic computed on test data is used.
test_fill_values = {
    feature: X_train[feature].mean() for feature in numerical_features
}
X_test = X_test.fillna(value=test_fill_values)

In [13]:
from sklearn.impute import SimpleImputer

# Learn each column's most frequent value from the training split only, then
# apply it to both splits. The imputer runs over every column; the numerical
# columns were already mean-filled by the preceding cells, so the remaining
# gaps are expected to be in the categorical ones.
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(X_train)
# Assign through [:] so the existing frames (index and column labels) are kept.
X_train[:] = imputer.transform(X_train)
X_test[:] = imputer.transform(X_test)

## Feature Selection

In [14]:
# chis = pd.concat([X_train, y_train], axis=1)
# corrmat = chis.corr()

In [15]:
# corrmat["T2D"].nlargest(12)

In [16]:
# features = corrmat["T2D"].nlargest(12)[2:].index
# features

In [17]:
# # checks if there are any NaN values in any of the features
# X_train[features].isnull().sum()

In [18]:
# # see if number of features are the same for training and testing 
# X_train = X_train[features]
# X_test = X_test[features]
# print(X_train.shape)
# print(X_test.shape)

## Convert training and test data to CSV files

In [19]:
# Write the prepared splits into the data/ directory, one CSV per split, without the row index.
# Forward slashes are used on purpose (matching the read_csv path at the top
# of the notebook): a backslash path such as 'data\X_train.csv' only resolves
# to the data/ directory on Windows, creates a file literally named
# "data\X_train.csv" elsewhere, and relies on '\X' / '\y' being invalid
# escape sequences, which newer Python versions warn about.
X_train.to_csv('data/X_train.csv', index=False)
X_test.to_csv('data/X_test.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)