In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [120]:
# Load the combined CHIS dataset from a path relative to the notebook's
# working directory.
combinedchis = pd.read_csv("data/combined_chis.csv")

## Separate input and output variables

In [121]:
# Input features: every column except the last one (the last column is used
# as the target in a later cell).
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks
# like a row index that was saved into the CSV — confirm whether it should be
# kept as a feature or dropped.
X = combinedchis.iloc[:,:-1]
X

Unnamed: 0,AA5C,AB1,AB100,AB112,AB113,AB115,AB117,AB118,AB119,AB127,...,AH141,SREDUC,AJ153V2_13,AJ154BV2_8,AJ154BV2_9,INS64_S,AK20_P1,AJ174_8,AJ174_9,AJ194_18
0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,...,,,,,,,,,,
1,-1.0,5.0,-1.0,-1.0,-1.0,2.0,2.0,2.0,-1.0,1.0,...,,,,,,,,,,
2,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,...,,,,,,,,,,
3,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,,,,,,,,,,
4,-1.0,2.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189118,-1.0,2.0,,-1.0,,,,,,,...,-1.0,3.0,-1.0,-1.0,-1.0,5.0,4.0,-1.0,-1.0,-1.0
189119,-1.0,1.0,,-1.0,,,,,,,...,-1.0,4.0,-1.0,2.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0
189120,-1.0,2.0,,-1.0,,,,,,,...,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
189121,-1.0,3.0,,-1.0,,,,,,,...,-1.0,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [122]:
# Target: the last column of the combined frame (displayed below as "T2D").
y = combinedchis.iloc[:, -1]
y

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
189118    0.0
189119    0.0
189120    0.0
189121    0.0
189122    0.0
Name: T2D, Length: 189123, dtype: float64

## Split training and test data

In [123]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

## Fill NaN values and remove features

In [124]:
# Drop the id column and the features that already imply a diabetes
# diagnosis, so the model cannot simply read the answer from its inputs.
diabetes_features = [
    "PUF1Y_ID", "AB111", "AB23_P1", "AB114_P1", "AJ82", "AB22V2", "DIABETES", "AB81",
    "AB113", "AB112", "AB51_P1", "AB110_P", "AB109", "AB24", "DIAMED", "AB25",
]
# NOTE(review): presumably variants of weight/height/BMI columns that are
# kept elsewhere — confirm against the CHIS data dictionary.
similar_features = ["WEIGHK_P", "WGHTK_P", "HEIGHM_P", "HGHTM_P", "WHOBMI"]
remove_features = diabetes_features + similar_features

# The same columns are removed from both splits so their schemas stay aligned.
X_train = X_train.drop(columns=remove_features)
X_test = X_test.drop(columns=remove_features)
X_train.shape

(132386, 763)

In [125]:
# Treat a column as continuous when the training split holds at least 21
# distinct values in it (pandas' unique() counts NaN as one of those values).
numerical_features = [
    name
    for name in X_train
    # ignores the id feature
    if name != 'PUF1Y_ID' and X_train[name].unique().size >= 21
]
print(numerical_features)
print(len(numerical_features))

['AC11', 'AD39W', 'AD41W', 'AD42W', 'AE2', 'AE3', 'AE7', 'AE_FRIES', 'AE_FRUIT', 'AE_SODA', 'AE_VEGI', 'DSTRSYR', 'AK10_P', 'AK10A_P', 'AK22_P', 'HGHTI_P', 'WGHTP_P', 'WT18K_P', 'WT18P_P', 'BMI_P', 'POVGWD_P1', 'DSTRS_P1', 'AD38W', 'AB26_P1', 'AC46', 'DISTRESS', 'AK7_P1', 'DIABCK_P1', 'AC47', 'AC59', 'AC87', 'AC52_P1', 'AC31', 'POVLL2_P1V2', 'AC111', 'TIMEAD_P1V2', 'AH128', 'AH5', 'AE5', 'AH132', 'AH136', 'TCURPLAN', 'AH129', 'AJ93', 'AH44B', 'AJ115_P1']
46


In [126]:
# These two columns passed the distinct-value threshold but are categorical,
# so take them back out of the numerical list.
for categorical_name in ('TIMEAD_P1V2', 'AJ115_P1'):
    numerical_features.remove(categorical_name)
print(numerical_features)
print(len(numerical_features))

['AC11', 'AD39W', 'AD41W', 'AD42W', 'AE2', 'AE3', 'AE7', 'AE_FRIES', 'AE_FRUIT', 'AE_SODA', 'AE_VEGI', 'DSTRSYR', 'AK10_P', 'AK10A_P', 'AK22_P', 'HGHTI_P', 'WGHTP_P', 'WT18K_P', 'WT18P_P', 'BMI_P', 'POVGWD_P1', 'DSTRS_P1', 'AD38W', 'AB26_P1', 'AC46', 'DISTRESS', 'AK7_P1', 'DIABCK_P1', 'AC47', 'AC59', 'AC87', 'AC52_P1', 'AC31', 'POVLL2_P1V2', 'AC111', 'AH128', 'AH5', 'AE5', 'AH132', 'AH136', 'TCURPLAN', 'AH129', 'AJ93', 'AH44B']
44


In [127]:
# Every remaining column that is not in the numerical list is treated as
# categorical; only the column labels are kept.
categorical_features = X_train.drop(columns=numerical_features).columns

In [128]:
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer

# all_features = X_train.columns
# # create a pipeline that applies the transformations to the dataset
# pipeline = ColumnTransformer(
#     [
#         ("categorical", SimpleImputer(strategy="most_frequent"), categorical_features),
#         ("numerical", SimpleImputer(strategy="mean"), numerical_features)
#     ],
#     remainder="passthrough"
# )
# # apply transformations to both training and test
# # X_train = pd.DataFrame(pipeline.fit_transform(X_train), columns=pipeline.get_feature_names_out())
# # X_test = pd.DataFrame(pipeline.transform(X_test))

In [129]:
# Mean-impute the numerical columns of the training split, one column at a
# time, using that column's own training mean.
for col in numerical_features:
    train_mean = X_train[col].mean()
    X_train[col] = X_train[col].fillna(train_mean)

In [130]:
# Mean-impute the numerical columns of the test split using the TRAINING
# means, so no statistic is learned from the held-out rows. The training
# column has already been mean-imputed by this point, which leaves its mean
# unchanged (up to floating-point rounding).
for col in numerical_features:
    train_mean = X_train[col].mean()
    X_test[col] = X_test[col].fillna(train_mean)

In [131]:
# Fill the remaining gaps (the categorical features) with each column's most
# frequent training value. The imputer is applied to every column, but the
# numerical ones were already filled in the cells above.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent")
# Learn the per-column modes from the training split only ...
imputer.fit(X_train)
# ... then write the imputed values back in place, which keeps the existing
# index and column labels of both frames.
X_train[:] = imputer.transform(X_train)
X_test[:] = imputer.transform(X_test)

## Feature Selection

In [132]:
# chis = pd.concat([X_train, y_train], axis=1)
# corrmat = chis.corr()

In [133]:
# corrmat["T2D"].nlargest(12)

In [134]:
# features = corrmat["T2D"].nlargest(12)[2:].index
# features

In [135]:
# # checks if there are any NaN values in any of the features
# X_train[features].isnull().sum()

In [136]:
# # see if number of features are the same for training and testing 
# X_train = X_train[features]
# X_test = X_test[features]
# print(X_train.shape)
# print(X_test.shape)

In [137]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Fit a seeded random forest on the training split only and let
# SelectFromModel decide which columns to keep.
sel = SelectFromModel(RandomForestClassifier(random_state=0))
sel.fit(X_train, y_train)

# get_support() is a boolean mask over the columns; use it to recover the
# names of the retained features.
support_mask = sel.get_support()
selected_feat = X_train.columns[support_mask]
print(len(selected_feat))
print(selected_feat)

52
Index(['AB1', 'AB118', 'AB29V2', 'AB30', 'AB34', 'AB41', 'AB63', 'AB99', 'AE2',
       'AE3', 'AE7', 'AJ80', 'AK1', 'UNINSANY', 'ACMDNUM', 'AE_FRUIT',
       'AE_VEGI', 'AKWKLNG', 'ASTCUR', 'MARIT_45', 'DSTRSYR', 'INST_12',
       'RBMI', 'OVRWT', 'AK10_P', 'HGHTI_P', 'WGHTP_P', 'BMI_P', 'INS64_P',
       'INSTYPE', 'POVGWD_P1', 'DSTRS_P1', 'SRAGE_P1', 'AHEDC_P1', 'WRKST_P1',
       'AK2_P1', 'AB27_P1', 'AB28_P1', 'YEAR', 'AB26_P1', 'AC46', 'DISTRESS',
       'AK7_P1', 'DIABCK_P1', 'AC47', 'PREDIAB', 'POVLL2_P1V2', 'AK22_P1',
       'AH5', 'AH132', 'TCURPLAN', 'AB27_P'],
      dtype='object')


In [138]:
# model = RandomForestClassifier()
# model.fit(X_train, y_train)
# #plot graph of feature importances for better visualization
# feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
# feat_importances.nlargest(10).plot(kind='barh')
# plt.show()

In [141]:
# Restrict both splits to the selected columns so they share one schema.
X_train = X_train.loc[:, selected_feat]
X_test = X_test.loc[:, selected_feat]
X_train.shape

(132386, 52)

## Convert training and test data to CSV files

In [140]:
# Persist the prepared splits next to the input data, without the row index.
# Forward slashes are used (matching the read_csv path at the top of the
# notebook): the previous 'data\X_train.csv' style relied on the invalid
# escape sequences '\X' and '\y', and outside Windows it wrote a file literally
# named 'data\X_train.csv' instead of placing it inside the data/ directory.
X_train.to_csv("data/X_train.csv", index=False)
X_test.to_csv("data/X_test.csv", index=False)
y_train.to_csv("data/y_train.csv", index=False)
y_test.to_csv("data/y_test.csv", index=False)