# PCA Excercise (Core)

### Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

pd.set_option('display.max_columns',200)
pd.set_option("display.max_info_rows", 800)
pd.set_option('display.max_info_columns',800)

from sklearn import set_config
set_config(transform_output='pandas')

import warnings
warnings.filterwarnings("ignore")

### Functions

In [2]:
def classification_metrics(y_true, y_pred, label='',
                           output_dict=False, figsize=(8,4),
                           normalize='true', cmap='Blues',
                           colorbar=False):
    # Get the classification report
    report = classification_report(y_true, y_pred)
    ## Print header and report
    header = "-"*70
    print(header, f" Classification Metrics: {label}", header, sep='\n')
    print(report)
    ## CONFUSION MATRICES SUBPLOTS
    fig, axes = plt.subplots(ncols=2, figsize=figsize)
    # create a confusion matrix  of raw counts
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred,
                normalize=None, cmap='gist_gray', colorbar=colorbar,
                ax = axes[0],);
    axes[0].set_title("Raw Counts")
    # create a confusion matrix with the test data
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred,
                normalize=normalize, cmap=cmap, colorbar=colorbar,
                ax = axes[1]);
    axes[1].set_title("Normalized Confusion Matrix")
    # Adjust layout and show figure
    fig.tight_layout()
    plt.show()
    # Return dictionary of classification_report
    if output_dict==True:
        report_dict = classification_report(y_true, y_pred, output_dict=True)
        return report_dict

In [3]:
def evaluate_classification(model, X_train, y_train, X_test, y_test,
                         figsize=(6,4), normalize='true', output_dict = False,
                            cmap_train='Blues', cmap_test="Reds",colorbar=False):
    # Get predictions for training data
    y_train_pred = model.predict(X_train)
    # Call the helper function to obtain regression metrics for training data
    results_train = classification_metrics(y_train, y_train_pred, #verbose = verbose,
                                     output_dict=True, figsize=figsize,
                                         colorbar=colorbar, cmap=cmap_train,
                                     label='Training Data')
    print()
    # Get predictions for test data
    y_test_pred = model.predict(X_test)
    # Call the helper function to obtain regression metrics for test data
    results_test = classification_metrics(y_test, y_test_pred, #verbose = verbose,
                                  output_dict=True,figsize=figsize,
                                         colorbar=colorbar, cmap=cmap_test,
                                    label='Test Data' )
    if output_dict == True:
        # Store results in a dataframe if ouput_frame is True
        results_dict = {'train':results_train,
                    'test': results_test}
        return results_dict

### Data Prep

In [4]:
df = pd.read_csv('data/NHANES_Diet_Diabetes_Risk.xls.csv')
df = df.set_index("SEQN")
df.head()

Unnamed: 0_level_0,WTDRD1,WTDR2D,DR1DRSTZ,DR1EXMER,DRABF,DRDINT,DR1DBIH,DR1DAY,DR1LANG,DR1MNRSP,DR1HELPD,DBQ095Z,DBD100,DRQSPREP,DR1STY,DR1SKY,DRQSDIET,DRQSDT1,DRQSDT2,DRQSDT3,DRQSDT4,DRQSDT5,DRQSDT6,DRQSDT7,DRQSDT8,DRQSDT9,DRQSDT10,DRQSDT11,DRQSDT12,DRQSDT91,DR1TNUMF,DR1TKCAL,DR1TPROT,DR1TCARB,DR1TSUGR,DR1TFIBE,DR1TTFAT,DR1TSFAT,DR1TMFAT,DR1TPFAT,DR1TCHOL,DR1TATOC,DR1TATOA,DR1TRET,DR1TVARA,DR1TACAR,DR1TBCAR,DR1TCRYP,DR1TLYCO,DR1TLZ,DR1TVB1,DR1TVB2,DR1TNIAC,DR1TVB6,DR1TFOLA,DR1TFA,DR1TFF,DR1TFDFE,DR1TCHL,DR1TVB12,DR1TB12A,DR1TVC,DR1TVD,DR1TVK,DR1TCALC,DR1TPHOS,DR1TMAGN,DR1TIRON,DR1TZINC,DR1TCOPP,DR1TSODI,DR1TPOTA,DR1TSELE,DR1TCAFF,DR1TTHEO,DR1TALCO,DR1TMOIS,DR1TS040,DR1TS060,DR1TS080,DR1TS100,DR1TS120,DR1TS140,DR1TS160,DR1TS180,DR1TM161,DR1TM181,DR1TM201,DR1TM221,DR1TP182,DR1TP183,DR1TP184,DR1TP204,DR1TP205,DR1TP225,DR1TP226,DR1.300,DR1.320Z,DR1.330Z,DR1BWATZ,DR1TWS,DRD340,DRD350A,DRD350AQ,DRD350B,DRD350BQ,DRD350C,DRD350CQ,DRD350D,DRD350DQ,DRD350E,DRD350EQ,DRD350F,DRD350FQ,DRD350G,DRD350GQ,DRD350H,DRD350HQ,DRD350I,DRD350IQ,DRD350J,DRD350JQ,DRD350K,DRD360,DRD370A,DRD370AQ,DRD370B,DRD370BQ,DRD370C,DRD370CQ,DRD370D,DRD370DQ,DRD370E,DRD370EQ,DRD370F,DRD370FQ,DRD370G,DRD370GQ,DRD370H,DRD370HQ,DRD370I,DRD370IQ,DRD370J,DRD370JQ,DRD370K,DRD370KQ,DRD370L,DRD370LQ,DRD370M,DRD370MQ,DRD370N,DRD370NQ,DRD370O,DRD370OQ,DRD370P,DRD370PQ,DRD370Q,DRD370QQ,DRD370R,DRD370RQ,DRD370S,DRD370SQ,DRD370T,DRD370TQ,DRD370U,DRD370UQ,DRD370V,Diabetes_Risk
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1
73557,16888.32786,12930.89065,1,49.0,2.0,2.0,6.0,2.0,1.0,1.0,13.0,3.0,2.0,4.0,2.0,,2.0,,,,,,,,,,,,,,11.0,1574.0,43.63,239.59,176.47,10.8,52.81,17.819,18.493,8.829,209.0,3.92,0.0,70.0,110.0,60.0,304.0,278.0,1398.0,430.0,1.057,1.216,11.804,0.951,285.0,66.0,219.0,330.0,216.3,2.79,0.0,241.4,3.3,29.2,949.0,756.0,206.0,8.41,8.85,1.072,1323.0,2228.0,62.5,203.0,36.0,0.0,2701.79,0.219,0.099,0.103,0.153,0.346,1.221,9.694,5.408,1.173,16.796,0.121,0.001,7.932,0.769,0.0,0.094,0.001,0.0,0.01,2.0,960.0,960.0,0.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2
73558,17932.14387,12684.14887,1,59.0,2.0,2.0,4.0,1.0,1.0,1.0,13.0,1.0,2.0,3.0,1.0,1.0,2.0,,,,,,,,,,,,,,8.0,5062.0,338.13,423.78,44.99,16.7,124.29,53.408,35.481,20.505,2584.0,20.94,0.0,1477.0,1488.0,0.0,145.0,12.0,0.0,899.0,4.178,5.168,65.396,4.415,1243.0,601.0,641.0,1662.0,2149.0,21.45,0.0,2.3,15.2,23.4,3193.0,6427.0,903.0,26.88,30.36,4.13,9726.0,4930.0,715.8,240.0,0.0,119.0,6779.99,1.872,1.16,0.802,1.286,1.297,6.516,27.952,12.107,2.208,32.429,0.086,0.0,15.483,1.774,0.007,0.728,1.003,0.104,1.139,2.0,360.0,360.0,0.0,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,1.0,1.0,1.0,1.0,2.0,,2.0,,2.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,2
73559,59641.81293,39394.23671,1,49.0,2.0,2.0,18.0,6.0,1.0,1.0,13.0,1.0,1.0,2.0,2.0,,1.0,,,,,,,7.0,,,,,,,27.0,1743.0,64.61,224.39,102.9,9.9,65.97,25.263,20.902,12.953,88.0,8.87,0.0,616.0,655.0,21.0,449.0,35.0,515.0,300.0,1.648,1.747,18.342,1.906,423.0,290.0,133.0,625.0,161.2,3.78,1.42,195.1,4.0,40.4,877.0,1198.0,210.0,17.57,8.98,0.949,2943.0,1694.0,98.3,45.0,71.0,0.0,3766.7,0.358,0.128,0.371,0.455,3.008,2.067,10.691,7.497,0.531,19.91,0.164,0.005,11.705,1.092,0.0,0.042,0.001,0.006,0.002,2.0,1254.0,0.0,1254.0,1.0,1.0,2.0,,2.0,,2.0,,2.0,,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2
73561,59052.35703,39004.89299,1,63.0,2.0,2.0,18.0,1.0,1.0,1.0,13.0,4.0,,4.0,2.0,,1.0,,,,,,,,,,,,,91.0,7.0,1421.0,55.24,178.2,87.78,12.3,55.36,4.479,26.216,1.263,41.0,53.17,0.0,738.0,863.0,0.0,1476.0,0.0,0.0,0.0,1.484,1.72,15.857,2.757,390.0,389.0,1.0,667.0,218.1,8.3,8.29,236.5,23.5,126.0,1410.0,1182.0,396.0,17.72,17.73,1.984,797.0,1445.0,68.7,24.0,0.0,0.0,1029.69,0.017,0.005,0.003,0.008,0.009,0.069,1.506,1.649,0.038,22.673,0.268,0.0,1.048,0.062,0.0,0.0,0.0,0.0,0.0,2.0,240.0,0.0,240.0,4.0,1.0,2.0,,2.0,,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,1.0,2.0,,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,0
73562,49890.82866,0.0,1,49.0,2.0,1.0,11.0,3.0,1.0,1.0,13.0,1.0,3.0,3.0,1.0,1.0,2.0,,,,,,,,,,,,,,17.0,1785.0,55.11,189.59,81.75,22.6,93.92,22.155,40.013,23.55,534.0,12.3,0.0,272.0,355.0,172.0,776.0,300.0,2339.0,2101.0,1.227,1.92,17.119,1.892,458.0,76.0,384.0,511.0,465.7,1.68,0.0,224.4,6.0,103.9,1156.0,1021.0,243.0,9.52,5.96,1.02,2651.0,3002.0,84.0,144.0,0.0,0.0,2412.08,0.241,0.078,0.091,0.18,0.151,1.031,15.065,4.633,2.129,37.405,0.248,0.006,21.036,2.012,0.0,0.262,0.005,0.013,0.063,1.0,1014.0,0.0,1014.0,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,1.0,1.0,2.0,,2.0,,2.0,1.0,1.0,1.0,1.0,2.0,2.0,,2.0,,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6643 entries, 73557 to 83729
Data columns (total 168 columns):
 #    Column         Dtype  
---   ------         -----  
 0    WTDRD1         float64
 1    WTDR2D         float64
 2    DR1DRSTZ       int64  
 3    DR1EXMER       float64
 4    DRABF          float64
 5    DRDINT         float64
 6    DR1DBIH        float64
 7    DR1DAY         float64
 8    DR1LANG        float64
 9    DR1MNRSP       float64
 10   DR1HELPD       float64
 11   DBQ095Z        float64
 12   DBD100         float64
 13   DRQSPREP       float64
 14   DR1STY         float64
 15   DR1SKY         float64
 16   DRQSDIET       float64
 17   DRQSDT1        float64
 18   DRQSDT2        float64
 19   DRQSDT3        float64
 20   DRQSDT4        float64
 21   DRQSDT5        float64
 22   DRQSDT6        float64
 23   DRQSDT7        float64
 24   DRQSDT8        float64
 25   DRQSDT9        float64
 26   DRQSDT10       float64
 27   DRQSDT11       float64
 28   DRQSDT12   

In [6]:
df.dtypes.value_counts()

float64    166
int64        2
dtype: int64

In [7]:
df.isna().sum().sum()

432597

In [8]:
# Define Target, X, Y
target = 'Diabetes_Risk'
y = df[target].copy()
X = df.drop(columns=target).copy()

In [9]:
# Train/Test Split
X_train,X_test,y_train,y_test = train_test_split(X,y, random_state=321)
X_train.head()

Unnamed: 0_level_0,WTDRD1,WTDR2D,DR1DRSTZ,DR1EXMER,DRABF,DRDINT,DR1DBIH,DR1DAY,DR1LANG,DR1MNRSP,DR1HELPD,DBQ095Z,DBD100,DRQSPREP,DR1STY,DR1SKY,DRQSDIET,DRQSDT1,DRQSDT2,DRQSDT3,DRQSDT4,DRQSDT5,DRQSDT6,DRQSDT7,DRQSDT8,DRQSDT9,DRQSDT10,DRQSDT11,DRQSDT12,DRQSDT91,DR1TNUMF,DR1TKCAL,DR1TPROT,DR1TCARB,DR1TSUGR,DR1TFIBE,DR1TTFAT,DR1TSFAT,DR1TMFAT,DR1TPFAT,DR1TCHOL,DR1TATOC,DR1TATOA,DR1TRET,DR1TVARA,DR1TACAR,DR1TBCAR,DR1TCRYP,DR1TLYCO,DR1TLZ,DR1TVB1,DR1TVB2,DR1TNIAC,DR1TVB6,DR1TFOLA,DR1TFA,DR1TFF,DR1TFDFE,DR1TCHL,DR1TVB12,DR1TB12A,DR1TVC,DR1TVD,DR1TVK,DR1TCALC,DR1TPHOS,DR1TMAGN,DR1TIRON,DR1TZINC,DR1TCOPP,DR1TSODI,DR1TPOTA,DR1TSELE,DR1TCAFF,DR1TTHEO,DR1TALCO,DR1TMOIS,DR1TS040,DR1TS060,DR1TS080,DR1TS100,DR1TS120,DR1TS140,DR1TS160,DR1TS180,DR1TM161,DR1TM181,DR1TM201,DR1TM221,DR1TP182,DR1TP183,DR1TP184,DR1TP204,DR1TP205,DR1TP225,DR1TP226,DR1.300,DR1.320Z,DR1.330Z,DR1BWATZ,DR1TWS,DRD340,DRD350A,DRD350AQ,DRD350B,DRD350BQ,DRD350C,DRD350CQ,DRD350D,DRD350DQ,DRD350E,DRD350EQ,DRD350F,DRD350FQ,DRD350G,DRD350GQ,DRD350H,DRD350HQ,DRD350I,DRD350IQ,DRD350J,DRD350JQ,DRD350K,DRD360,DRD370A,DRD370AQ,DRD370B,DRD370BQ,DRD370C,DRD370CQ,DRD370D,DRD370DQ,DRD370E,DRD370EQ,DRD370F,DRD370FQ,DRD370G,DRD370GQ,DRD370H,DRD370HQ,DRD370I,DRD370IQ,DRD370J,DRD370JQ,DRD370K,DRD370KQ,DRD370L,DRD370LQ,DRD370M,DRD370MQ,DRD370N,DRD370NQ,DRD370O,DRD370OQ,DRD370P,DRD370PQ,DRD370Q,DRD370QQ,DRD370R,DRD370RQ,DRD370S,DRD370SQ,DRD370T,DRD370TQ,DRD370U,DRD370UQ,DRD370V
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1
74781,34584.68759,0.0,1,49.0,2.0,1.0,20.0,5.0,1.0,1.0,13.0,1.0,1.0,4.0,2.0,,1.0,,,,,,,7.0,,,,,,,20.0,1337.0,35.82,159.57,82.6,13.4,65.13,14.519,22.719,23.924,250.0,6.89,0.0,159.0,706.0,2030.0,5500.0,76.0,3012.0,1173.0,0.803,1.083,11.731,0.937,216.0,67.0,149.0,263.0,249.2,1.57,0.0,81.5,2.8,128.7,371.0,759.0,165.0,7.28,4.49,0.64,2160.0,1623.0,50.6,75.0,8.0,0.0,1370.86,0.124,0.064,0.042,0.083,0.094,0.428,8.899,4.285,0.362,22.081,0.212,0.003,21.184,2.452,0.0,0.148,0.005,0.009,0.037,3.0,507.0,0.0,507.0,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,1.0,1.0,2.0,,2.0,,2.0,1.0,2.0,,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0
76977,15205.56434,24452.37379,1,2.0,2.0,2.0,9.0,5.0,1.0,1.0,13.0,1.0,3.0,4.0,1.0,1.0,2.0,,,,,,,,,,,,,,7.0,1561.0,75.97,140.12,46.73,7.7,67.21,21.706,26.472,13.569,251.0,6.98,0.0,180.0,188.0,5.0,73.0,10.0,157.0,318.0,0.858,2.006,24.328,1.302,225.0,67.0,159.0,272.0,255.6,2.34,0.0,4.5,4.9,17.4,799.0,1269.0,177.0,6.63,5.43,0.591,3318.0,1691.0,118.7,3.0,78.0,14.0,904.93,0.48,0.311,0.209,0.446,0.782,1.989,11.369,4.806,1.374,24.223,0.456,0.051,11.68,1.572,0.002,0.148,0.008,0.015,0.009,3.0,0.0,0.0,0.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,1.0,5.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0
79530,4940.540826,13796.38144,1,49.0,2.0,2.0,23.0,6.0,1.0,1.0,13.0,1.0,2.0,3.0,2.0,,2.0,,,,,,,,,,,,,,14.0,1748.0,47.34,256.44,131.76,9.3,60.71,24.015,23.084,10.053,143.0,4.93,1.27,640.0,649.0,0.0,110.0,5.0,0.0,255.0,1.317,1.913,18.543,1.824,271.0,197.0,73.0,408.0,157.5,3.65,1.69,37.7,3.7,41.1,760.0,1231.0,206.0,13.08,6.42,0.783,5654.0,1593.0,68.9,9.0,191.0,0.0,1202.76,0.422,0.263,0.262,0.458,1.875,2.031,12.345,5.729,1.34,21.233,0.259,0.025,8.849,1.089,0.004,0.042,0.011,0.005,0.008,2.0,0.0,0.0,0.0,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,1.0,2.0,2.0,,2.0,,2.0,1.0,1.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0
74171,5061.660951,3885.556268,1,49.0,2.0,2.0,11.0,7.0,1.0,1.0,13.0,2.0,1.0,2.0,2.0,,2.0,,,,,,,,,,,,,,7.0,704.0,45.36,95.15,18.92,6.7,14.48,5.736,4.487,2.02,88.0,0.83,0.0,53.0,59.0,0.0,61.0,5.0,0.0,106.0,0.645,0.471,13.083,1.008,166.0,92.0,74.0,231.0,117.4,1.91,0.0,7.1,1.7,3.5,459.0,660.0,130.0,6.09,7.54,0.661,1700.0,739.0,73.9,0.0,0.0,0.0,2433.96,0.099,0.08,0.054,0.127,0.143,0.595,2.987,1.5,0.265,4.074,0.053,0.0,1.731,0.212,0.001,0.041,0.004,0.011,0.002,3.0,2160.0,2160.0,0.0,2.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,1.0,2.0,2.0,,2.0,,2.0,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,1.0,1.0,2.0,,2.0,,2.0,,2.0
83046,16431.83563,12058.65418,1,59.0,2.0,2.0,9.0,6.0,1.0,1.0,13.0,4.0,,2.0,2.0,,2.0,,,,,,,,,,,,,,13.0,902.0,40.82,100.24,49.3,8.5,38.63,14.053,10.871,10.534,115.0,3.45,0.0,379.0,726.0,131.0,4100.0,2.0,2618.0,336.0,1.265,1.416,14.508,1.447,198.0,97.0,100.0,266.0,160.5,3.41,1.4,69.7,4.2,186.6,627.0,707.0,156.0,10.81,4.63,0.434,2166.0,1658.0,56.3,0.0,0.0,0.0,1326.84,0.568,0.324,0.261,0.494,0.461,1.59,7.009,3.063,0.392,10.268,0.132,0.007,9.0,1.446,0.0,0.048,0.002,0.002,0.0,3.0,720.0,0.0,720.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,1.0,2.0,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0,,2.0


In [10]:
# Pipeline
impute_median = SimpleImputer(strategy='median')
scaler = StandardScaler()
pipe = make_pipeline(impute_median, scaler)

In [11]:
# Fit
pipe.fit(X_train)

In [13]:
# Transform
X_train_tf = pipe.transform(X_train)
X_train_tf.isna().sum().sum()

0

### Model - No PCA

In [14]:
# Fit KNN
knn = KNeighborsClassifier()
knn.fit(X_train_tf, y_train)

In [17]:
%%time
preds = knn.predict(X_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- DRQSDT5


In [16]:
evaluate_classification(knn, X_train_tf,y_train, X_test, y_test)

NameError: name 'X_test_tf' is not defined

### PCA Transform / Model

In [None]:
# fitting data to preprocessing pipeline
pipe.fit(df)
scaled_df = pipe.transform(df)
scaled_df.isna().sum().sum()

In [None]:
# Define Target, X, Y
target = 'Diabetes_Risk'
y = scaled_df[target].copy()
X = scaled_df.drop(columns=target).copy()

In [None]:
# Train/Test Split
X_train_pca,X_test_pca,y_train_pca,y_test_pca = train_test_split(X,y, random_state=321)
X_train_pca.head()

In [None]:
pca = PCA(n_components=.95)
X_train_pca = pca.fit_transform(X_train_tf)
X_test_pca = pca.transform(X_test_tf)

In [None]:
pca.n_components_

In [None]:
# Fit KNN
knn_pca = KNeighborsClassifier()
knn_pca.fit(X_train_pca,y_train)

In [None]:
%%time
preds_pca = knn_pca.predict(X_test_pca)

In [None]:
evaluate_classification(knn_pca, X_train_pca,y_train, X_test_pca, y_test)

### Comparing Models

##### How many features were used in the first model, and how many PCs were used in the PCA model?
- 168 in the first model

- 99 in the PCA model

##### Which model performed the best on the test set?
- Both Models performed similarly

##### Which model was the fastest at making predictions?
- In this case they performed in very similar time, with the non-PCA model slightly faster