<a href="https://colab.research.google.com/github/Laura-Neff/FeatureImputationPipeline/blob/main/FeatureImputationPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer

In [5]:
# Load the pre-processed diabetes table. Missing values are introduced into a
# copy of its feature columns further down so that the imputation step has
# something to fill in before the classifier is fitted.
diabetes = pd.read_csv(filepath_or_buffer='diabetes_processed.csv')

# Preview the first ten rows.
diabetes.head(n=10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,219.028414,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,70.34155,26.6,0.351,31.0,0
2,8.0,183.0,64.0,32.0,270.573172,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1
5,5.0,116.0,74.0,32.0,127.840221,25.6,0.201,30.0,0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0,1
7,10.0,115.0,72.405184,32.0,136.570245,35.3,0.134,29.0,0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0,1
9,8.0,125.0,96.0,32.0,155.57148,32.0,0.232,54.0,1


In [6]:
# Split the table into predictors and target.
# The label ('Outcome') is the value the model will learn to predict from the
# remaining columns; it is kept as a one-column DataFrame.
diabetes_label = diabetes.loc[:, ['Outcome']]
diabetes_features = diabetes.drop(columns=['Outcome'])

diabetes_features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,219.028414,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,70.34155,26.6,0.351,31.0
2,8.0,183.0,64.0,32.0,270.573172,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0


In [7]:
# Build a boolean mask, the same shape as the feature table, that marks the
# cells to be turned into missing values.
#
# Each cell gets an independent integer drawn uniformly from 0..99 (the upper
# bound of 100 is exclusive). A cell is flagged True only when its draw is 0,
# so on average about 1% of the cells are selected; every non-zero draw is False.
#
# The generator is seeded so that Restart & Run All reproduces the same
# missing-value pattern (and therefore the same downstream scores).
rng = np.random.default_rng(42)

mask = rng.integers(0, 100, size=diabetes_features.shape) == 0

In [8]:
# Replace every cell that `mask` flags as True with NaN; cells where the mask
# is False keep their original value.
diabetes_features = diabetes_features.mask(mask)

# Show a random handful of rows to spot-check the injected gaps.
diabetes_features.sample(n=15)



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
337,5.0,115.0,76.0,32.0,140.287466,31.2,0.343,44.0
414,0.0,138.0,60.0,35.0,167.0,34.6,0.534,21.0
512,9.0,91.0,68.0,32.0,87.480813,24.2,0.2,58.0
253,0.0,86.0,68.0,32.0,83.293682,35.8,0.238,25.0
103,1.0,81.0,72.0,18.0,40.0,26.6,0.283,24.0
117,5.0,78.0,48.0,32.0,69.706131,33.7,0.654,25.0
465,0.0,124.0,56.0,13.0,105.0,21.8,0.452,
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
269,2.0,146.0,72.405184,32.0,196.271213,,0.24,28.0
109,0.0,,85.0,25.0,36.0,37.4,0.247,24.0


In [9]:
from sklearn.pipeline import make_pipeline #Import this to create a pipeline
from sklearn.compose import ColumnTransformer #will apply transformations on our values

from sklearn.model_selection import train_test_split #We will fit a decision tree classifier on our dataset after imputing missing values
from sklearn.tree import DecisionTreeClassifier

In [11]:
# Hold out 20% of the rows for evaluation; the model is trained on the other
# 80% to predict the outcome from the features.
# random_state pins the shuffle so the split (and the scores reported below)
# is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    diabetes_features,
    diabetes_label,
    test_size=0.2,
    random_state=42,
)

In [12]:
# ColumnTransformer takes a list of (name, transformer, columns) entries and
# applies each transformer to the columns it names.
# Only one entry is used here: a SimpleImputer that replaces the missing values
# of each selected column with that column's mean.
mean_imputer = SimpleImputer(strategy='mean')

# Positional indices 0 through 7 (inclusive) of the feature table.
imputed_columns = list(range(8))

transformer = ColumnTransformer(
    transformers=[('features', mean_imputer, imputed_columns)]
)

In [13]:
# Fit the imputer on the training features and display the filled-in array.
x_train_imputed = transformer.fit_transform(x_train)
x_train_imputed

array([[1.0000000e+00, 1.1200000e+02, 8.0000000e+01, ..., 3.4800000e+01,
        2.1700000e-01, 2.4000000e+01],
       [9.0000000e+00, 1.2300000e+02, 7.0000000e+01, ..., 3.3100000e+01,
        3.7400000e-01, 4.0000000e+01],
       [3.7338843e+00, 8.5000000e+01, 5.8000000e+01, ..., 2.7800000e+01,
        3.0600000e-01, 2.8000000e+01],
       ...,
       [1.0000000e+00, 1.8900000e+02, 6.0000000e+01, ..., 3.0100000e+01,
        3.9800000e-01, 3.2839404e+01],
       [2.0000000e+00, 8.7000000e+01, 5.8000000e+01, ..., 3.2700000e+01,
        1.6600000e-01, 2.5000000e+01],
       [1.0000000e+00, 8.8000000e+01, 3.0000000e+01, ..., 5.5000000e+01,
        4.9600000e-01, 2.6000000e+01]])

In [14]:
# Chain the two steps into a single estimator:
#   1. `transformer` imputes the missing feature values;
#   2. a depth-limited decision tree is fitted on the imputed data.
# The resulting pipeline object is the classification model used below.
#
# random_state is fixed because the tree permutes the candidate features at
# every split; without it, ties between equally good splits can be broken
# differently from run to run.
clf = make_pipeline(
    transformer,
    DecisionTreeClassifier(max_depth=4, random_state=42),
)

In [15]:
# Train the pipeline: the imputer learns the per-column means for columns 0-7
# from the training rows, then the decision tree is fitted on the imputed data.
clf.fit(x_train, y_train)

# Accuracy on the same rows the model was trained on. This is a baseline to
# compare against the held-out score, not an estimate of real-world performance.
train_accuracy = clf.score(x_train, y_train)
train_accuracy

0.8013029315960912

In [16]:
# Predict the outcome for every held-out test row; the pipeline runs its
# imputation step on x_test before handing the rows to the decision tree.
y_pred = clf.predict(x_test)

In [17]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted outcome matches the true outcome.
# Arguments follow the documented order (y_true, y_pred). Accuracy itself is
# symmetric, but keeping the order right matters if this call is later swapped
# for an asymmetric metric such as precision or recall.
accuracy_score(y_test, y_pred)

# Compare this with the training accuracy above: a small gap suggests the
# depth-limited tree is not badly overfitting.

0.7922077922077922