
# Pre-Processing Exercise
Submitted by Kenneth Alaba

## Loading the Data

In [43]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [44]:
# Load the data: mount Google Drive, then read the insurance CSV into `df`

## Mounting Google Drive (Colab-only; makes files under "My Drive" reachable at /content/drive)
from google.colab import drive
drive.mount('/content/drive')

## Opening the file
## NOTE(review): absolute, Colab-specific path -- adjust it if the notebook is run elsewhere
filename = '/content/drive/My Drive/Coding Dojo/05 Week 5: Intro to Machine Learning/insurance.csv'

## Storing the data in df
df = pd.read_csv(filename)

# display first few rows
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Looking at the columns and their contents, `sex`, `smoker`, and `region` are categorical and, since their values have no inherent order, nominal; these columns have the `object` dtype. The remaining columns (`age`, `bmi`, `children`, and `charges`) are numerical.

In [45]:
# show df information: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


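Since we want to predict the charges from the other patient information, the column `charges` is the target and the remaining columns are the features. Note that this notebook stores the target in `X` and the features in `y`, which is the reverse of the usual scikit-learn convention (`X` for features, `y` for the target). Additionally, since all the nominal data are in the features, only the features need to be one-hot encoded later, not the target.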

## Train Test Split

In [46]:
# Separate the `charges` column from every other column of df.
# NOTE(review): the naming is inverted relative to the usual scikit-learn
# convention -- here X holds the target (`charges`) and y holds the features.
# Later cells (train_test_split, one-hot encoding of y_train / y_test) rely on
# these names exactly as assigned here.
X = df.loc[:, ['charges']]
y = df.drop(columns='charges')

In [47]:
#import sklearn
from sklearn.model_selection import train_test_split

In [48]:
# Partition X and y into matching train and test subsets; the fixed
# random_state makes the split repeatable from run to run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

## OneHotEncode

In [49]:
# import additional libraries
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [50]:
# create categorical/nominal selector: a callable that is later applied to a
# DataFrame to pick out its object-dtype columns
cat_selector = make_column_selector(dtype_include='object')

In [51]:
# Pull the object-dtype (categorical) columns out of y_train
cat_ytrain = y_train.loc[:, cat_selector(y_train)]

# Build the encoder, learn the categories from the training columns, then
# encode those same columns; sparse=False requests a dense array.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# and `get_feature_names` to `get_feature_names_out` -- confirm against the
# installed version before upgrading.
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
cat_ohe = ohe_encoder.fit(cat_ytrain).transform(cat_ytrain)

# Wrap the encoded array in a DataFrame labelled with the generated
# "<column>_<category>" feature names, then preview it
cat_ytrain = pd.DataFrame(
    data=cat_ohe,
    columns=ohe_encoder.get_feature_names(cat_ytrain.columns),
)
cat_ytrain.head()



Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


Checking the dataframe for inconsistencies wasn't included in this notebook, but the encoded column names above show that the categorical values contain no inconsistencies (each category appears under exactly one spelling).

In [52]:
# Encode y_test with the encoder that was fitted on the training columns

# select categorical columns in y_test
cat_ytest = y_test[cat_selector(y_test)]

# OneHotEncode using the already-fitted `ohe_encoder` (fitted on the training
# categorical columns). The encoder is deliberately NOT re-created or re-fitted
# here: fitting on the test split would leak test-set information and could
# produce a different set or order of columns than the training frame.
# Because the encoder was built with handle_unknown='ignore', a category that
# never appeared in training is encoded as all zeros instead of raising.
cat_ohe = ohe_encoder.transform(cat_ytest) # returns an array

# converts the array into dataframe, labelled with the encoder's feature names
cat_ytest = pd.DataFrame(cat_ohe, columns=ohe_encoder.get_feature_names(cat_ytest.columns))
cat_ytest.head()



Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
