In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

#Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt


#Modeling

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder



from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

#Evaluation
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


from scipy import stats

# Set the default figure size and font size for all plots
plt.rc("figure", figsize = (16,16))
plt.rc("font", size=14)

# Allow all columns to be displayed when a dataframe is printed
pd.set_option('display.max_columns', None)

# Project purpose

The goal of this project is to use features captured from survey data to predict how likely individuals are to receive their H1N1 and seasonal flu vaccines.

Specifically, I will be predicting two probabilities:
- h1n1_vaccine - Whether respondent received H1N1 flu vaccine.
- seasonal_vaccine - Whether respondent received seasonal flu vaccine.
> Both are binary variables: 0 = No; 1 = Yes. Some respondents didn't get either vaccine, others got only one, and some got both. This is formulated as a multilabel (and not multiclass) problem.

# Project Plan

1. Acquisition
  * Download data into local drive
2. Prepare
  * Read in data csv using pandas
  * Check data types and null values
  * Fill in nulls
  * Encode appropriately
  * Scale if needed
3. Explore
4. Modeling
5. Conclusions

# Acquire

- I downloaded the data from https://www.drivendata.org/competitions/66/flu-shot-learning/data/ into the same folder where this analysis is being conducted
- Now I'll turn both the feature csv and the target variable csv into pandas dataframes for analysis and modeling

In [None]:
# Read in the training set feature csv using pandas
df = pd.read_csv('Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv', index_col=0)

Take a look at the data

In [None]:
df

In [None]:
target_variable_df = pd.read_csv('Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv', index_col=0)

In [None]:
target_variable_df

- The target variable csv will need to be concatenated onto the feature csv

In [None]:
df = pd.concat([df, target_variable_df], axis = 1)

In [None]:
# df = df.drop(columns = 'respondent_id')

Verify it was added correctly

In [None]:
df

Success!

### The initial dataframe has:
- 26,707 rows where each row is one person
- 35 columns where each column is a feature this person has indicated on their survey


Let's get some more info about the dataframe. We'll use .info() to see how many non-null values we have in each column and what the data types are

In [None]:
df.info()

### Initial thoughts for cleaning/prep
1. Several columns will need to be one hot encoded or label encoded
2. NaNs in several columns need to be handled
3. May need to drop certain columns
4. All the datatypes seem to be appropriate

# Prepare

I'll start by splitting the data into train and test to avoid exploring the test data which is meant to stay unseen. Then I'll perform the same clean and prep changes to each dataframe sequentially.
> There's enough data here to split this further into train and validate sets which will help to prevent overfitting by allowing signs of overfitting to be caught before applying the model to the test data. After the appropriate cleaning and preparation are complete, I will create a validate set.

Now I'll check to see which columns have nans, how many there are, and explore the best ways to fill those nans

In [None]:
# Summarize how many values (count and percentage) are missing in each column
def percent_nans(df):
    '''
    Takes in a dataframe and reports the missing values in each column.
    Returns a dataframe with one row per input column and the columns
    'column', 'n_nans' and 'percentage_nans', sorted by 'percentage_nans'
    in ascending order. Columns with equal percentages keep their original
    relative order.
    '''
    # Count nulls for every column in a single vectorized pass
    null_counts = df.isnull().sum()
    # Share of rows that are null, expressed as a percentage
    null_percentages = null_counts / len(df) * 100

    missing_data_df = pd.DataFrame({
        'column': list(df.columns),
        'n_nans': null_counts.to_numpy(),
        'percentage_nans': null_percentages.to_numpy(),
    })
    # A stable sort keeps tied columns in the order they appear in df
    return missing_data_df.sort_values(by='percentage_nans', kind='stable')

In [None]:
percent_nans(df)

- It looks like only three columns have a significant number of values missing:
  - __health_insurance__
  - __employment_industry__
  - __employment_occupation__
- Look at the most common values in each feature and decide if there is a reasonable way to fill the null values

In [None]:
#What is the most common health insurance?
df.health_insurance.value_counts()

In [None]:
#What is the most common employment_industry?
df.employment_industry.value_counts()

In [None]:
#What is the most common employment occupation?
df.employment_occupation.value_counts()

It is evident that __employment_industry__ and __employment_occupation__ do not have a single most overwhelming industry or occupation that could be reasonably used to fill in the remaining half empty values. I will drop these columns and not include them in the analysis

On the other hand, __health_insurance__ has 12,697 observations recorded as having insurance out of the total 14,433 observations with values recorded. It may be reasonable to simply fill in the remaining 45% of values that are missing with the label for having insurance.

At this point I need to split the data into train and test. I think it would be better for exploring and modeling purposes to have a train and test dataframe for each target variable. After the predictions are made, I will concat the dataframes together

In [None]:
df.seasonal_vaccine.value_counts()

In [None]:
#Create two dataframes each with only one of the target variables
h1n1_df = df.drop(columns = 'seasonal_vaccine')

seasonal_df = df.drop(columns = 'h1n1_vaccine')

In [None]:
# Split h1n1_df into 80% train / 20% test with sklearn's train_test_split;
# random_state fixes the seed so the split is reproducible.
# Stratifying on h1n1_vaccine keeps the same proportion of the target in both train and test dfs
h1n1_train, h1n1_test = train_test_split(h1n1_df, random_state=123, train_size=.80, stratify=h1n1_df.h1n1_vaccine)

In [None]:
# Split seasonal_df into 80% train / 20% test with sklearn's train_test_split;
# random_state fixes the seed so the split is reproducible.
# Stratifying on seasonal_vaccine keeps the same proportion of the target in both train and test dfs
seasonal_train, seasonal_test = train_test_split(seasonal_df, random_state=123, train_size=.80, stratify=seasonal_df.seasonal_vaccine)

### Drop the employment_industry and employment_occupation columns from dataframes

In [None]:
h1n1_train = h1n1_train.drop(columns =['employment_industry', 'employment_occupation'] )
h1n1_test = h1n1_test.drop(columns =['employment_industry', 'employment_occupation'] )

Check to make sure the columns were dropped

In [None]:
h1n1_train.head(3)

In [None]:
h1n1_test.head(3)

In [None]:
seasonal_train = seasonal_train.drop(columns =['employment_industry', 'employment_occupation'] )
seasonal_test = seasonal_test.drop(columns =['employment_industry', 'employment_occupation'] )

In [None]:
seasonal_train.head(3)

In [None]:
seasonal_test.head(3)

It looks like the columns were dropped appropriately. Now let's fill in null values in the remaining columns

In [None]:
# Fill the null values in train and test with the most common training value
def fill_null_values(train, test):
    '''
    Takes in train and test dataframes and fills the nulls in each column
    with that column's most common value in the train dataframe.
    The same fill values are applied to test so both dataframes are imputed
    consistently and nothing is learned from the test data.
    Columns that are entirely null in train have no most common value and
    are left unfilled.
    Returns new train and test dataframes; the inputs are not modified.
    '''
    fill_values = {}
    for column in train.columns:
        counts = train[column].value_counts()
        # value_counts() drops nulls, so it is empty for an all-null column
        if not counts.empty:
            fill_values[column] = counts.index[0]

    # fillna with a dict fills each column with its own value and ignores
    # keys that are not columns of the dataframe being filled
    train = train.fillna(value=fill_values)
    test = test.fillna(value=fill_values)

    return train, test

In [None]:
h1n1_train, h1n1_test = fill_null_values(h1n1_train, h1n1_test)

Verify that there are no more null values in either dataframe

In [None]:
h1n1_train.isna().sum()

In [None]:
h1n1_test.isna().sum()

In [None]:
seasonal_train, seasonal_test = fill_null_values(seasonal_train, seasonal_test)

In [None]:
seasonal_train.isna().sum()

In [None]:
seasonal_test.isna().sum()

### Encoding

Next we need to encode the columns that have objects as values and turn them into integer representations for the purpose of the classification model to predict how likely people are to get their flu vaccines

Look at which columns need to be encoded

In [None]:
h1n1_train

In [None]:
# def label_encode_columns(train, test):

#     encoder = LabelEncoder()
   
#     encode_list = ['rent_or_own', 'employment_status', 'marital_status', 'sex']
    
             
#     for column in encode_list:
#         train[column] = encoder.fit_transform(train[column])
#         test[column] = encoder.transform(test[column])

#         return train, test

In [None]:
# train, test = label_encode_columns(train, test)

In [None]:
def label_encode_columns(h1n1_train, h1n1_test, seasonal_train=None, seasonal_test=None):
    '''
    Takes in train and test dataframes and label encodes the
    employment_status, rent_or_own, marital_status and sex columns.
    For each column the encoder is fit on the train dataframe and the same
    fitted encoder is applied to the matching test dataframe, so a category
    maps to the same integer in both.
    The encoded values are added as new 'encoded_<column>' columns on the
    dataframes that were passed in (the dataframes are modified in place).
    The seasonal pair is optional: pass both seasonal dataframes or neither.
    Returns the four dataframes when the seasonal pair is given, otherwise
    only the first train and test dataframes.
    Raises ValueError if only one seasonal dataframe is given, or (from the
    encoder) if a test dataframe holds a category not seen in its train
    dataframe.
    '''
    if (seasonal_train is None) != (seasonal_test is None):
        raise ValueError('seasonal_train and seasonal_test must be passed together')

    columns_to_encode = ['employment_status', 'rent_or_own', 'marital_status', 'sex']

    def _encode_pair(train, test):
        # Label encode every listed column of one train/test pair
        for column in columns_to_encode:
            # A fresh encoder per column; the mapping is learned from train only
            encoder = LabelEncoder()
            train['encoded_' + column] = encoder.fit_transform(train[column])
            test['encoded_' + column] = encoder.transform(test[column])

    _encode_pair(h1n1_train, h1n1_test)
    if seasonal_train is None:
        return h1n1_train, h1n1_test

    _encode_pair(seasonal_train, seasonal_test)
    return h1n1_train, h1n1_test, seasonal_train, seasonal_test

In [None]:
h1n1_train, h1n1_test, seasonal_train, seasonal_test = label_encode_columns(h1n1_train, h1n1_test, seasonal_train, seasonal_test)

In [None]:
h1n1_train

In [None]:
h1n1_test

In [None]:
seasonal_train, seasonal_test = label_encode_columns(seasonal_train, seasonal_test)

In [None]:
seasonal_train.isna().sum()

In [None]:
seasonal_test.isna().sum()

In [None]:
# categorical_cols = ['rent_or_own', 'employment_status', 'marital_status', 'sex']

# categorical_cols

# train[categorical_cols] = train[categorical_cols].apply(lambda col: encoder.fit_transform(col))

In [None]:
# def encode(train, test):
#     # creating instance of one-hot-encoder
#     enc = OneHotEncoder()
#     # passing bridge-types-cat column (label encoded values of bridge_types)
#     enc_df = pd.DataFrame(enc.fit_transform(train[['age_group', 'education', 'race', 'income_poverty']]).toarray())
#     # merge with main df bridge_df on key values
#     train = train.join(enc_df)

#     # passing bridge-types-cat column (label encoded values of bridge_types)
#     enc_df2 = pd.DataFrame(enc.fit_transform(test[['age_group', 'education', 'race', 'income_poverty']]).toarray())
#     # merge with main df bridge_df on key values
#     test = test.join(enc_df2)
    
#     return train, test

In [None]:
# col_name = ['age_group', 'education', 'race', 'income_poverty']

In [None]:
# train, test = encode(train, test)

Look at the dataframe to ensure all the label encoded columns were added correctly

In [None]:
h1n1_train.head(3)

In [None]:
h1n1_test.head(3)

In [None]:
seasonal_train.head(3)

In [None]:
seasonal_test.head(3)

Check the original columns against the encoded ones to be clear which labels correspond to each other

In [None]:
housing_df = h1n1_train[['rent_or_own', 'encoded_rent_or_own']]
housing_df

#### Own == 0
#### Rent == 1

In [None]:
married_df = h1n1_train[['marital_status', 'encoded_marital_status']]
married_df

#### Married == 0
#### Not Married == 1

In [None]:
gender_df = h1n1_train[['sex', 'encoded_sex']]
gender_df

#### Female == 0 
#### Male == 1

### One Hot Encode Remaining Columns

In [None]:
# One-hot encode the age_group column of the h1n1 train and test dfs

# Create encoder object
encoder = OneHotEncoder()

# Fit on the age_group column of the train df only
encoder.fit(h1n1_train[['age_group']])

# Readable column names: 'age_group_' + each category learned during fit
cols = ['age_group_' + c for c in encoder.categories_[0]]

# Transform train and test with the same fitted encoder and concatenate the
# new indicator columns onto each df, keeping the df's own index
m = encoder.transform(h1n1_train[['age_group']]).todense()
h1n1_train = pd.concat([
    h1n1_train,
    pd.DataFrame(m, columns=cols, index=h1n1_train.index)
], axis=1)

m = encoder.transform(h1n1_test[['age_group']]).todense()
h1n1_test = pd.concat([
    h1n1_test,
    pd.DataFrame(m, columns=cols, index=h1n1_test.index)
], axis=1)

Check to make sure it worked

In [None]:
h1n1_train

In [None]:
h1n1_test

Excellent! It worked the way I wanted. Now I have a column with a 1 if the observation falls into that category and a zero if it does not. 

Now repeat for the remaining columns

In [None]:
# One-hot encode the education column of the h1n1 train and test dfs

# Create encoder object
encoder = OneHotEncoder()

# Fit on the education column of the train df only
encoder.fit(h1n1_train[['education']])

# Readable column names: 'education_' + each category learned during fit
cols = ['education_' + c for c in encoder.categories_[0]]

# Transform train and test with the same fitted encoder and concatenate the
# new indicator columns onto each df, keeping the df's own index
m = encoder.transform(h1n1_train[['education']]).todense()
h1n1_train = pd.concat([
    h1n1_train,
    pd.DataFrame(m, columns=cols, index=h1n1_train.index)
], axis=1)

m = encoder.transform(h1n1_test[['education']]).todense()
h1n1_test = pd.concat([
    h1n1_test,
    pd.DataFrame(m, columns=cols, index=h1n1_test.index)
], axis=1)

In [None]:
# One-hot encode the race column of the h1n1 train and test dfs

# Create encoder object
encoder = OneHotEncoder()

# Fit on the race column of the train df only
encoder.fit(h1n1_train[['race']])

# Readable column names: 'race_' + each category learned during fit
cols = ['race_' + c for c in encoder.categories_[0]]

# Transform train and test with the same fitted encoder and concatenate the
# new indicator columns onto each df, keeping the df's own index
m = encoder.transform(h1n1_train[['race']]).todense()
h1n1_train = pd.concat([
    h1n1_train,
    pd.DataFrame(m, columns=cols, index=h1n1_train.index)
], axis=1)

m = encoder.transform(h1n1_test[['race']]).todense()
h1n1_test = pd.concat([
    h1n1_test,
    pd.DataFrame(m, columns=cols, index=h1n1_test.index)
], axis=1)

In [None]:
# One-hot encode the income_poverty column of the h1n1 train and test dfs

# Create encoder object
encoder = OneHotEncoder()

# Fit on the income_poverty column of the train df only
encoder.fit(h1n1_train[['income_poverty']])

# Readable column names: 'income_poverty_' + each category learned during fit
cols = ['income_poverty_' + c for c in encoder.categories_[0]]

# Transform train and test with the same fitted encoder and concatenate the
# new indicator columns onto each df, keeping the df's own index
m = encoder.transform(h1n1_train[['income_poverty']]).todense()
h1n1_train = pd.concat([
    h1n1_train,
    pd.DataFrame(m, columns=cols, index=h1n1_train.index)
], axis=1)

m = encoder.transform(h1n1_test[['income_poverty']]).todense()
h1n1_test = pd.concat([
    h1n1_test,
    pd.DataFrame(m, columns=cols, index=h1n1_test.index)
], axis=1)

In [None]:
h1n1_train

In [None]:
h1n1_test

__Now perform the same encoding on the seasonal_train and test dfs__

In [None]:
# One-hot encode the age_group column of the seasonal train and test dfs

# Create encoder object
encoder = OneHotEncoder()

# Fit on the age_group column of the train df only
encoder.fit(seasonal_train[['age_group']])

# Readable column names: 'age_group_' + each category learned during fit
cols = ['age_group_' + c for c in encoder.categories_[0]]

# Transform train and test with the same fitted encoder and concatenate the
# new indicator columns onto each df, keeping the df's own index
m = encoder.transform(seasonal_train[['age_group']]).todense()
seasonal_train = pd.concat([
    seasonal_train,
    pd.DataFrame(m, columns=cols, index=seasonal_train.index)
], axis=1)

m = encoder.transform(seasonal_test[['age_group']]).todense()
seasonal_test = pd.concat([
    seasonal_test,
    pd.DataFrame(m, columns=cols, index=seasonal_test.index)
], axis=1)

In [None]:
# One-hot encode the education column of the seasonal train and test dfs

# Create encoder object
encoder = OneHotEncoder()

# Fit on the education column of the train df only
encoder.fit(seasonal_train[['education']])

# Readable column names: 'education_' + each category learned during fit
cols = ['education_' + c for c in encoder.categories_[0]]

# Transform train and test with the same fitted encoder and concatenate the
# new indicator columns onto each df, keeping the df's own index
m = encoder.transform(seasonal_train[['education']]).todense()
seasonal_train = pd.concat([
    seasonal_train,
    pd.DataFrame(m, columns=cols, index=seasonal_train.index)
], axis=1)

m = encoder.transform(seasonal_test[['education']]).todense()
seasonal_test = pd.concat([
    seasonal_test,
    pd.DataFrame(m, columns=cols, index=seasonal_test.index)
], axis=1)

In [None]:
# One-hot encode the race column of the seasonal train and test dfs

# Create encoder object
encoder = OneHotEncoder()

# Fit on the race column of the train df only
encoder.fit(seasonal_train[['race']])

# Readable column names: 'race_' + each category learned during fit
cols = ['race_' + c for c in encoder.categories_[0]]

# Transform train and test with the same fitted encoder and concatenate the
# new indicator columns onto each df, keeping the df's own index
m = encoder.transform(seasonal_train[['race']]).todense()
seasonal_train = pd.concat([
    seasonal_train,
    pd.DataFrame(m, columns=cols, index=seasonal_train.index)
], axis=1)

m = encoder.transform(seasonal_test[['race']]).todense()
seasonal_test = pd.concat([
    seasonal_test,
    pd.DataFrame(m, columns=cols, index=seasonal_test.index)
], axis=1)

In [None]:
# One-hot encode the income_poverty column of the seasonal train and test dfs

# Create encoder object
encoder = OneHotEncoder()

# Fit on the income_poverty column of the train df only
encoder.fit(seasonal_train[['income_poverty']])

# Readable column names: 'income_poverty_' + each category learned during fit
cols = ['income_poverty_' + c for c in encoder.categories_[0]]

# Transform train and test with the same fitted encoder and concatenate the
# new indicator columns onto each df, keeping the df's own index
m = encoder.transform(seasonal_train[['income_poverty']]).todense()
seasonal_train = pd.concat([
    seasonal_train,
    pd.DataFrame(m, columns=cols, index=seasonal_train.index)
], axis=1)

m = encoder.transform(seasonal_test[['income_poverty']]).todense()
seasonal_test = pd.concat([
    seasonal_test,
    pd.DataFrame(m, columns=cols, index=seasonal_test.index)
], axis=1)

Turn these into functions and add to prepare.py. I'll just do one example here to test if it works then comment it out to make sure I don't have duplicate columns moving forward. The other functions will only be added to the .py file.

In [None]:
def ohe_income_poverty(train, test, column='income_poverty'):
    '''
    Takes in train and test dataframes and one hot encodes a single column
    (income_poverty unless another column name is given).
    The encoder is fit on the train dataframe only and the same fitted
    encoder transforms both dataframes.
    Returns new train and test dataframes with one added column per
    category, named '<column>_<category>'. The original column is kept.
    '''
    # Create encoder object
    encoder = OneHotEncoder()

    # Fit on the requested column of the train df
    encoder.fit(train[[column]])

    # nice columns for display; str() keeps this working for non-string categories
    cols = [column + '_' + str(c) for c in encoder.categories_[0]]

    def _append_encoded(frame):
        # Transform one dataframe and concatenate the indicator columns onto it.
        # toarray() gives a plain ndarray rather than the np.matrix from todense()
        m = encoder.transform(frame[[column]]).toarray()
        return pd.concat([
            frame,
            pd.DataFrame(m, columns=cols, index=frame.index)
        ], axis=1)

    train = _append_encoded(train)
    test = _append_encoded(test)

    return train, test

### Scaling

- The following columns are either opinions on a scale of 1-5 or a count of the number of children or adults in a household:
  - h1n1_concern
  - h1n1_knowledge
  - opinion_h1n1_vacc_effective
  - opinion_h1n1_risk
  - opinion_h1n1_sick_from_vacc
  - opinion_seas_vacc_effective
  - opinion_seas_risk
  - opinion_seas_sick_from_vacc
  - household_adults
  - household_children
- All other features are on a scale of 0-1. I will apply a MinMax Scaler to the above columns to get them also on a 0-1 scale to avoid weighting issues in the models to come. 

In [None]:
# Create a scaler object using SKlearn's MinMax Scaler
scaler = MinMaxScaler()

In [None]:
# Fit the MinMax scaler on the h1n1 train dataframe and overwrite the listed
# columns in place with their scaled values
h1n1_train[['h1n1_concern','h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children'
      ]] = scaler.fit_transform(
    h1n1_train[['h1n1_concern',
       'h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children']])

Check to make sure scaling worked appropriately

In [None]:
h1n1_train

Excellent! The MinMax Scaler was applied correctly. Now repeat this process for the test dataframe and turn these transformations into functions

In [None]:
# Scale the listed columns of the h1n1 test dataframe in place.
# Use transform (not fit_transform) so the test data is scaled with the
# min/max learned from h1n1_train and nothing is fit on the test set.
h1n1_test[['h1n1_concern','h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children'
      ]] = scaler.transform(
    h1n1_test[['h1n1_concern',
       'h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children']])

In [None]:
# Refit the MinMax scaler on the seasonal train dataframe and overwrite the
# listed columns in place with their scaled values
seasonal_train[['h1n1_concern','h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children'
      ]] = scaler.fit_transform(
   seasonal_train[['h1n1_concern',
       'h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children']])

In [None]:
# Scale the listed columns of the seasonal test dataframe in place.
# Use transform (not fit_transform) so the test data is scaled with the
# min/max learned from seasonal_train and nothing is fit on the test set.
seasonal_test[['h1n1_concern','h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children'
      ]] = scaler.transform(
   seasonal_test[['h1n1_concern',
       'h1n1_knowledge', 
       'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective',
       'opinion_seas_risk',
       'opinion_seas_sick_from_vacc',
       'household_adults',
       'household_children']])

Verify changes

In [None]:
h1n1_train.head(3)

In [None]:
h1n1_test.head(3)

In [None]:
seasonal_train.head(3)

In [None]:
seasonal_test.head(3)

Write a function to add scaled columns to train, test dataframes without modifying original data

In [None]:
def minmax_scale(train, test, scale_column_list):
    '''
    Takes in train and test dataframes and a list of columns to be scaled.
    Fits SKlearn's MinMaxScaler() on the listed columns of the train
    dataframe and applies that fitted scaler to both dataframes.
    The scaled values are joined on as new columns named '<column>_scaled';
    the original columns are left as they are.
    Returns the transformed train and test dataframes.
    '''
    # Labels for the scaled columns
    scaled_names = [name + '_scaled' for name in scale_column_list]

    # Learn the min and max of every listed column from the train dataframe only
    fitted_scaler = MinMaxScaler().fit(train[scale_column_list])

    def _with_scaled_columns(frame):
        # Scale the listed columns of one dataframe and join them back on its index
        scaled_values = fitted_scaler.transform(frame[scale_column_list])
        scaled_frame = pd.DataFrame(scaled_values,
                                    columns=scaled_names,
                                    index=frame.index)
        return frame.join(scaled_frame, rsuffix='_scaled')

    # Train first, then the test dataframe with the same fitted scaler
    return _with_scaled_columns(train), _with_scaled_columns(test)