# Classification Prediction with Imbalanced Target

## Mary Donovan Martello

# Part 2: Test Different Subsets of Input Features for Optimizing Models

In [2]:
# import libraries

import pandas as pd
from pandas import read_csv
import numpy as np
from numpy import mean
from numpy import std
from numpy import argmax

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Markdown, display

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

#stop unnecessary warnings from printing to the screen
import warnings
warnings.simplefilter('ignore')



In [4]:
# Read the input CSV (path is relative to the notebook's working directory)
# into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='pcaDefault.csv')


**Feature Dataframe for Probability Predictions and Classifier Threshold**

**Feature Dataframe for Testing Feature Subsets**

In [77]:
# Build the full feature matrix X and target vector y that the subset
# experiments below slice from.

# Columns treated as categorical.
cat_features = ['SEX', 'EDUCATION', 'MARRIAGE']
df_cat = df[cat_features]

# Dummy-variable frame for the categorical columns.
# NOTE(review): pd.get_dummies only encodes object/string/category columns
# when `columns` is not given. The recorded output of this cell shows
# SEX/EDUCATION/MARRIAGE coming back as single integer columns, i.e. no
# one-hot encoding happened. Passing columns=cat_features would encode them,
# but later cells select 'SEX' and 'MARRIAGE' by name, so that change would
# have to be made together with those cells.
dfDumm = pd.get_dummies(df_cat)

# Quick look at the result.
print(dfDumm.head(4))

# Remaining (non-categorical, non-target) columns.
dfNum = df.drop(columns=cat_features + ['default'])

# Side-by-side concatenation gives the complete feature set.
X = pd.concat([dfNum, dfDumm], axis=1)

# Target column.
y = df['default']


   SEX  EDUCATION  MARRIAGE
0    1          1         2
1    1          2         2
2    1          1         1
3    2          2         2


Check the input features for testing feature subsets.

In [8]:
# Display the column labels of X so the names used in the subset cells can be checked.
X.columns

Index(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'SEX',
       'EDUCATION', 'MARRIAGE'],
      dtype='object')

In [9]:
# Display (rows, columns) of the full feature matrix.
X.shape

(30000, 11)

**Test different subsets of input features**

In [11]:
# Baseline experiment: logistic regression on every available feature.
fullset = X[[
    'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8',
    'SEX', 'EDUCATION', 'MARRIAGE',
]]

# Hold out 30% of the rows for validation; the seed is fixed so the split
# is reproducible.
FTrain, FTest, yTrain_F, yTest_F = train_test_split(
    fullset, y, test_size=0.3, random_state=11
)

# Default-parameter logistic regression, fitted on the training split.
modelLRF = LogisticRegression().fit(FTrain, yTrain_F)

# Hard class predictions for the held-out rows.
yhatF = modelLRF.predict(FTest)

# Baseline accuracy, printed as a percentage.
# NOTE(review): the notebook title describes the target as imbalanced, so
# accuracy alone may mostly reflect the majority class — confirm with a
# class-aware metric.
accuracy = accuracy_score(yTest_F, yhatF)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 78.79


In [12]:
# Create a feature subset that can be used for model evaluation.
# LR top 5 important features.
subset1 = X.loc[:, [
    'PC7',
    'PC1',
    'PC8',
    # The comma after 'MARRIAGE' is required: without it Python joins the
    # adjacent string literals into the single label 'MARRIAGEPC2', which is
    # not a column of X and silently drops both intended features.
    'MARRIAGE',
    'PC2',
]]

# Separate data into training and validation sets (30% held out, fixed seed).
S1Train, S1Test, yTrain_S1, yTest_S1 = train_test_split(subset1, y, test_size=0.3, random_state=11)

# Instantiate the logistic regression model using default parameters.
modelLRS1 = LogisticRegression()

# Fit the model with training data.
modelLRS1.fit(S1Train, yTrain_S1)

# Predict on the validation set.
yhatS1 = modelLRS1.predict(S1Test)

# Evaluate this subset with accuracy score (printed as a percentage).
accuracy = accuracy_score(yTest_S1, yhatS1)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 78.90


In [21]:
# Feature subset for model evaluation: LR top 6 important features.
subset2 = X[['PC7', 'PC1', 'PC8', 'MARRIAGE', 'PC2', 'SEX']]

# 70/30 train/validation split with a fixed seed.
S2Train, S2Test, yTrain_S2, yTest_S2 = train_test_split(
    subset2, y, test_size=0.3, random_state=11
)

# Default-parameter logistic regression, fitted on the training split.
modelLRS2 = LogisticRegression().fit(S2Train, yTrain_S2)

# Predict the held-out rows and report accuracy for this subset as a percentage.
yhatS2 = modelLRS2.predict(S2Test)
accuracy = accuracy_score(yTest_S2, yhatS2)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 78.83


In [30]:
# Feature subset for model evaluation:
# CART DecisionTreeClassifier, top 6 important features.
subset3 = X[['PC7', 'PC1', 'PC8', 'PC3', 'PC2', 'PC4']]

# 70/30 train/validation split with a fixed seed.
S3Train, S3Test, yTrain_S3, yTest_S3 = train_test_split(
    subset3, y, test_size=0.3, random_state=11
)

# Default-parameter logistic regression, fitted on the training split.
modelLRS3 = LogisticRegression().fit(S3Train, yTrain_S3)

# Predict the held-out rows and report accuracy for this subset as a percentage.
yhatS3 = modelLRS3.predict(S3Test)
accuracy = accuracy_score(yTest_S3, yhatS3)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 78.96


In [37]:
# Feature subset for model evaluation:
# RandomForestClassifier, top 5 important features.
subset4 = X[['PC7', 'PC1', 'PC8', 'PC6', 'PC3']]

# 70/30 train/validation split with a fixed seed.
S4Train, S4Test, yTrain_S4, yTest_S4 = train_test_split(
    subset4, y, test_size=0.3, random_state=11
)

# Default-parameter logistic regression, fitted on the training split.
modelLRS4 = LogisticRegression().fit(S4Train, yTrain_S4)

# Predict the held-out rows and report accuracy for this subset as a percentage.
yhatS4 = modelLRS4.predict(S4Test)
accuracy = accuracy_score(yTest_S4, yhatS4)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 78.93


In [46]:
# Feature subset for model evaluation:
# Ensemble RandomForestClassifier, top 6 important features for the
# Default classifier.
subset5 = X[['PC7', 'PC1', 'PC8', 'PC3', 'PC2', 'PC4']]

# 70/30 train/validation split with a fixed seed.
S5Train, S5Test, yTrain_S5, yTest_S5 = train_test_split(
    subset5, y, test_size=0.3, random_state=11
)

# Default-parameter logistic regression, fitted on the training split.
modelLRS5 = LogisticRegression().fit(S5Train, yTrain_S5)

# Predict the held-out rows and report accuracy for this subset as a percentage.
yhatS5 = modelLRS5.predict(S5Test)
accuracy = accuracy_score(yTest_S5, yhatS5)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 78.96


In [57]:
# Feature subset for model evaluation:
# XGBClassifier F statistic - combo that produced the highest.
subset6 = X[['PC7', 'PC1', 'PC8', 'PC2', 'PC4', 'PC3']]

# 70/30 train/validation split with a fixed seed.
S6Train, S6Test, yTrain_S6, yTest_S6 = train_test_split(
    subset6, y, test_size=0.3, random_state=11
)

# Default-parameter logistic regression, fitted on the training split.
modelLRS6 = LogisticRegression().fit(S6Train, yTrain_S6)

# Predict the held-out rows and report accuracy for this subset as a percentage.
yhatS6 = modelLRS6.predict(S6Test)
accuracy = accuracy_score(yTest_S6, yhatS6)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 78.96


In [11]:
# Feature subset for model evaluation:
# XGBClassifier scores - top 8 feature importance minus the Education and
# Marriage variables (six PCA columns remain).
subset7 = X[['PC7', 'PC1', 'PC8', 'PC2', 'PC4', 'PC3']]

# 70/30 train/validation split with a fixed seed.
S7Train, S7Test, yTrain_S7, yTest_S7 = train_test_split(
    subset7, y, test_size=0.3, random_state=11
)

# Default-parameter logistic regression, fitted on the training split.
modelLRS7 = LogisticRegression().fit(S7Train, yTrain_S7)

# Predict the held-out rows and report accuracy for this subset as a percentage.
yhatS7 = modelLRS7.predict(S7Test)
accuracy = accuracy_score(yTest_S7, yhatS7)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 78.96


In [67]:
# Feature subset for model evaluation: f_classif(X, y) selection.
subset8 = X[['PC7', 'PC1', 'PC8', 'PC2', 'SEX']]

# 70/30 train/validation split with a fixed seed.
S8Train, S8Test, yTrain_S8, yTest_S8 = train_test_split(
    subset8, y, test_size=0.3, random_state=11
)

# Default-parameter logistic regression, fitted on the training split.
modelLRS8 = LogisticRegression().fit(S8Train, yTrain_S8)

# Predict the held-out rows and report accuracy for this subset as a percentage.
yhatS8 = modelLRS8.predict(S8Test)
accuracy = accuracy_score(yTest_S8, yhatS8)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 78.84


In [235]:
# export best subset df to csv
# Writes the subset7 feature columns (PC7, PC1, PC8, PC2, PC4, PC3) to
# 'subset7df.csv' in the working directory; index=False leaves the row index
# out of the file. The target column is not part of subset7.
subset7.to_csv('subset7df.csv', index = False)