# Classification Prediction with Imbalanced Target

# Mary Donovan Martello

# Part 2: Test Different Subsets of Input Features for Optimizing Models

In [3]:
# import libraries

import pandas as pd
from pandas import read_csv
import numpy as np
from numpy import mean
from numpy import std
from numpy import argmax

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Markdown, display

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): simplefilter('ignore') with no category is a blanket filter, so it also
# hides warnings raised by pandas / scikit-learn (e.g. solver convergence warnings) --
# consider narrowing it to specific categories.
import warnings
warnings.simplefilter('ignore')



### This notebook uses two datasets that were cleaned and prepared in the 1_EDA_Prep notebook.  One of the datasets replaces some of the original variables with the Months_Late engineered features and the second dataset replaces some of the original variables with both the Months_Late and Payment_Ratio engineered features.

In [4]:
# Read the two prepared CSV files into separate dataframes.
# df1 carries the bill/payment amount columns plus MONTHS_LATE;
# df2 carries MONTHS_LATE and PAYMENT_RATIO (see the previews in the next cells).
df1 = read_csv('logDefaultRev1.csv')
df2 = read_csv('logDefaultRev2.csv')

In [5]:
# Preview the first five rows of df1
df1.head()

Unnamed: 0,default,SEX,EDUCATION,MARRIAGE,AGE,LIMIT_BAL,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,MONTHS_LATE
0,1,1,1,2,3.401197,10.819798,12.157764,11.45188,12.121908,12.185186,11.599122,12.815479,0.0,7.901377,0.0,7.707962,7.09091,0.0,1.94591
1,1,1,2,2,3.401197,12.676079,12.318723,11.767754,12.28972,12.33983,11.875079,12.907014,7.824446,7.824446,0.0,8.47658,7.783641,7.378384,1.94591
2,1,1,1,1,3.988984,12.206078,12.623996,12.257488,12.610621,12.657464,12.339112,13.100394,8.748464,8.612685,8.612685,8.612685,8.517393,8.517393,1.94591
3,0,2,2,2,3.295837,11.002117,12.31163,11.735957,12.285069,12.346786,11.826144,12.886411,0.0,8.517393,7.828835,1.94591,8.006701,8.006701,1.94591
4,0,1,2,1,3.73767,12.542548,12.615709,12.246783,12.584776,12.658428,12.360976,13.100157,8.779711,0.0,9.564863,9.605822,0.0,8.517393,1.94591


In [6]:
# List the column labels of df1
df1.columns

Index(['default', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'LIMIT_BAL',
       'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
       'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
       'PAY_AMT6', 'MONTHS_LATE'],
      dtype='object')

In [8]:
# Preview the first five rows of df2
df2.head()

Unnamed: 0,default,SEX,EDUCATION,MARRIAGE,AGE,LIMIT_BAL,MONTHS_LATE,PAYMENT_RATIO
0,1,1,1,2,3.401197,10.819798,1.94591,0.03811
1,1,1,2,2,3.401197,12.676079,1.94591,0.037397
2,1,1,1,1,3.988984,12.206078,1.94591,0.037338
3,0,2,2,2,3.295837,11.002117,1.94591,0.038758
4,0,1,2,1,3.73767,12.542548,1.94591,0.046396


**Feature Dataframes for Testing Feature Subsets**

In [9]:
# df1: build the feature matrix X1 and the target y1

# categorical columns, handled separately from the numeric ones
cat_features = ['SEX', 'EDUCATION', 'MARRIAGE']
df_cat1 = df1[cat_features]

# encode the categorical columns
# NOTE(review): pd.get_dummies only expands object/string/category columns. The
# dfDumm1.columns output recorded below still lists just SEX, EDUCATION and MARRIAGE,
# so these columns appear to be numeric codes that pass through unchanged. Left as-is
# because later cells select 'SEX', 'EDUCATION' and 'MARRIAGE' by name.
dfDumm1 = pd.get_dummies(df_cat1)

# numeric features: everything except the categorical columns and the target
dfNum1 = df1.drop(cat_features + ['default'], axis = 1)

# whole feature set for train/validation splitting: numeric columns first, then the
# encoded categorical columns. The original column labels are kept on purpose --
# ignore_index=True along axis=1 would replace them with 0..n-1 and make every
# by-name selection depend on a hand-maintained list of labels.
X1 = pd.concat([dfNum1, dfDumm1], axis=1)

# whole target for train/validation splitting
y1 =  df1['default']


In [18]:
# Column labels of the numeric-feature frame built from df1
dfNum1.columns

Index(['AGE', 'LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'MONTHS_LATE'],
      dtype='object')

In [19]:
# Column labels of the get_dummies result for df1.
# NOTE(review): the recorded output lists only SEX, EDUCATION and MARRIAGE, i.e. no
# indicator columns were added by get_dummies.
dfDumm1.columns

Index(['SEX', 'EDUCATION', 'MARRIAGE'], dtype='object')

In [20]:
# Labels for the X1 columns, grouped by kind and listed in concat order:
# demographics/limit, six monthly bill amounts, six monthly payment amounts,
# the engineered MONTHS_LATE feature, then the three categorical columns.
colNames = (
    ['AGE', 'LIMIT_BAL']
    + ['BILL_AMT%d' % month for month in range(1, 7)]
    + ['PAY_AMT%d' % month for month in range(1, 7)]
    + ['MONTHS_LATE']
    + ['SEX', 'EDUCATION', 'MARRIAGE']
)

In [21]:
# Label the X1 columns with the names in colNames. The list order must match the
# order in which X1 was concatenated (dfNum1 columns first, then dfDumm1 columns).
X1.columns = colNames

In [10]:
# df2: build the feature matrix X2 and the target y2

# categorical columns, handled separately from the numeric ones
cat_features = ['SEX', 'EDUCATION', 'MARRIAGE']
df_cat2 = df2[cat_features]

# encode the categorical columns
# NOTE(review): pd.get_dummies only expands object/string/category columns; if these
# columns are numeric codes (as the df2 preview suggests) they pass through unchanged.
# Left as-is because later cells select 'SEX', 'EDUCATION' and 'MARRIAGE' by name.
dfDumm2 = pd.get_dummies(df_cat2)

# numeric features: everything except the categorical columns and the target
dfNum2 = df2.drop(cat_features + ['default'], axis = 1)

# whole feature set for train/validation splitting: numeric columns first, then the
# encoded categorical columns. The original column labels are kept on purpose --
# ignore_index=True along axis=1 would replace them with 0..n-1 and make every
# by-name selection depend on a hand-maintained list of labels.
X2 = pd.concat([dfNum2, dfDumm2], axis=1)

# whole target for train/validation splitting
y2 =  df2['default']

In [23]:
# Column labels of the numeric-feature frame built from df2
dfNum2.columns

Index(['AGE', 'LIMIT_BAL', 'MONTHS_LATE', 'PAYMENT_RATIO'], dtype='object')

In [24]:
# Labels for the X2 columns in concat order: numeric columns, then categorical columns.
colNames2 = (
    ['AGE', 'LIMIT_BAL', 'MONTHS_LATE', 'PAYMENT_RATIO']
    + ['SEX', 'EDUCATION', 'MARRIAGE']
)

In [25]:
# Label the X2 columns with the names in colNames2. The list order must match the
# order in which X2 was concatenated (dfNum2 columns first, then dfDumm2 columns).
X2.columns = colNames2

### Test different subsets of input features.

**df1 for the next 9 cells.**

In [28]:
# Baseline for df1: logistic regression on the complete X1 feature set
fullset = X1.loc[:, [
    'AGE', 'LIMIT_BAL',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
    'MONTHS_LATE',
    'SEX', 'EDUCATION', 'MARRIAGE',
]]

# 70/30 train/validation split with a fixed seed so the split is repeatable
FTrain, FTest, yTrain_F, yTest_F = train_test_split(
    fullset, y1, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
FTrain = scaler.fit_transform(FTrain)
FTest = scaler.transform(FTest)

# logistic regression with default parameters, trained on the scaled training rows
modelLRF = LogisticRegression().fit(FTrain, yTrain_F)

# class predictions for the held-out rows
yhatF = modelLRF.predict(FTest)

# baseline accuracy, printed as a percentage
# NOTE(review): the notebook title describes the target as imbalanced; accuracy alone
# may be a weak yardstick in that setting -- consider also checking recall / F1.
accuracy = accuracy_score(yTest_F, yhatF)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 80.14


In [40]:
# Feature subset for model evaluation: features ranked highest by LR coefficients.
# NOTE(review): the original note said "top 5", but three features are selected here.
subset1 = X1.loc[:, ['MONTHS_LATE', 'LIMIT_BAL', 'MARRIAGE']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S1Train, S1Test, yTrain_S1, yTest_S1 = train_test_split(
    subset1, y1, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S1Train = scaler.fit_transform(S1Train)
S1Test = scaler.transform(S1Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS1 = LogisticRegression().fit(S1Train, yTrain_S1)

# class predictions for the held-out rows
yhatS1 = modelLRS1.predict(S1Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S1, yhatS1)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 80.18


In [41]:
# Feature subset for model evaluation: top 2 features by LR coefficients
subset2 = X1.loc[:, ['MONTHS_LATE', 'LIMIT_BAL']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S2Train, S2Test, yTrain_S2, yTest_S2 = train_test_split(
    subset2, y1, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S2Train = scaler.fit_transform(S2Train)
S2Test = scaler.transform(S2Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS2 = LogisticRegression().fit(S2Train, yTrain_S2)

# class predictions for the held-out rows
yhatS2 = modelLRS2.predict(S2Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S2, yhatS2)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 80.31


In [42]:
# Feature subset for model evaluation:
# CART DecisionTreeClassifier importances -- 3 of the top 4 features
subset3 = X1.loc[:, ['MONTHS_LATE', 'AGE', 'LIMIT_BAL']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S3Train, S3Test, yTrain_S3, yTest_S3 = train_test_split(
    subset3, y1, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S3Train = scaler.fit_transform(S3Train)
S3Test = scaler.transform(S3Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS3 = LogisticRegression().fit(S3Train, yTrain_S3)

# class predictions for the held-out rows
yhatS3 = modelLRS3.predict(S3Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S3, yhatS3)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 80.21


In [43]:
# Feature subset for model evaluation:
# RandomForestClassifier importances -- top 4 features
subset4 = X1.loc[:, ['MONTHS_LATE', 'AGE', 'BILL_AMT1', 'LIMIT_BAL']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S4Train, S4Test, yTrain_S4, yTest_S4 = train_test_split(
    subset4, y1, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S4Train = scaler.fit_transform(S4Train)
S4Test = scaler.transform(S4Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS4 = LogisticRegression().fit(S4Train, yTrain_S4)

# class predictions for the held-out rows
yhatS4 = modelLRS4.predict(S4Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S4, yhatS4)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 80.21


In [44]:
# Feature subset for model evaluation:
# Ensemble RandomForestClassifier importances for the Default classifier.
# NOTE(review): the original note said "TOP 4", but eight features are selected here.
subset5 = X1.loc[:, [
    'MONTHS_LATE', 'BILL_AMT2', 'LIMIT_BAL', 'PAY_AMT1',
    'BILL_AMT1', 'PAY_AMT2', 'BILL_AMT6', 'BILL_AMT3',
]]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S5Train, S5Test, yTrain_S5, yTest_S5 = train_test_split(
    subset5, y1, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S5Train = scaler.fit_transform(S5Train)
S5Test = scaler.transform(S5Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS5 = LogisticRegression().fit(S5Train, yTrain_S5)

# class predictions for the held-out rows
yhatS5 = modelLRS5.predict(S5Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S5, yhatS5)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 80.12


In [45]:
# Feature subset for model evaluation: top 8 features by F statistic
subset6 = X1.loc[:, [
    'MONTHS_LATE', 'LIMIT_BAL',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S6Train, S6Test, yTrain_S6, yTest_S6 = train_test_split(
    subset6, y1, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S6Train = scaler.fit_transform(S6Train)
S6Test = scaler.transform(S6Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS6 = LogisticRegression().fit(S6Train, yTrain_S6)

# class predictions for the held-out rows
yhatS6 = modelLRS6.predict(S6Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S6, yhatS6)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 79.99


In [49]:
# Feature subset for model evaluation:
# XGBClassifier scores -- top 3 by feature importance, leaving out the
# Education and Marriage variables
subset7 = X1.loc[:, ['MONTHS_LATE', 'LIMIT_BAL', 'BILL_AMT1']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S7Train, S7Test, yTrain_S7, yTest_S7 = train_test_split(
    subset7, y1, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S7Train = scaler.fit_transform(S7Train)
S7Test = scaler.transform(S7Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS7 = LogisticRegression().fit(S7Train, yTrain_S7)

# class predictions for the held-out rows
yhatS7 = modelLRS7.predict(S7Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S7, yhatS7)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 80.32


In [47]:
# Feature subset for model evaluation: RFECV and MFA selection (single feature)
subset8 = X1.loc[:, ['MONTHS_LATE']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S8Train, S8Test, yTrain_S8, yTest_S8 = train_test_split(
    subset8, y1, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S8Train = scaler.fit_transform(S8Train)
S8Test = scaler.transform(S8Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS8 = LogisticRegression().fit(S8Train, yTrain_S8)

# class predictions for the held-out rows
yhatS8 = modelLRS8.predict(S8Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S8, yhatS8)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 80.62


In [50]:
# Export subset8 (MONTHS_LATE only) to CSV without the row index. Its recorded accuracy
# (80.62) is the highest among the df1 runs above.
# Only the feature column is written; the target is not part of subset8.
subset8.to_csv('subset8df1Rev.csv', index = False)

**df2 data for remainder of cells.**

In [60]:
# List the column labels available in X2 before building the df2 subsets
X2.columns

Index(['AGE', 'LIMIT_BAL', 'MONTHS_LATE', 'PAYMENT_RATIO', 'SEX', 'EDUCATION',
       'MARRIAGE'],
      dtype='object')

In [51]:
# Baseline for df2: logistic regression on the complete X2 feature set.
# NOTE: this reuses (and overwrites) the fullset / FTrain / modelLRF / yhatF names
# from the df1 baseline cell.
fullset = X2.loc[:, [
    'AGE', 'LIMIT_BAL', 'MONTHS_LATE', 'PAYMENT_RATIO',
    'SEX', 'EDUCATION', 'MARRIAGE',
]]

# 70/30 train/validation split with a fixed seed so the split is repeatable
FTrain, FTest, yTrain_F, yTest_F = train_test_split(
    fullset, y2, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
FTrain = scaler.fit_transform(FTrain)
FTest = scaler.transform(FTest)

# logistic regression with default parameters, trained on the scaled training rows
modelLRF = LogisticRegression().fit(FTrain, yTrain_F)

# class predictions for the held-out rows
yhatF = modelLRF.predict(FTest)

# baseline accuracy, printed as a percentage
accuracy = accuracy_score(yTest_F, yhatF)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.12


In [58]:
# Feature subset for model evaluation: top 3 features by LR coefficients.
# NOTE: the S2* / modelLRS2 / yhatS2 names are reused from the df1 section and
# are overwritten here.
subset21 = X2.loc[:, ['MONTHS_LATE', 'LIMIT_BAL', 'MARRIAGE']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S2Train, S2Test, yTrain_S2, yTest_S2 = train_test_split(
    subset21, y2, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S2Train = scaler.fit_transform(S2Train)
S2Test = scaler.transform(S2Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS2 = LogisticRegression().fit(S2Train, yTrain_S2)

# class predictions for the held-out rows
yhatS2 = modelLRS2.predict(S2Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S2, yhatS2)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.18


In [53]:
# Feature subset for model evaluation:
# CART DecisionTreeClassifier, Random Forest Classifier and Ensemble RF -- top 4 features.
# NOTE: the S3* / modelLRS3 / yhatS3 names are reused from the df1 section and
# are overwritten here.
subset22 = X2.loc[:, ['PAYMENT_RATIO', 'MONTHS_LATE', 'AGE', 'LIMIT_BAL']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S3Train, S3Test, yTrain_S3, yTest_S3 = train_test_split(
    subset22, y2, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S3Train = scaler.fit_transform(S3Train)
S3Test = scaler.transform(S3Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS3 = LogisticRegression().fit(S3Train, yTrain_S3)

# class predictions for the held-out rows
yhatS3 = modelLRS3.predict(S3Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S3, yhatS3)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.27


In [54]:
# Feature subset for model evaluation:
# XGBClassifier scores -- six features by feature importance.
# NOTE(review): the original note said the Education and Marriage variables were
# left out, yet EDUCATION is in this list -- confirm which is intended.
# NOTE: the S7* / modelLRS7 / yhatS7 names are reused from the df1 section and
# are overwritten here.
subset23 = X2.loc[:, [
    'MONTHS_LATE', 'PAYMENT_RATIO', 'LIMIT_BAL', 'AGE', 'EDUCATION', 'SEX',
]]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S7Train, S7Test, yTrain_S7, yTest_S7 = train_test_split(
    subset23, y2, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S7Train = scaler.fit_transform(S7Train)
S7Test = scaler.transform(S7Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS7 = LogisticRegression().fit(S7Train, yTrain_S7)

# class predictions for the held-out rows
yhatS7 = modelLRS7.predict(S7Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S7, yhatS7)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.23


In [55]:
# Feature subset for model evaluation: top 3 features by F statistic.
# NOTE: the S6* / modelLRS6 / yhatS6 names are reused from the df1 section and
# are overwritten here.
subset24 = X2.loc[:, ['MONTHS_LATE', 'LIMIT_BAL', 'PAYMENT_RATIO']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S6Train, S6Test, yTrain_S6, yTest_S6 = train_test_split(
    subset24, y2, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S6Train = scaler.fit_transform(S6Train)
S6Test = scaler.transform(S6Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS6 = LogisticRegression().fit(S6Train, yTrain_S6)

# class predictions for the held-out rows
yhatS6 = modelLRS6.predict(S6Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S6, yhatS6)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.28


In [56]:
# Feature subset for model evaluation: the two engineered features only.
# NOTE: this cell uses the same S6* / modelLRS6 / yhatS6 names as the subset24
# cell, so it overwrites that cell's results.
subset25 = X2.loc[:, ['PAYMENT_RATIO', 'MONTHS_LATE']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S6Train, S6Test, yTrain_S6, yTest_S6 = train_test_split(
    subset25, y2, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S6Train = scaler.fit_transform(S6Train)
S6Test = scaler.transform(S6Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS6 = LogisticRegression().fit(S6Train, yTrain_S6)

# class predictions for the held-out rows
yhatS6 = modelLRS6.predict(S6Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S6, yhatS6)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.52


In [60]:
# Feature subset for model evaluation:
# Ensemble RandomForestClassifier -- top 4 important features for the Default classifier.
# NOTE: the S5* / modelLRS5 / yhatS5 names are reused from the df1 section and
# are overwritten here.
subset52 = X2.loc[:, ['MONTHS_LATE', 'PAYMENT_RATIO', 'LIMIT_BAL', 'AGE']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S5Train, S5Test, yTrain_S5, yTest_S5 = train_test_split(
    subset52, y2, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S5Train = scaler.fit_transform(S5Train)
S5Test = scaler.transform(S5Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS5 = LogisticRegression().fit(S5Train, yTrain_S5)

# class predictions for the held-out rows
yhatS5 = modelLRS5.predict(S5Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S5, yhatS5)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.27


In [61]:
# Feature subset for model evaluation: RFECV and MFA selection (single feature).
# NOTE: the S8* / modelLRS8 / yhatS8 names are reused from the df1 section and
# are overwritten here.
subset82 = X2.loc[:, ['MONTHS_LATE']]

# 70/30 train/validation split with a fixed seed so the split is repeatable
S8Train, S8Test, yTrain_S8, yTest_S8 = train_test_split(
    subset82, y2, test_size=0.3, random_state=11
)

# standardize: fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
S8Train = scaler.fit_transform(S8Train)
S8Test = scaler.transform(S8Test)

# logistic regression with default parameters, trained on the scaled training rows
modelLRS8 = LogisticRegression().fit(S8Train, yTrain_S8)

# class predictions for the held-out rows
yhatS8 = modelLRS8.predict(S8Test)

# accuracy of this subset, printed as a percentage
accuracy = accuracy_score(yTest_S8, yhatS8)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.53


In [62]:
# Export subset25 (PAYMENT_RATIO and MONTHS_LATE) to CSV without the row index.
# Only the feature columns are written; the target is not part of subset25.
# NOTE(review): the recorded accuracies are 81.52 for subset25 and 81.53 for subset82
# (MONTHS_LATE only), so the exported frame is not the top scorer -- confirm that
# subset25 is the intended "best" subset.
subset25.to_csv('subset25df2Rev.csv', index = False)