In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
# Convert categorical data to numeric and separate target feature for training data
train_df = pd.get_dummies(train_df, columns=['home_ownership','verification_status',
                                             'application_type','hardship_flag',
                                             'debt_settlement_flag','initial_list_status',
                                             'pymnt_plan'])

In [4]:
# Convert categorical data to numeric and separate target feature for testing data
test_df = pd.get_dummies(test_df, columns=['home_ownership','verification_status',
                                           'application_type','hardship_flag',
                                           'debt_settlement_flag','initial_list_status',
                                           'pymnt_plan'])

In [5]:
train_df['loan_status']=pd.get_dummies(train_df['loan_status'])
test_df['loan_status']=pd.get_dummies(test_df['loan_status'])

In [6]:
# add missing dummy variables to testing set
test_df['debt_settlement_flag_Y'] = '0'

In [7]:
y_train = train_df['loan_status']
X_train = train_df.drop('loan_status', axis=1)

In [8]:
y_test = test_df['loan_status']
X_test = test_df.drop('loan_status', axis=1)

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
unscaled_lr_classifier = LogisticRegression()
unscaled_lr_classifier.fit(X_train, y_train)

# Train a Random Forest Classifier model and print the model score
unscaled_rf_clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)


print(f"Training Data Score for unscaled LR: {unscaled_lr_classifier.score(X_train, y_train)}")
print(f"Testing Data Score for unscaled LR: {unscaled_lr_classifier.score(X_test, y_test)}")
print(f'Training Score for unscaled RF: {unscaled_rf_clf.score(X_train, y_train)}')
print(f'Testing Score for unscaled RF: {unscaled_rf_clf.score(X_test, y_test)}')



In [18]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model on the scaled data and print the model score
scaled_lr_clf_2 = LogisticRegression().fit(X_train_scaled, y_train)

In [19]:
# Train the Logistic Regression model on the scaled data and print the model score
clf_2 = LogisticRegression().fit(X_train_scaled, y_train)
print(f'Training Score: {clf_2.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf_2.score(X_test_scaled, y_test)}')

In [20]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf2 = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf2.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf2.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.5942152275627393


In [21]:
sel = SelectFromModel(clf2)
sel.fit(X_train_scaled, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True,
                                                 class_weight=None,
                                                 criterion='gini',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=500, n_jobs=None,
                                                 oob_score=False,
                                                 random_state=1, verbose=0,
     

In [25]:
X_selected_train = sel.transform(X_train)
X_selected_test = sel.transform(X_test)
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)

In [26]:
clf_3 = LogisticRegression()
clf_3.fit(X_selected_train_scaled, y_train)
print(f'Training Score: {clf_3.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {clf_3.score(X_selected_test_scaled, y_test)}')



Training Score: 0.7066502463054187
Testing Score: 0.7218205019140791


In [27]:
clf_4 = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_selected_train_scaled, y_train)
print(f'Training Score: {clf_4.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf_4.score(X_test_scaled, y_test)}')

NameError: name 'clf4' is not defined