In [None]:
# Predict the Loan Status using Logistic Regression in scikit-learn

# Import the required Azure ML classes, pandas, and the scikit-learn split helper
from azureml.core import Workspace, Run
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Access the Workspace using the configuration found under "./config".
# NOTE(review): `ws` is not referenced again in the visible cells - confirm
# whether this handle is still needed.
ws = Workspace.from_config(path="./config")

In [None]:
# Get the context of the experiment run.
# `new_run` is the handle used further down to log the metrics and the
# confusion matrix, and finally to mark the run as complete.
new_run = Run.get_context()

In [None]:
# Read the raw loan data from the local CSV file into a DataFrame
loan_csv_path = "./data/loan.csv"
df = pd.read_csv(loan_csv_path)

In [None]:
# Keep only the columns used for modelling: the predictors plus the
# "Loan_Status" column that the target is later derived from.
selected_columns = [
    "Married",
    "Education",
    "Self_Employed",
    "ApplicantIncome",
    "LoanAmount",
    "Loan_Amount_Term",
    "Credit_History",
    "Loan_Status",
]
LoanPrep = df[selected_columns]

In [None]:
# Clean Missing Data - drop every ROW that has a missing value in any of the
# selected columns (dropna works row-wise here; no columns are removed).
LoanPrep = LoanPrep.dropna(axis=0, how="any")

In [None]:
# Create dummy variables - not required in designer.
# drop_first=True leaves out the first level of every encoded column, so each
# two-level column becomes a single indicator. The following cell relies on the
# target indicator being named "Loan_Status_Y".
LoanPrep = pd.get_dummies(data=LoanPrep, drop_first=True)

In [None]:
# Create X and Y - similar to "edit columns" in the Train Module.
# Y is kept as a single-column DataFrame (double brackets); X is every
# remaining column of the prepared data.
target_column = "Loan_Status_Y"
Y = LoanPrep[[target_column]]
X = LoanPrep.drop(columns=[target_column])

In [None]:
# Split Data - hold out 30% of the rows as the test set and train on the rest.
# The split is stratified on Y and uses a fixed random_state so that it is
# repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
    stratify=Y,
)

In [None]:
# Build the Logistic Regression model.
# The estimator is created with scikit-learn's default settings; no
# hyperparameters are overridden here.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [None]:
# Fit the data to the LogisticRegression object - Train Model.
# Y_train is a single-column DataFrame, i.e. a column vector of shape (n, 1).
# scikit-learn expects a 1-D target and emits a DataConversionWarning when it
# is handed a column vector, so flatten the target explicitly before fitting.
lr.fit(X_train, Y_train.values.ravel())

In [None]:
# Predict the outcome using Test data - Score Model.
# Scored Label: the class predicted by the fitted model for each test row.
# Used below for the confusion matrix and for the scored dataset.
Y_predict = lr.predict(X_test)

In [None]:
# Get the probability score - Scored Probabilities.
# predict_proba yields one column per class; only the second column (index 1)
# is kept.
# NOTE(review): presumably this is the "Y" class - confirm against lr.classes_.
class_probabilities = lr.predict_proba(X_test)
Y_prob = class_probabilities[:, 1]

In [None]:
# Evaluate - score the model on the test set and build the confusion matrix
# of actual versus predicted labels.
from sklearn.metrics import confusion_matrix
score = lr.score(X_test, Y_test)
cm = confusion_matrix(y_true=Y_test, y_pred=Y_predict)

In [None]:
# Create the confusion matrix dictionary that is logged to the run below.
# The matrix is converted to plain nested lists and paired with the class
# labels.
confusion_matrix_payload = {
    "class_labels": ["N", "Y"],
    "matrix": cm.tolist(),
}
cm_dict = {
    "schema_type": "confusion_matrix",
    "schema_version": "v1",
    "data": confusion_matrix_payload,
}

In [None]:
new_run.log("TotalObservations", len(df))
new_run.log_confusion_matrix("ConfusionMatrix", cm_dict)
new_run.log("Score", score)

In [None]:
# Create the Scored Dataset: test features, actual label, predicted label and
# predicted probability side by side.

# Wrap the prediction arrays in single-column frames.
Y_predict_df = pd.DataFrame(Y_predict, columns=["Scored Label"])
Y_prob_df = pd.DataFrame(Y_prob, columns=["Scored Probabilities"])

# The test frames still carry the row labels they had before the split, while
# the prediction frames are numbered from 0. Resetting the index lets the
# column-wise concat line the rows up by position.
X_test = X_test.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

scored_parts = [X_test, Y_test, Y_predict_df, Y_prob_df]
scored_dataset = pd.concat(scored_parts, axis=1)

In [None]:
# Save the scored dataset as a CSV file under ./outputs.
# to_csv does not create missing directories, so make sure the folder exists
# first; otherwise the write fails on a fresh checkout or local run.
# NOTE(review): the original comment calls this an "upload" - presumably the
# run collects the contents of ./outputs; confirm for the target environment.
import os

os.makedirs("./outputs", exist_ok=True)
scored_dataset.to_csv("./outputs/loan_scored.csv", index=False)

In [None]:
# Complete the run - called last, after the metrics have been logged and the
# scored dataset has been written.
new_run.complete()