In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import mlflow as mlf

In [2]:
# Make "loan_status" the active MLflow experiment for the logging calls in this notebook.
mlf.set_experiment("loan_status")

# Disabled: would tag the run with an explicit display name ("fourth-run").
#mlf.set_tag("mlflow.runName","fourth-run")

In [3]:
# Load the loan dataset; 'data.csv' is a relative path, so it is resolved against
# the kernel's current working directory.
df = pd.read_csv('data.csv')

In [4]:
# Count the missing values per column in the freshly loaded frame.
df.isna().sum()

Loan_ID               0
Gender               10
Married               3
Dependents           14
Education             0
Self_Employed        20
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           16
Loan_Amount_Term      8
Credit_History       36
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
# Drop every row that has a missing value in any column. This mutates df in place,
# so the raw frame is only recoverable by re-running the read_csv cell.
df.dropna(inplace=True)

In [6]:
# Display the frame as it stands after the rows with missing values were dropped.
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP002529,Male,Yes,2,Graduate,No,6700,1750.0,230.0,300.0,1.0,Semiurban,Y
1,LP001385,Male,No,0,Graduate,No,5316,0.0,136.0,360.0,1.0,Urban,Y
2,LP001926,Male,Yes,0,Graduate,No,3704,2000.0,120.0,360.0,1.0,Rural,Y
3,LP001144,Male,Yes,0,Graduate,No,5821,0.0,144.0,360.0,1.0,Urban,Y
5,LP001367,Male,Yes,1,Graduate,No,3052,1030.0,100.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,LP002449,Male,Yes,0,Graduate,No,2483,2466.0,90.0,180.0,0.0,Rural,Y
424,LP002519,Male,Yes,3+,Graduate,No,4691,0.0,100.0,360.0,1.0,Semiurban,Y
426,LP002531,Male,Yes,1,Graduate,Yes,16667,2250.0,86.0,360.0,1.0,Semiurban,Y
428,LP001940,Male,Yes,2,Graduate,No,3153,1560.0,134.0,360.0,1.0,Urban,Y


In [7]:
# Integer-encode the two-valued categorical columns.
# NOTE: running this cell a second time maps the already-encoded 0/1 values to NaN,
# because they are no longer keys of these dictionaries.
binary_encodings = {
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'Yes': 0, 'No': 1},
    'Loan_Status': {'Y': 0, 'N': 1},
}
for column, encoding in binary_encodings.items():
    df[column] = df[column].map(encoding)

In [8]:
# Preview the first rows to check the integer encoding of Gender, Married and Loan_Status.
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP002529,0,0,2,Graduate,No,6700,1750.0,230.0,300.0,1.0,Semiurban,0
1,LP001385,0,1,0,Graduate,No,5316,0.0,136.0,360.0,1.0,Urban,0
2,LP001926,0,0,0,Graduate,No,3704,2000.0,120.0,360.0,1.0,Rural,0
3,LP001144,0,0,0,Graduate,No,5821,0.0,144.0,360.0,1.0,Urban,0
5,LP001367,0,0,1,Graduate,No,3052,1030.0,100.0,360.0,1.0,Urban,0


In [9]:
# Re-count missing values per column after dropping rows and encoding the categoricals.
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [10]:
# Columns used as model inputs; Loan_Status is the prediction target.
features = [
    'Gender',
    'Married',
    'ApplicantIncome',
    'LoanAmount',
    'Credit_History',
]
X, Y = df[features], df['Loan_Status']

# Record the chosen feature list on the MLflow run. The parameter key is spelled
# 'features_colums'; it is kept unchanged so the logged parameter name stays the same.
mlf.log_param('features_colums', features)

['Gender', 'Married', 'ApplicantIncome', 'LoanAmount', 'Credit_History']

In [11]:
from sklearn.model_selection import train_test_split as tts 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 


# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = tts(X, Y, test_size=0.2, random_state=42)

# Fit a depth-limited random forest (fit() returns the estimator itself).
max_depth = 6
model = RandomForestClassifier(max_depth=max_depth, random_state=42).fit(xtrain, ytrain)

# Record the hyperparameter on the MLflow run.
mlf.log_param('max_depth', max_depth)



6

In [12]:
# Accuracy on the held-out split, printed and logged as an MLflow metric.
pred = model.predict(xtest)
val_accuracy = accuracy_score(y_true=ytest, y_pred=pred)
print(val_accuracy)
mlf.log_metric('val_accuracy_score', val_accuracy)


0.8507462686567164


In [13]:
# Accuracy on the training split, printed and logged as an MLflow metric.
train_pred = model.predict(xtrain)
accuracy = accuracy_score(y_true=ytrain, y_pred=train_pred)
print(accuracy)
mlf.log_metric('train_accuracy_score', accuracy)


0.8507462686567164


In [14]:
# End the currently active MLflow run. The log_param/log_metric calls above were made
# without an explicit start_run(), so the run has to be closed by hand here.
mlf.end_run()

# Alternative way to log to MLflow: wrap the run in a `with mlf.start_run():` block

In [16]:
# Same train / evaluate / log sequence as the cells above, but scoped to a
# `with mlf.start_run():` block, which opens a new run and ends it when the block
# exits, so no explicit mlf.end_run() is needed.
with mlf.start_run():
    max_depth = 6
    model = RandomForestClassifier(max_depth=max_depth, random_state=42)

    model.fit(xtrain, ytrain)

    mlf.log_param('max_depth', max_depth)

    # accuracy_score is already imported alongside the other sklearn imports, so it
    # is not re-imported inside the run context.
    pred = model.predict(xtest)
    val_accuracy = accuracy_score(ytest, pred)
    print(val_accuracy)
    mlf.log_metric('val_accuracy_score', val_accuracy)

    train_pred = model.predict(xtrain)
    accuracy = accuracy_score(ytrain, train_pred)
    print(accuracy)
    mlf.log_metric('train_accuracy_score', accuracy)

    # Store the fitted estimator as a run artifact under "randomforest-model".
    mlf.sklearn.log_model(model, "randomforest-model")

    


    


0.8507462686567164
0.8507462686567164


In [17]:
def mlflow_run(n_estimators, max_depth, i):
    """Train one random forest in its own MLflow run and log its results.

    Reads the notebook-level ``xtrain``, ``ytrain``, ``xtest`` and ``ytest``
    splits, fits a ``RandomForestClassifier`` with the given hyperparameters
    (``random_state=5``), and logs the hyperparameters, both accuracies and
    the fitted model to a new MLflow run.

    Parameters
    ----------
    n_estimators : number of trees; logged under the key ``'n_estimator'``.
    max_depth : maximum tree depth; logged under the key ``'max_depth'``.
    i : run index; the run is named ``"hyperparam_run" + str(i)``.

    Returns
    -------
    None. All results are recorded through MLflow.
    """
    with mlf.start_run():
        model_rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=5)

        model_rf.fit(xtrain, ytrain)

        # Score the estimator trained in THIS run. Predicting with the notebook-level
        # `model` would log another model's accuracy for every hyperparameter setting.
        pred_val = model_rf.predict(xtest)
        val_accuracy = accuracy_score(ytest, pred_val)

        train_val = model_rf.predict(xtrain)
        train_accuracy = accuracy_score(ytrain, train_val)

        run = "hyperparam_run" + str(i)
        mlf.set_tag('mlflow.runName', run)

        mlf.log_param('n_estimator', n_estimators)
        mlf.log_param('max_depth', max_depth)
        mlf.log_metric('val_acc', val_accuracy)
        mlf.log_metric('train_acc', train_accuracy)

        # Store the fitted estimator as a run artifact under "model".
        mlf.sklearn.log_model(model_rf, "model")



In [18]:
# One trial: 10 trees, depth 2, logged as run "hyperparam_run1".
mlflow_run(n_estimators=10, max_depth=2, i=1)

