Logistic Regression model deployment

Decision Trees

Import our dataset

In [2]:
from azureml.core import Workspace

# Connect to your Azure ML workspace
workspace = Workspace.from_config()

In [3]:
from azureml.core import Dataset

# Retrieve the data asset by its name (e.g., 'my_data_asset')
dataset = Dataset.get_by_name(workspace, name='Loan_dataset')

In [None]:
# Convert the dataset to a pandas DataFrame (if it is a tabular dataset)
df = dataset.to_pandas_dataframe()

# Display the first few rows of the DataFrame
print(df.head())

In [None]:
import pandas as pd


In [6]:
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,False,0,Graduate,False,5849,0,,360.0,1.0,Urban,True
1,LP001003,Male,True,1,Graduate,False,4583,1508,128.0,360.0,1.0,Rural,False
2,LP001005,Male,True,0,Graduate,True,3000,0,66.0,360.0,1.0,Urban,True
3,LP001006,Male,True,0,Not Graduate,False,2583,2358,120.0,360.0,1.0,Urban,True
4,LP001008,Male,False,0,Graduate,False,6000,0,141.0,360.0,1.0,Urban,True
5,LP001011,Male,True,2,Graduate,True,5417,4196,267.0,360.0,1.0,Urban,True
6,LP001013,Male,True,0,Not Graduate,False,2333,1516,95.0,360.0,1.0,Urban,True
7,LP001014,Male,True,3+,Graduate,False,3036,2504,158.0,360.0,0.0,Semiurban,False
8,LP001018,Male,True,2,Graduate,False,4006,1526,168.0,360.0,1.0,Urban,True
9,LP001020,Male,True,1,Graduate,False,12841,10968,349.0,360.0,1.0,Semiurban,False


Exploring our Data

In [7]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [8]:
df.info() #We have a lot of object data types which we will have to check and deal with later 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    int64  
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    bool   
dtypes: bool(1), float64(3), int64(2), object(7)
memory usage: 58.3+ KB


Check for missing values

In [9]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [10]:
#check the shape of the data
df.shape # the datset is not big, having 614 row and 13 columns - let's find away to deal with the missing data without removing - mode, mean, median

(614, 13)

In [11]:
#Before we start working on the missing data, lets see what variables have a strong correlation with our dependent variable - 'Loan_Status'
#If an indenpendent variable has little to no correlation with our target variable - we could possibly remove it and save us some time

#import seaborn as sns #importing seaborn to access a heatmap
#sns.heatmap(df.corr(),annot=True, cmap='RdYlGn') #The seaborn will only map the numerical variables - lets one hot encode out target variable to 0 for No and 1 for yes

In [12]:
df['Loan_Status'].value_counts()

Loan_Status
True     422
False    192
Name: count, dtype: int64

In [13]:
vmap = {False:0, True:1} 

df['Loan_Status'] = df['Loan_Status'].map(vmap) #Mapping 0 to 'No' and 1 to 'Y'

In [14]:
df['Loan_Status'].head()

0    1
1    0
2    1
3    1
4    1
Name: Loan_Status, dtype: int64

In [15]:
#Let's check Loan_Status dtype
df['Loan_Status'].dtype

dtype('int64')

In [16]:
#Now that Loan_Status is a integer, lets run seaborn again
#sns.heatmap(df.corr(),annot=True, cmap='RdYlGn')

Let's start dealing with the missing values in our dataset and check for outliers

In [17]:
#Gender is missing 13 values
df['Gender'].value_counts() 

# we can see that there is 4 to 1 split in favour of male applicants - so we can use mode, mean is not an option as we will be mapping the genders to 0 or 1

Gender
Male      489
Female    112
Name: count, dtype: int64

In [18]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])

In [19]:
df['Gender'].isnull().sum()

0

In [20]:
#Drop loan_ID as this will have no impact on our result
df.drop('Loan_ID',axis=1,inplace=True) # high cardinality

In [21]:
df.shape

(614, 12)

In [22]:
#Let's check the married Independent variable with 3 missing data entrys
df['Married'].value_counts()

Married
True     398
False    213
Name: count, dtype: int64

In [23]:
df['Married'].head(10)

0    False
1     True
2     True
3     True
4    False
5     True
6     True
7     True
8     True
9     True
Name: Married, dtype: object

In [24]:
#Let's drop NaN rows for Married
df.dropna(subset=['Married'],inplace=True)

In [25]:
df['Married'].isnull().sum() #Now we have no missing values

0

In [26]:
#Lets map 'No' to 0 and "Yes' to 1 for the aried indepdendent variable
mmap = {False:0, True:1} 

df['Married'] = df['Married'].map(mmap) #Mapping 0 to 'No' and 1 to 'Y'

In [27]:
df['Married'].head()

0    0
1    1
2    1
3    1
4    0
Name: Married, dtype: int64

In [28]:
#sns.heatmap(df.corr(),annot=True, cmap='RdYlGn') #lets run a heatmap coorelation again to check if we should keep married independent variable

In [29]:
#Okay whats next - lets check the total NaN values
df.isnull().sum()

Gender                0
Married               0
Dependents           12
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [30]:
df['Married'].value_counts()

Married
1    398
0    213
Name: count, dtype: int64

In [31]:
#Now on to Dependents variable
df['Dependents'].isnull().sum()

12

In [32]:
df['Dependents'].value_counts()

Dependents
0     345
1     102
2     101
3+     51
Name: count, dtype: int64

In [33]:
df['Dependents'].head(15) #print the first 5 rows - Note: this variable is an object data type

0      0
1      1
2      0
3      0
4      0
5      2
6      0
7     3+
8      2
9      1
10     2
11     2
12     2
13     0
14     2
Name: Dependents, dtype: object

In [34]:
#Before we convert our Dependents column - we must deal with the 3+ values
#if we have convert our 3+ now to an interger they will show as a NaN value and get mixed in with our current NaN values
#We will change our '3+' entries to 99
df['Dependents'] = df['Dependents'].replace('3+','99')

In [35]:
#Let's convert the column Dependents to an integer
#Converting an object column with NaN values to integers can be a bit tricky when using pandas - we use a nulaable integer type
#Numpy inetgers like int6464 do not support NaN values.
df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce').astype('Int64')

In [36]:
#df['Dependents'] = df['Dependents'].astype(int)

#I have tried changing the data type to an int but 3+ will cause an issue
#I will try map the 3+ value to a random list between 4 and 8 assign these values
#23 can see the 99 which we changed in previous code
df['Dependents'].head(10)

0     0
1     1
2     0
3     0
4     0
5     2
6     0
7    99
8     2
9     1
Name: Dependents, dtype: Int64

In [37]:
df['Dependents'].isnull().sum() #after converting the Dependents column we have 60 NaN values which represent our 3+

12

In [38]:
#I want to fill the mean missing values with the mean but I first want to replace by 3+ values with a random number between 3 and 8

import random #importing random module to use for randoming choosing a number between 3 and 8

df['Dependents'] = df['Dependents'].apply(
    lambda x: (random.randint(3,9)) if not pd.isna(x) and x == 99 else x # if not pd.isna(x) -  checks is value is not a NaN and where x = 99
)

In [39]:
df['Dependents'].head(10) #We can see our 3+ entries have been changed to a random int between 3 and 8

0    0.0
1    1.0
2    0.0
3    0.0
4    0.0
5    2.0
6    0.0
7    3.0
8    2.0
9    1.0
Name: Dependents, dtype: float64

In [40]:
df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce').astype('Int64') #for some reason the df['Dependents'] is still showing as dtype: object - let's change that

In [41]:
df['Dependents'].dtype

Int64Dtype()

In [42]:
df['Dependents'].head(10)

0    0
1    1
2    0
3    0
4    0
5    2
6    0
7    3
8    2
9    1
Name: Dependents, dtype: Int64

In [43]:
#fill the missing values with the mean
#df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mean())

#I get an error runing the code above so I will use the round method and change the data type
# Fill NaNs with the mean, rounded to nearest whole number
mean_val = round(df['Dependents'].mean())
df['Dependents'] = df['Dependents'].fillna(mean_val)

# Convert to int64
df['Dependents'] = df['Dependents'].astype('int64')

In [44]:
#check for Null values
df.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 611 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             611 non-null    object 
 1   Married            611 non-null    int64  
 2   Dependents         611 non-null    int64  
 3   Education          611 non-null    object 
 4   Self_Employed      579 non-null    object 
 5   ApplicantIncome    611 non-null    int64  
 6   CoapplicantIncome  611 non-null    int64  
 7   LoanAmount         590 non-null    float64
 8   Loan_Amount_Term   597 non-null    float64
 9   Credit_History     561 non-null    float64
 10  Property_Area      611 non-null    object 
 11  Loan_Status        611 non-null    int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 62.1+ KB


In [46]:
#Now we will work on the Self_employed 32 missing values
df['Self_Employed'].value_counts()

Self_Employed
False    497
True      82
Name: count, dtype: int64

In [47]:
semap = {False:0, True:1} 

df['Self_Employed'] = df['Self_Employed'].map(semap) #Mapping 0 to 'No' and 1 to 'Y'

In [48]:
#I will fill the missing values in the self_employedd column with the mode - as the majority of entries are 'No'
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])

df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,0,0,Graduate,0.0,5849,0,,360.0,1.0,Urban,1
1,Male,1,1,Graduate,0.0,4583,1508,128.0,360.0,1.0,Rural,0
2,Male,1,0,Graduate,1.0,3000,0,66.0,360.0,1.0,Urban,1
3,Male,1,0,Not Graduate,0.0,2583,2358,120.0,360.0,1.0,Urban,1
4,Male,0,0,Graduate,0.0,6000,0,141.0,360.0,1.0,Urban,1


In [49]:
df['Self_Employed'] = df['Self_Employed'].astype(int)

In [50]:
df['Self_Employed'].isnull().sum() #no missing values

0

In [51]:
df['Self_Employed'].head(10)

0    0
1    0
2    1
3    0
4    0
5    1
6    0
7    0
8    0
9    0
Name: Self_Employed, dtype: int64

In [52]:
#sns.heatmap(df.corr(),annot=True, cmap='RdYlGn')

In [53]:
#Checking missing values now in our dataset
df.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 611 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             611 non-null    object 
 1   Married            611 non-null    int64  
 2   Dependents         611 non-null    int64  
 3   Education          611 non-null    object 
 4   Self_Employed      611 non-null    int64  
 5   ApplicantIncome    611 non-null    int64  
 6   CoapplicantIncome  611 non-null    int64  
 7   LoanAmount         590 non-null    float64
 8   Loan_Amount_Term   597 non-null    float64
 9   Credit_History     561 non-null    float64
 10  Property_Area      611 non-null    object 
 11  Loan_Status        611 non-null    int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 62.1+ KB


In [55]:
df['Loan_Amount_Term'].value_counts()

Loan_Amount_Term
360.0    511
180.0     44
480.0     14
300.0     13
84.0       4
120.0      3
240.0      3
60.0       2
36.0       2
12.0       1
Name: count, dtype: int64

In [56]:
#Majority of entries are 360.0 so we will replace all NaN entires with 360.0
#I could use mean but we have only 14 missing values so I will replace the NaN entries with mode

df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(360.0).astype(float)

In [57]:
df['Loan_Amount_Term'].isnull().sum()

0

In [58]:
df['Loan_Amount_Term'].value_counts()

Loan_Amount_Term
360.0    525
180.0     44
480.0     14
300.0     13
84.0       4
120.0      3
240.0      3
60.0       2
36.0       2
12.0       1
Name: count, dtype: int64

In [59]:
df.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [60]:
#Replacing 'LoanAmount'
df['LoanAmount'].value_counts()

LoanAmount
120.0    20
110.0    17
100.0    15
187.0    12
128.0    11
         ..
240.0     1
214.0     1
59.0      1
166.0     1
253.0     1
Name: count, Length: 203, dtype: int64

In [61]:
#Here I will replace the NaN with the mean
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())

df['LoanAmount'] = df['LoanAmount'].round() #rounding all numbers

In [62]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,0,0,Graduate,0,5849,0,146.0,360.0,1.0,Urban,1
1,Male,1,1,Graduate,0,4583,1508,128.0,360.0,1.0,Rural,0
2,Male,1,0,Graduate,1,3000,0,66.0,360.0,1.0,Urban,1
3,Male,1,0,Not Graduate,0,2583,2358,120.0,360.0,1.0,Urban,1
4,Male,0,0,Graduate,0,6000,0,141.0,360.0,1.0,Urban,1


In [63]:
#Credit history is important as we can see from our seaborn heatmap it has a strong correlation with Loan_Status our target variable
#sns.heatmap(df.corr(),annot=True, cmap='RdYlGn')

In [64]:
df['Credit_History'].value_counts()

Credit_History
1.0    472
0.0     89
Name: count, dtype: int64

In [65]:
#filling credit history with mode - 49 missing values
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [66]:
df['Credit_History'].isnull().sum()

0

In [67]:
#sns.heatmap(df.corr(),annot=True, cmap='RdYlGn')

In [68]:
#Checking dataframe data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 611 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             611 non-null    object 
 1   Married            611 non-null    int64  
 2   Dependents         611 non-null    int64  
 3   Education          611 non-null    object 
 4   Self_Employed      611 non-null    int64  
 5   ApplicantIncome    611 non-null    int64  
 6   CoapplicantIncome  611 non-null    int64  
 7   LoanAmount         611 non-null    float64
 8   Loan_Amount_Term   611 non-null    float64
 9   Credit_History     611 non-null    float64
 10  Property_Area      611 non-null    object 
 11  Loan_Status        611 non-null    int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 62.1+ KB


In [69]:
#Mapping our object data to intgers 0 and 1 for 'Gender', 'Education' and 0,1,2 for'Property Area'
#Change data type of the column

In [70]:
#'Education'

df['Education'].value_counts()

Education
Graduate        477
Not Graduate    134
Name: count, dtype: int64

In [71]:
edmap = {'Not Graduate':0, 'Graduate':1} 

df['Education'] = df['Education'].map(edmap)

In [72]:
df['Education'].head()

0    1
1    1
2    1
3    0
4    1
Name: Education, dtype: int64

In [74]:
df['Education'].dtype

dtype('int64')

In [75]:
#'Gender'
df['Gender'].value_counts()

Gender
Male      500
Female    111
Name: count, dtype: int64

In [76]:
genmap = {'Female':0, 'Male':1} 

df['Gender'] = df['Gender'].map(genmap)

In [77]:
df['Gender'].dtype

dtype('int64')

In [78]:
df['Gender'].head()

0    1
1    1
2    1
3    1
4    1
Name: Gender, dtype: int64

In [79]:
#'Property Area'
df['Property_Area'].value_counts()

Property_Area
Semiurban    231
Urban        201
Rural        179
Name: count, dtype: int64

In [80]:
propmap = {'Semiurban':0, 'Urban':1,'Rural':2}

df['Property_Area'] = df['Property_Area'].map(propmap)

In [81]:
df['Property_Area'].head()

0    1
1    2
2    1
3    1
4    1
Name: Property_Area, dtype: int64

In [82]:
#Let's run our seaborn heatmap again and check if we can perform Principal Component Analysis and remove independent variables not required
# import matplotlib.pyplot as plt
# plt.figure(figsize=(14,10))
# sns.heatmap(df.corr(),annot=True, cmap='RdYlGn')

# plt.title('Correlation heatmap')
# plt.show()

In [83]:
#Property Area has a little to no correlation on our target variable - I will drop this column
df = df.drop('Property_Area',axis=1)

In [84]:
# plt.figure(figsize=(14,10))
# sns.heatmap(df.corr(),annot=True, cmap='RdYlGn')

# plt.title('Correlation heatmap')
# plt.show()

In [85]:
#Scaling our data - some loan amounts and Income might be a lot bigger than others in our dataset - which could lead to bias
#I will check the min and max range and decide if I should scale our data between 0 and 1

df.describe()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
count,611.0,611.0,611.0,611.0,611.0,611.0,611.0,611.0,611.0,611.0,611.0
mean,0.818331,0.651391,1.055646,0.780687,0.134206,5399.513912,1627.97054,146.356792,342.324059,0.854337,0.685761
std,0.385888,0.47692,1.866929,0.41412,0.341153,6120.764249,2931.748366,84.240777,64.208419,0.353057,0.464592
min,0.0,0.0,0.0,0.0,0.0,150.0,0.0,9.0,12.0,0.0,0.0
25%,1.0,0.0,0.0,1.0,0.0,2875.5,0.0,100.0,360.0,1.0,0.0
50%,1.0,1.0,0.0,1.0,0.0,3800.0,1213.0,128.0,360.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,0.0,5790.0,2304.0,165.0,360.0,1.0,1.0
max,1.0,1.0,9.0,1.0,1.0,81000.0,41667.0,700.0,480.0,1.0,1.0


In [1]:
df.columns

NameError: name 'df' is not defined

In [86]:
#After review the data, we can see that 'Applicant Income', 'Loan_Term_amount' and 'CoapplicantIncome' hav a high standard deviation and could affect our models results
#Lets scale our data between 0 and 1
#we will use the MinMaxScaler class from sklearn module

In [87]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [88]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() #create an instance of MinMaxScaler class

#Split our data into X and Y variables - independent and dependent (target) variables 

X = df.drop('Loan_Status',axis=1) #dropping 'Loan_Status from X - independent varibales 
y = df['Loan_Status'] #Target Variable
#scale our data - will return a numpy array
df_scale = scaler.fit_transform(X)

#Convert back to a Dataframe
df_scaled = pd.DataFrame(df_scale,columns=X.columns)

In [89]:
df_scaled.head() #the data is now scaled between 0 and 1

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,1.0,0.0,0.0,1.0,0.0,0.070489,0.0,0.198263,0.74359,1.0
1,1.0,1.0,0.111111,1.0,0.0,0.05483,0.036192,0.172214,0.74359,1.0
2,1.0,1.0,0.0,1.0,1.0,0.03525,0.0,0.082489,0.74359,1.0
3,1.0,1.0,0.0,0.0,0.0,0.030093,0.056592,0.160637,0.74359,1.0
4,1.0,0.0,0.0,1.0,0.0,0.072356,0.0,0.191027,0.74359,1.0


In [None]:
df_scaled.describe()

In [138]:
#Lets confirm that we have no more object data types in our dataset  -- this should return 0
categor = [i for i in df_scaled.columns if df[i].dtype == 'O']

print(f'We have {len(categor)} in our dataset')

We have 0 in our dataset


In [139]:
#Checking if our Target variable is balanced - this shows the majority of Target values are 1
#Let's check class distribution
#419/611 = 68.6% for 1
#192/611 = 31.4 for 0
#Our dataset is not massively imbalanced so I will not add a class_weight hyperparamter to our model
df['Loan_Status'].value_counts()

Loan_Status
1    419
0    192
Name: count, dtype: int64

LOGISTIC REGRESSION (Sigmoid Function)

In [140]:
from sklearn.model_selection import train_test_split #train_test_split class is imported to split our data in train and test data

#We have a small datset so I will split the datasets 80/20
#As we have a small imbalanced dataset I will use the stratify parameter to avoid bias
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42, stratify=y)

In [141]:
#Lets check the shape our train and test datasets
X_train.shape

(488, 10)

In [142]:
X_test.shape

(123, 10)

In [143]:
y_train.shape

(488,)

In [144]:
y_test.shape

(123,)

In [145]:
from sklearn.linear_model import LogisticRegression #importing our Logistic Regression model

In [146]:
model = LogisticRegression(C=1.0,penalty='l1',solver='liblinear') #creating an instance of class Logistic Regression and setting our hyperparameters

#C = 1.0: controls how much the model is penalized on large coefficents - C = 1.0 is the defult
#penalty = 'l1' : can shrink some weights to zero, some of our independent variables do not have a strong correlation with the target variable so this will help with feature selection
#solver='liblinear' : good for small datasets and support l1 (lasso)

In [147]:
#fit the model to our train data

model.fit(X_train,y_train)

In [159]:
#Now our model has been trained on our data I will use Pickle to export our trained model and import it again to run on our test data
#open a file, where you want to store the data

import pickle

with open('Loan_Data_Logistic_model.pkl','wb') as file:
    #dump information to the file
    pickle.dump(model,file)   

In [149]:
with open('Loan_Data_Logistic_model.pkl','rb') as file: #Loading our trained model that we dumped as a pkl file
    loaded_model = pickle.load(file)

In [150]:
loaded_model

In [151]:
y_pred = loaded_model.score(X_test,y_test) #Run our trained model on our test dataset

In [152]:
#checking how accurate our model was on our test data
from sklearn.metrics import accuracy_score 

y_pred_test = loaded_model.predict(X_test)

acc = accuracy_score(y_test, y_pred_test)
print(f'Accuracy: {acc:.2f}%')

Accuracy: 0.86%


In [153]:
#print the score on training and test set

print(f'Training set score: {model.score(X_train,y_train):.2f}%') #accuracy of our model on our train data

Training set score: 0.80%


In [132]:
#Considering we have a very small dataset, I believe the model has done well and produced good reults.

In [154]:
#Lets try and use a model with a larger C value to decrease regularization and check our models accuracy

model100 = LogisticRegression(C=100,penalty='l1',solver='liblinear')

In [155]:
train_pred = model100.fit(X_train,y_train)

In [156]:
print(f'Training set score: {train_pred.score(X_train,y_train):.2f}%') #accuracy of our model on our train data

Training set score: 0.80%


In [157]:
#Nothing has changed we still got 80%

In [158]:
#Visualize our data accuracy with a classifcation report 

from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       1.00      0.56      0.72        39
           1       0.83      1.00      0.91        84

    accuracy                           0.86       123
   macro avg       0.92      0.78      0.81       123
weighted avg       0.89      0.86      0.85       123



In [139]:
#for class 0Class 0 (minority class?)
# Precision: 1.00 → Perfect when it predicted class 0
# Recall: 0.56 → Only 56% of actual class 0s were correctly found.
# F1: 0.72 → A bit of imbalance; model is cautious about predicting class 0.

# Class 1
# Precision: 0.83 → Sometimes predicts class 1 when it's not.
# Recall: 1.00 → Caught all actual class 1 cases.
# F1: 0.91 → Strong overall performance on this class.

#Overall I am happy with the results of the model on our small dataset, we would a bigger dataset with less imbalance to increase the accuracy of this model

In [161]:
import joblib

joblib.dump(model,'LR_model.pkl')

['LR_model.pkl']

In [1]:
!pip show azure-ai-ml

Name: azure-ai-ml
Version: 1.27.1
Summary: Microsoft Azure Machine Learning Client Library for Python
Home-page: https://github.com/Azure/azure-sdk-for-python
Author: Microsoft Corporation
Author-email: azuresdkengsysadmins@microsoft.com
License: MIT License
Location: /anaconda/envs/azureml_py38/lib/python3.10/site-packages
Requires: azure-common, azure-core, azure-mgmt-core, azure-monitor-opentelemetry, azure-storage-blob, azure-storage-file-datalake, azure-storage-file-share, colorama, isodate, jsonschema, marshmallow, msrest, pydash, pyjwt, pyyaml, six, strictyaml, tqdm, typing-extensions
Required-by: 


In [2]:
!pip install --upgrade --quiet azure-ai-ml azure-identity


In [3]:
!pip install azure-ai-ml --upgrade --quiet

In [4]:
!pip install azure-identity --upgrade --quiet

In [10]:
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential)

model = Model(
    path='LR_model.pkl',
    name='Logistc_Regression_Loan_Apprval',
    type=AssetTypes.CUSTOM_MODEL,
    description='Model for loan Aproval'
)

Found the config file in: /config.json
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


In [11]:
registered_model = ml_client.models.create_or_update(model)
print(f'Model Registered: {registered_model} and {registered_model.version}')

[32mUploading LR_model.pkl[32m (< 1 MB): 0.00B [00:00, ?B/s][32mUploading LR_model.pkl[32m (< 1 MB): 100%|██████████| 1.35k/1.35k [00:00<00:00, 51.9kB/s]
[39m



Model Registered: creation_context:
  created_at: '2025-06-03T11:54:20.578694+00:00'
  created_by: marty ryan
  created_by_type: User
  last_modified_at: '2025-06-03T11:54:20.578694+00:00'
  last_modified_by: marty ryan
  last_modified_by_type: User
description: Model for loan Aproval
id: azureml:/subscriptions/da2833c7-0511-40ba-bdd9-bd09dcb6069d/resourceGroups/az-mjrml-project1/providers/Microsoft.MachineLearningServices/workspaces/az-mjrml/models/Logistc_Regression_Loan_Apprval/versions/1
name: Logistc_Regression_Loan_Apprval
path: azureml://subscriptions/da2833c7-0511-40ba-bdd9-bd09dcb6069d/resourceGroups/az-mjrml-project1/workspaces/az-mjrml/datastores/workspaceblobstore/paths/LocalUpload/655ce68663138e12ae085c1abfa0fb11/LR_model.pkl
properties: {}
stage: Development
tags: {}
type: custom_model
version: '1'
 and 1
