# Part 1: Loading the dataset and libraries

In [1234]:
import pandas as pd

In [1235]:
data = pd.read_excel("loan.xlsx")

# Part 2: Data understanding

In [1236]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Rows and cols

In [1237]:
# heesanaa: 614 rows and 13 columns
data.shape

(614, 13)

## deleting uneeded variables

In [1238]:
# tableka kasaar: load_Id
data.drop(['Loan_ID'], axis = 1, inplace = True)

In [1239]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Part 3: Solving Missing Values

In [1240]:
data.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [1241]:
# columska missingka kujiraa oona ah object(categorical)
missing_object_columns = data.select_dtypes(include='object').columns[data.select_dtypes(include='object').isnull().any()]
print("Objects columns with missing values: ",missing_object_columns)




# columska missingka kujiraa oona ah int64 iyo float64(numerical)
missing_numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns[data.select_dtypes(include=['int64', 'float64']).isnull().any()]
print("Numerical columns with missing Values are : ", missing_numeric_columns)



Objects columns with missing values:  Index(['Gender', 'Married', 'Dependents', 'Self_Employed'], dtype='object')
Numerical columns with missing Values are :  Index(['LoanAmount', 'Loan_Amount_Term', 'Credit_History'], dtype='object')


In [1242]:
## missing valueska numberka ah kubuuxi meankooda.
mean_loan_amount = data['LoanAmount'].mean()
mean_credit_history = data['Credit_History'].mean()
mean_loan_amount_term = data['Loan_Amount_Term'].mean()

## missing valueska text aha maa numerical ka  ah, kubuuxi meankkooda.
data['LoanAmount'].fillna(mean_loan_amount, inplace = True)
data['Credit_History'].fillna(mean_credit_history, inplace = True)
data['Loan_Amount_Term'].fillna(mean_loan_amount_term, inplace = True)


mode_gender = data['Gender'].mode()[0]
mode_married = data['Married'].mode()[0]
mode_dependents = data['Dependents'].mode()[0]
mean_self_employed = data['Self_Employed'].mode()[0]

## missing valueska text aha maa categoricalka ah, kubuuxi modekooda.
data['Gender'].fillna(mode_gender, inplace = True)
data['Married'].fillna(mode_married, inplace = True)
data['Dependents'].fillna(mode_dependents, inplace = True)
data['Self_Employed'].fillna(mean_self_employed, inplace = True)





In [1243]:
# there is no missing values on our dataset
data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# Part 5: Feature Engineering: Adding New Features

In [1244]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [1245]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [1246]:
import numpy as np


# TotalIncome: waa column oo kujira dhamaan daqliga soo galo
data['TotalIncome'] = data['ApplicantIncome'] + data['CoapplicantIncome']

#  LoanAmount_Log feature
# data['LoanAmount_Log'] = np.log(data['LoanAmount'])

#  LoanAmount_to_Income  waa column lagu oganayo qofkan deenta lagu leeyahay meeqo ayay u taganathy boqolkiiba.
# data['LoanAmount_to_Income'] = data['LoanAmount'] / data['TotalIncome']


# Display the updated dataset
print(data.head())

  Gender Married Dependents     Education Self_Employed  ApplicantIncome  \
0   Male      No          0      Graduate            No             5849   
1   Male     Yes          1      Graduate            No             4583   
2   Male     Yes          0      Graduate           Yes             3000   
3   Male     Yes          0  Not Graduate            No             2583   
4   Male      No          0      Graduate            No             6000   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
0                0.0  146.412162             360.0             1.0   
1             1508.0  128.000000             360.0             1.0   
2                0.0   66.000000             360.0             1.0   
3             2358.0  120.000000             360.0             1.0   
4                0.0  141.000000             360.0             1.0   

  Property_Area Loan_Status  TotalIncome  
0         Urban           Y       5849.0  
1         Rural           N       60

In [1247]:
# data.drop([['CoapplicantIncome', 'ApplicantIncome']], axis = 1)
data.drop(['CoapplicantIncome', 'ApplicantIncome', 'Property_Area'], axis=1, inplace=True)
# data.head()

In [1248]:
# data.drp[['ApplicantIncome', 'CoapplicantIncome', '']]

In [1249]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,TotalIncome
0,Male,No,0,Graduate,No,146.412162,360.0,1.0,Y,5849.0
1,Male,Yes,1,Graduate,No,128.0,360.0,1.0,N,6091.0
2,Male,Yes,0,Graduate,Yes,66.0,360.0,1.0,Y,3000.0
3,Male,Yes,0,Not Graduate,No,120.0,360.0,1.0,Y,4941.0
4,Male,No,0,Graduate,No,141.0,360.0,1.0,Y,6000.0


In [1250]:
data.shape

(614, 10)

# Part 6: Data Visualization

# Part 7: Encding the Categorical Varaibles

In [1251]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,TotalIncome
0,Male,No,0,Graduate,No,146.412162,360.0,1.0,Y,5849.0
1,Male,Yes,1,Graduate,No,128.0,360.0,1.0,N,6091.0
2,Male,Yes,0,Graduate,Yes,66.0,360.0,1.0,Y,3000.0
3,Male,Yes,0,Not Graduate,No,120.0,360.0,1.0,Y,4941.0
4,Male,No,0,Graduate,No,141.0,360.0,1.0,Y,6000.0


In [1252]:
data.Gender.value_counts()

Male      502
Female    112
Name: Gender, dtype: int64

In [1253]:
data['Gender'] = data['Gender'].map({'Male': 1, 'Female': 0})
data['Married'] = data['Married'].map({'Yes': 1, 'No': 0})
data['Education'] = data['Education'].map({'Graduate': 1, 'Not Graduate': 0})
data['Self_Employed'] = data['Self_Employed'].map({'Yes': 1, 'No': 0})
data['Loan_Status'] = data['Loan_Status'].map({'Y': 1, 'N': 0})
# data['Property_Area'] = data['Property_Area'].map({'Semiurban': 0, 'Urban': 1, 'Rural' : 2})
# data['Dependents'] = data['Dependents'].map({'0': 0, '1': 1, '2' : 2, '3+': 3})
data['Dependents'] = data['Dependents'].replace({'3+': 3}).astype(int)



In [1254]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,TotalIncome
0,1,0,0,1,0,146.412162,360.0,1.0,1,5849.0
1,1,1,1,1,0,128.0,360.0,1.0,0,6091.0
2,1,1,0,1,1,66.0,360.0,1.0,1,3000.0
3,1,1,0,0,0,120.0,360.0,1.0,1,4941.0
4,1,0,0,1,0,141.0,360.0,1.0,1,6000.0


In [1255]:
data.dtypes

Gender                int64
Married               int64
Dependents            int64
Education             int64
Self_Employed         int64
LoanAmount          float64
Loan_Amount_Term    float64
Credit_History      float64
Loan_Status           int64
TotalIncome         float64
dtype: object

In [1256]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,TotalIncome
0,1,0,0,1,0,146.412162,360.0,1.0,1,5849.0
1,1,1,1,1,0,128.0,360.0,1.0,0,6091.0
2,1,1,0,1,1,66.0,360.0,1.0,1,3000.0
3,1,1,0,0,0,120.0,360.0,1.0,1,4941.0
4,1,0,0,1,0,141.0,360.0,1.0,1,6000.0


# Part 7: splitting Datasets into train and test

In [1257]:
from sklearn.model_selection import train_test_split

# Load the dataset into a DataFrame or numpy array
# Replace 'X' and 'y' with your actual data and target variable
X = data.drop('Loan_Status', axis=1)
y = data['Loan_Status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#

In [1258]:
# X_train

##  Normalizing train and test

In [1259]:
from sklearn.preprocessing import StandardScaler

#  0 1 ha u dhaxeeyao traina nd test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Perform normalization on the testing set
X_test= scaler.transform(X_test)


In [1260]:
X_train

array([[ 0.48339253,  0.75744171, -0.72327513, ...,  0.29449206,
        -0.00685711,  0.16730205],
       [ 0.48339253,  0.75744171, -0.72327513, ...,  0.29449206,
         0.44784556, -0.20640167],
       [ 0.48339253,  0.75744171,  1.26068067, ...,  0.29449206,
         0.44784556,  0.1196517 ],
       ...,
       [-2.06871213, -1.3202336 , -0.72327513, ...,  0.29449206,
         0.44784556, -0.61588221],
       [-2.06871213,  0.75744171, -0.72327513, ..., -1.50172657,
         0.44784556,  0.44804853],
       [ 0.48339253,  0.75744171, -0.72327513, ...,  0.29449206,
         0.44784556,  1.01094757]])

# Part 8: Building Models: using different models

## Model 1: Logistic Regression

In [1261]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Create a logistic regression model
lr_Model = LogisticRegression()

# Train the model using the training data
lr_Model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr_Model.predict(X_test)

# Evaluate the model
lr_accuracy = accuracy_score(y_test, y_pred)
lr_f1 = f1_score(y_test, y_pred)
lr_precision = precision_score(y_test, y_pred)
lr_recall = recall_score(y_test, y_pred)




results = pd.DataFrame([['Logistic Regression', lr_accuracy ,lr_f1, lr_precision, lr_recall]],
                       columns=['Model', 'Accuracy', 'F1', 'Precision', 'Recall'])


results

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,Logistic Regression,0.780488,0.852459,0.757282,0.975


## Model 2: Decision Trees

In [1262]:
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model using the training data
dt_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_f1 = f1_score(y_test, y_pred_dt)
dt_precision = precision_score(y_test, y_pred_dt)
dt_recall = recall_score(y_test, y_pred_dt)

# Add the decision tree model results to the DataFrame
results = results.append(pd.DataFrame([['Decision Tree', dt_accuracy, dt_f1, dt_precision, dt_recall]],
                                      columns=['Model', 'Accuracy', 'F1', 'Precision', 'Recall']),
                         ignore_index=True)

# Print the updated results DataFrame
results

  results = results.append(pd.DataFrame([['Decision Tree', dt_accuracy, dt_f1, dt_precision, dt_recall]],


Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,Logistic Regression,0.780488,0.852459,0.757282,0.975
1,Decision Tree,0.699187,0.764331,0.779221,0.75


## Model 3: Random Forests

In [1263]:
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Train the model using the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)

# Add the Random Forest model results to the DataFrame
results = results.append(pd.DataFrame([['Random Forest', rf_accuracy, rf_f1, rf_precision, rf_recall]],
                                      columns=['Model', 'Accuracy', 'F1', 'Precision', 'Recall']),
                         ignore_index=True)

# Print the updated results DataFrame
results

  results = results.append(pd.DataFrame([['Random Forest', rf_accuracy, rf_f1, rf_precision, rf_recall]],


Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,Logistic Regression,0.780488,0.852459,0.757282,0.975
1,Decision Tree,0.699187,0.764331,0.779221,0.75
2,Random Forest,0.739837,0.818182,0.75,0.9


## Model 4: Gradient Boosting Machines (GBMs)

In [1264]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model using the training data
gb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_gb = gb_model.predict(X_test)

# Evaluate the model
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_f1 = f1_score(y_test, y_pred_gb)
gb_precision = precision_score(y_test, y_pred_gb)
gb_recall = recall_score(y_test, y_pred_gb)

# Add the Gradient Boosting model results to the DataFrame
results = results.append(pd.DataFrame([['Gradient Boosting', gb_accuracy, gb_f1, gb_precision, gb_recall]],
                                      columns=['Model', 'Accuracy', 'F1', 'Precision', 'Recall']),
                         ignore_index=True)

# Print the updated results DataFrame
results

  results = results.append(pd.DataFrame([['Gradient Boosting', gb_accuracy, gb_f1, gb_precision, gb_recall]],


Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,Logistic Regression,0.780488,0.852459,0.757282,0.975
1,Decision Tree,0.699187,0.764331,0.779221,0.75
2,Random Forest,0.739837,0.818182,0.75,0.9
3,Gradient Boosting,0.731707,0.811429,0.747368,0.8875


## Model 5: Support Vector Machines (SVMs)

In [1265]:
from sklearn.svm import SVC
import pandas as pd

# Create an SVM model
svm_model = SVC()

# Train the model using the training data
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm)
svm_precision = precision_score(y_test, y_pred_svm)
svm_recall = recall_score(y_test, y_pred_svm)

# Add the SVM model results to the DataFrame
results = results.append(pd.DataFrame([['SVM', svm_accuracy, svm_f1, svm_precision, svm_recall]],
                                      columns=['Model', 'Accuracy', 'F1', 'Precision', 'Recall']),
                         ignore_index=True)

# Print the updated results DataFrame
results

  results = results.append(pd.DataFrame([['SVM', svm_accuracy, svm_f1, svm_precision, svm_recall]],


Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,Logistic Regression,0.780488,0.852459,0.757282,0.975
1,Decision Tree,0.699187,0.764331,0.779221,0.75
2,Random Forest,0.739837,0.818182,0.75,0.9
3,Gradient Boosting,0.731707,0.811429,0.747368,0.8875
4,SVM,0.780488,0.852459,0.757282,0.975


## Model 6: Naive Bayes

In [1266]:
from sklearn.naive_bayes import GaussianNB

# Create a Naive Bayes model
nb_model = GaussianNB()

# Train the model using the training data
nb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_nb = nb_model.predict(X_test)

# Evaluate the model
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_f1 = f1_score(y_test, y_pred_nb)
nb_precision = precision_score(y_test, y_pred_nb)
nb_recall = recall_score(y_test, y_pred_nb)

# Add the Naive Bayes model results to the DataFrame
results = results.append(pd.DataFrame([['Naive Bayes', nb_accuracy, nb_f1, nb_precision, nb_recall]],
                                      columns=['Model', 'Accuracy', 'F1', 'Precision', 'Recall']),
                         ignore_index=True)

# Print the updated results DataFrame
results

  results = results.append(pd.DataFrame([['Naive Bayes', nb_accuracy, nb_f1, nb_precision, nb_recall]],


Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,Logistic Regression,0.780488,0.852459,0.757282,0.975
1,Decision Tree,0.699187,0.764331,0.779221,0.75
2,Random Forest,0.739837,0.818182,0.75,0.9
3,Gradient Boosting,0.731707,0.811429,0.747368,0.8875
4,SVM,0.780488,0.852459,0.757282,0.975
5,Naive Bayes,0.780488,0.852459,0.757282,0.975


## Model 7: Neural Networks

In [1267]:
from sklearn.neural_network import MLPClassifier

# Create a Neural Network model
nn_model = MLPClassifier(random_state=42)

# Train the model using the training data
nn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_nn = nn_model.predict(X_test)

# Evaluate the model
nn_accuracy = accuracy_score(y_test, y_pred_nn)
nn_f1 = f1_score(y_test, y_pred_nn)
nn_precision = precision_score(y_test, y_pred_nn)
nn_recall = recall_score(y_test, y_pred_nn)

# Add the Neural Network model results to the DataFrame
results = results.append(pd.DataFrame([['Neural Network', nn_accuracy, nn_f1, nn_precision, nn_recall]],
                                      columns=['Model', 'Accuracy', 'F1', 'Precision', 'Recall']),
                         ignore_index=True)

# Print the updated results DataFrame
results

  results = results.append(pd.DataFrame([['Neural Network', nn_accuracy, nn_f1, nn_precision, nn_recall]],


Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,Logistic Regression,0.780488,0.852459,0.757282,0.975
1,Decision Tree,0.699187,0.764331,0.779221,0.75
2,Random Forest,0.739837,0.818182,0.75,0.9
3,Gradient Boosting,0.731707,0.811429,0.747368,0.8875
4,SVM,0.780488,0.852459,0.757282,0.975
5,Naive Bayes,0.780488,0.852459,0.757282,0.975
6,Neural Network,0.756098,0.833333,0.75,0.9375


## Model 8: k-Nearest Neighbors (k-NN)


In [1268]:
from sklearn.neighbors import KNeighborsClassifier

# Create a k-NN model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model using the training data
knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_knn = knn_model.predict(X_test)

# Evaluate the model
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn)
knn_precision = precision_score(y_test, y_pred_knn)
knn_recall = recall_score(y_test, y_pred_knn)

# Add the k-NN model results to the DataFrame
results = results.append(pd.DataFrame([['k-Nearest Neighbors', knn_accuracy, knn_f1, knn_precision, knn_recall]],
                                      columns=['Model', 'Accuracy', 'F1', 'Precision', 'Recall']),
                         ignore_index=True)

# Print the updated results DataFrame
results

  results = results.append(pd.DataFrame([['k-Nearest Neighbors', knn_accuracy, knn_f1, knn_precision, knn_recall]],


Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,Logistic Regression,0.780488,0.852459,0.757282,0.975
1,Decision Tree,0.699187,0.764331,0.779221,0.75
2,Random Forest,0.739837,0.818182,0.75,0.9
3,Gradient Boosting,0.731707,0.811429,0.747368,0.8875
4,SVM,0.780488,0.852459,0.757282,0.975
5,Naive Bayes,0.780488,0.852459,0.757282,0.975
6,Neural Network,0.756098,0.833333,0.75,0.9375
7,k-Nearest Neighbors,0.780488,0.852459,0.757282,0.975


# Part 9: choosing the best Model

In [1269]:
# the best model is SVMs
results

Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,Logistic Regression,0.780488,0.852459,0.757282,0.975
1,Decision Tree,0.699187,0.764331,0.779221,0.75
2,Random Forest,0.739837,0.818182,0.75,0.9
3,Gradient Boosting,0.731707,0.811429,0.747368,0.8875
4,SVM,0.780488,0.852459,0.757282,0.975
5,Naive Bayes,0.780488,0.852459,0.757282,0.975
6,Neural Network,0.756098,0.833333,0.75,0.9375
7,k-Nearest Neighbors,0.780488,0.852459,0.757282,0.975


# New Section

# Part 10: Hyper tunning: Appying the best model into our dataset

In [1270]:
# from sklearn.linear_model import LogisticRegression

# # Define the logistic regression model
# logreg_model = LogisticRegression()

# # Define the hyperparameter distribution to search over
# param_dist = {
#     'C': uniform(0.001, 1.0),
#     'penalty': ['l1', 'l2']
# }

# # Perform randomized search to find the best hyperparameters
# random_search = RandomizedSearchCV(logreg_model, param_dist, cv=5, n_iter=10)
# random_search.fit(X_train, y_train)

# # Get the best hyperparameters and the best model
# best_params = random_search.best_params_
# best_model = random_search.best_estimator_

# # Print the best hyperparameters
# print("Best Hyperparameters:")
# print(best_params)

# # Evaluate the best model using cross-validation
# cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
# print("Cross-Validation Scores:")
# print(cv_scores)

# # Train the best model on the entire training data
# best_model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = best_model.predict(X_test)

# # Evaluate the model on the test set
# accuracy = accuracy_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)

# # Print the evaluation metrics
# print("Test Set Metrics:")
# print("Accuracy:", accuracy)
# print("F1-score:", f1)
# print("Precision:", precision)
# print("Recall:", recall)

In [1271]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy.stats import uniform

# Define the SVM model
svm_model = SVC()

# Define the hyperparameter distribution to search over
param_dist = {
    'C': uniform(0.001, 1.0),
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

# Perform randomized search to find the best hyperparameters
random_search = RandomizedSearchCV(svm_model, param_dist, cv=5, n_iter=10)
random_search.fit(X_train, y_train)

# Get the best hyperparameters and the best model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

# Evaluate the best model using cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
print("Cross-Validation Scores:")
print(cv_scores)

# Train the best model on the entire training data
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model on the test set
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the evaluation metrics
print("Test Set Metrics:")
print("Accuracy:", accuracy)
print("F1-score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Best Hyperparameters:
{'C': 0.8948809174748832, 'gamma': 'scale', 'kernel': 'linear'}
Cross-Validation Scores:
[0.84848485 0.80612245 0.82653061 0.78571429 0.80612245]
Test Set Metrics:
Accuracy: 0.7886178861788617
F1-score: 0.8586956521739131
Precision: 0.7596153846153846
Recall: 0.9875


## Finalzing the best model: SVCs

In [1272]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Define the SVC model with the best hyperparameters
best_model = SVC(C=best_params['C'], kernel=best_params['kernel'], gamma=best_params['gamma'])

# Train the final model on the entire training data
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model on the test set
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the evaluation metrics
print("Test Set Metrics:")
print("Accuracy:", accuracy)
print("F1-score:", f1)
print("Precision:", precision)
print("Recall:", recall)

Test Set Metrics:
Accuracy: 0.7886178861788617
F1-score: 0.8586956521739131
Precision: 0.7596153846153846
Recall: 0.9875


# Part 11: Predicint unseen data

In [1273]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,TotalIncome
0,1,0,0,1,0,146.412162,360.0,1.0,1,5849.0
1,1,1,1,1,0,128.0,360.0,1.0,0,6091.0
2,1,1,0,1,1,66.0,360.0,1.0,1,3000.0
3,1,1,0,0,0,120.0,360.0,1.0,1,4941.0
4,1,0,0,1,0,141.0,360.0,1.0,1,6000.0


In [1274]:
# gender, married, dependents, education, seflt employd, loanAmount, loan_amunt_term, credit_history, toal income
single_obs = [[1,	1,	2,	1,	0,	350,	360,	1, 465]]

In [1275]:
best_model.predict(scaler.transform(single_obs))




array([1])