In [9]:
# Basic libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from google.colab import files
uploaded = files.upload()

# Load the CSV into a DataFrame
df = pd.read_csv('loan.csv')


#Exploring the Dataset
df.head()                         # Check the first few rows
df.info()                         # Dataset information
df.describe()                     # Statistical summary
df.isnull().sum()                 # Check for missing values


#Data Cleaning and Preprocessing
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)        # Fill missing values for categorical columns with mode
df['Married'].fillna(df['Married'].mode()[0], inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True) # Fill missing values for numerical columns with median
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median(), inplace=True)
df['Credit_History'].fillna(df['Credit_History'].median(), inplace=True)
label_enc = LabelEncoder()                                       # Encode categorical variables using LabelEncoder
df['Gender'] = label_enc.fit_transform(df['Gender'])
df['Married'] = label_enc.fit_transform(df['Married'])
df['Education'] = label_enc.fit_transform(df['Education'])
df['Self_Employed'] = label_enc.fit_transform(df['Self_Employed'])
df['Property_Area'] = label_enc.fit_transform(df['Property_Area'])
df['Loan_Status'] = label_enc.fit_transform(df['Loan_Status'])
df['Dependents'] = df['Dependents'].replace('3+', 3).astype(int)  # Encode 'Dependents' (handle the '3+' case)
df.drop('Loan_ID', axis=1, inplace=True)                          # Drop the Loan_ID column(Dropping Irrelevant column)
print(df.isnull().sum())                                          # Check for any remaining missing values
df.head()                                                         # View the first few rows of the cleaned dataset

# Descriptive Statistics
print("\nDescriptive statistics of the cleaned dataset:")
print(df.describe())

# Correlation Matrix
print("\nCorrelation matrix:")
corr_matrix = df.corr()
print(corr_matrix)



# Features and target variable
X = df.drop('Loan_Status', axis=1)                                # Drop the target column to get features
y = df['Loan_Status']                                             # Target column


# Split the dataset(80-20 split for training and testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#Feature Scaling
scaler = StandardScaler()                                          # Initialize the scaler
X_train = scaler.fit_transform(X_train)                            # Scale the training and testing feature sets
X_test = scaler.transform(X_test)


#Random Forest Classifier
model = RandomForestClassifier(random_state=42)                    # Initialize the Random Forest Classifier
model.fit(X_train, y_train)                                        # Train the model on the training data
y_pred = model.predict(X_test)                                     # Make predictions on the test set
accuracy = accuracy_score(y_test, y_pred)                          # Evaluate the model
print(f"Accuracy: {accuracy:.2f}")
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(y_test, y_pred))


#Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=1000, random_state=42)      # Initialize and train the model
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)                          # Make predictions
print("Logistic Regression Results:")                             # Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))
print("Classification Report:")
print(classification_report(y_test, y_pred_log_reg))


#Support vector machine (SVM)
from sklearn.svm import SVC
svm_model = SVC(kernel='linear', random_state=42)                # Initialize and train the model
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)                           # Make predictions
print("\nSupport Vector Machine Results:")                       # Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))


#Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=42)              # Initialize and train the model
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)                            # Make predictions
print("\nDecision Tree Classifier Results:")                    # Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))


#K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)                 # Initialize and train the model
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)                          # Make predictions
print("\nK-Nearest Neighbors Results:")                         # Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn):.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_knn))
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))


#Cross validation
# List of models to evaluate
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine (SVM)': SVC(),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors (KNN)': KNeighborsClassifier(),
    'Random Forest Classifier': RandomForestClassifier(random_state=42)
}

# Perform k-fold cross-validation for each model
k = 5
for model_name, model in models.items():
    # Perform cross-validation
    cv_scores = cross_val_score(model, X, y, cv=k, scoring='accuracy')

    # Print the cross-validation results for the model
    print(f"\n{model_name}:")
    print(f"Cross-Validation Scores (k={k}): {cv_scores}")
    print(f"Mean Accuracy: {np.mean(cv_scores):.2f}")
    print(f"Standard Deviation: {np.std(cv_scores):.2f}")




Saving loan.csv to loan (8).csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
Gender               0
Married              0
Dependents           0
Education  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)        # Fill missing values for categorical columns with mode
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Married'].fillna(df['Married'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplac

Accuracy: 0.75
Confusion Matrix:
[[18 25]
 [ 6 74]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.42      0.54        43
           1       0.75      0.93      0.83        80

    accuracy                           0.75       123
   macro avg       0.75      0.67      0.68       123
weighted avg       0.75      0.75      0.73       123

Logistic Regression Results:
Accuracy: 0.79
Confusion Matrix:
[[18 25]
 [ 1 79]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123


Support Vector Machine Results:
Accuracy: 0.79
Confusion Matrix:
[[18 25]
 [ 1 79]]
Classification Report:
              precision    recall  f1-score   support

    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Logistic Regression:
Cross-Validation Scores (k=5): [0.81300813 0.77235772 0.7804878  0.85365854 0.81147541]
Mean Accuracy: 0.81
Standard Deviation: 0.03

Support Vector Machine (SVM):
Cross-Validation Scores (k=5): [0.69105691 0.69105691 0.68292683 0.68292683 0.68852459]
Mean Accuracy: 0.69
Standard Deviation: 0.00

Decision Tree Classifier:
Cross-Validation Scores (k=5): [0.75609756 0.63414634 0.70731707 0.73170732 0.72131148]
Mean Accuracy: 0.71
Standard Deviation: 0.04

K-Nearest Neighbors (KNN):
Cross-Validation Scores (k=5): [0.63414634 0.6097561  0.62601626 0.58536585 0.6147541 ]
Mean Accuracy: 0.61
Standard Deviation: 0.02

Random Forest Classifier:
Cross-Validation Scores (k=5): [0.78861789 0.73170732 0.7804878  0.82113821 0.80327869]
Mean Accuracy: 0.79
Standard Deviation: 0.03
