In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from tkinter import *
import joblib
import pandas as p

# Load the dataset
data = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')

# Display the first few rows
print(data.head())
data.shape
print("Number of Rows",data.shape[0])
print("Number of Columns",data.shape[1])
data.info()
# Check Null Values In The Dataset
data.isnull().sum()


    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [12]:
# Step 2: Data Preprocessing
# Define features and target variable
X = data.drop('Loan_Status', axis=1)
y = data['Loan_Status'].map({'Y': 1, 'N': 0})  # Convert target variable to binary


In [6]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [7]:
# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [8]:
# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Step 4: Model Selection and Training
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(probability=True)
}

results = {}

for name, model in models.items():
    # Create a pipeline that includes preprocessing and the model
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    
    # Train the model
    clf.fit(X_train, y_train)
    
    # Make predictions
    y_pred = clf.predict(X_test)
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    
    # Evaluate the model
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_pred_proba),
        'Classification Report': classification_report(y_test, y_pred)
    }


In [10]:
# Step 5: Display Results
for name, metrics in results.items():
    print(f"{name}:\n"
          f"Accuracy: {metrics['Accuracy']:.2f}\n"
          f"ROC AUC: {metrics['ROC AUC']:.2f}\n"
          f"{metrics['Classification Report']}\n")

Logistic Regression:
Accuracy: 0.79
ROC AUC: 0.75
              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123


Decision Tree:
Accuracy: 0.67
ROC AUC: 0.62
              precision    recall  f1-score   support

           0       0.55      0.42      0.47        43
           1       0.72      0.81      0.76        80

    accuracy                           0.67       123
   macro avg       0.63      0.62      0.62       123
weighted avg       0.66      0.67      0.66       123


Random Forest:
Accuracy: 0.79
ROC AUC: 0.72
              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.7

In [38]:
#Save the model
X = data.drop(['Loan_Status','Loan_ID'],axis=1)
y = data['Loan_Status']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object']).columns

# Impute numerical features
num_imputer = SimpleImputer(strategy='mean')
X[num_features] = num_imputer.fit_transform(X[num_features])

# Impute categorical features
cat_imputer = SimpleImputer(strategy='most_frequent')
X[cat_features] = cat_imputer.fit_transform(X[cat_features])

# Step 4: Handle categorical variables in X (if any)
X = pd.get_dummies(X, drop_first=True) 
print(X,y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=270,
 min_samples_split=5,
 min_samples_leaf=5,
 max_features='sqrt',
 max_depth=5)
rf.fit(X_train,y_train)
joblib.dump(rf,'loan_status_predict')
joblib.dump(preprocessor, 'preprocessor.pkl')


     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849.0                0.0  146.412162             360.0   
1             4583.0             1508.0  128.000000             360.0   
2             3000.0                0.0   66.000000             360.0   
3             2583.0             2358.0  120.000000             360.0   
4             6000.0                0.0  141.000000             360.0   
..               ...                ...         ...               ...   
609           2900.0                0.0   71.000000             360.0   
610           4106.0                0.0   40.000000             180.0   
611           8072.0              240.0  253.000000             360.0   
612           7583.0                0.0  187.000000             360.0   
613           4583.0                0.0  133.000000             360.0   

     Credit_History  Gender_Male  Married_Yes  Dependents_1  Dependents_2  \
0               1.0            1            0 

['preprocessor.pkl']

In [43]:
import pandas as pd
import joblib
from tkinter import *

def show_entry():
    try:
        # Get inputs
       # p0=float(e0.get())
        p1 = float(e1.get())  # Gender
        p2 = float(e2.get())  # Married
        p3 = float(e3.get())  # Dependents
        p4 = float(e4.get())  # Education
        p5 = float(e5.get())  # Self_Employed
        p6 = float(e6.get())  # ApplicantIncome
        p7 = float(e7.get())  # CoapplicantIncome
        p8 = float(e8.get())  # LoanAmount
        p9 = float(e9.get())  # Loan_Amount_Term
        p10 = float(e10.get())  # Credit_History
        p11 = e11.get()  # Property_Area (should be a category)

        # Load the model and the preprocessor
        model = joblib.load('loan_status_predict')

        # Create DataFrame
        input_data = pd.DataFrame({
            #'Loan_ID':[p0],
            'Gender': [p1],
            'Married': [p2],
            'Dependents': [p3],
            'Education': [p4],
            'Self_Employed': [p5],
            'ApplicantIncome': [p6],
            'CoapplicantIncome': [p7],
            'LoanAmount': [p8],
            'Loan_Amount_Term': [p9],
            'Credit_History': [p10],
            'Property_Area': [p11]
        })
        
        print("Input DataFrame columns:", input_data.columns)


        # Preprocess the input data using the same preprocessor
        preprocessor = joblib.load('preprocessor.pkl')  # Save the preprocessor during training
        processed_data = preprocessor.transform(input_data)

        # Make prediction
        result = model.predict(processed_data)

        # Display result
        result_text = "Loan Approved" if result[0] == 1 else "Loan Not Approved"
        result_label.config(text=result_text)  # Update existing label
    except ValueError as ve:
        result_label.config(text=f"Input error: {str(ve)}")  # Show input error
    except KeyError as ke:
        result_label.config(text=f"Key error:{str(ke)}")
    except Exception as e:
        result_label.config(text=f"Error: {str(e)}")  # Show general error

# Initialize the main window and GUI elements as before
master = Tk()
master.title("Loan Status Prediction Using Machine Learning")

# Labels
Label(master, text="Loan Status Prediction", bg="black", fg="white").grid(row=0, columnspan=2)
#Label(master,text="Loan_ID").grid(row=0)
Label(master, text="Gender [1:Male, 0:Female]").grid(row=1)
Label(master, text="Married [1:Yes, 0:No]").grid(row=2)
Label(master, text="Dependents [0, 1, 2, 3, 4]").grid(row=3)
Label(master, text="Education [0:Not Graduate, 1:Graduate]").grid(row=4)
Label(master, text="Self_Employed [1:Yes, 0:No]").grid(row=5)
Label(master, text="ApplicantIncome").grid(row=6)
Label(master, text="CoapplicantIncome").grid(row=7)
Label(master, text="LoanAmount").grid(row=8)
Label(master, text="Loan_Amount_Term").grid(row=9)
Label(master, text="Credit_History [1:Yes, 0:No]").grid(row=10)
Label(master, text="Property_Area [e.g., Urban, Rural, Semiurban]").grid(row=11)

# Entry fields
#e0=Entry(master)
e1 = Entry(master)
e2 = Entry(master)
e3 = Entry(master)
e4 = Entry(master)
e5 = Entry(master)
e6 = Entry(master)
e7 = Entry(master)
e8 = Entry(master)
e9 = Entry(master)
e10 = Entry(master)
e11 = Entry(master)

# Grid placement for entry fields
#e0.grid(row=0,column=0)
e1.grid(row=1, column=1)
e2.grid(row=2, column=1)
e3.grid(row=3, column=1)
e4.grid(row=4, column=1)
e5.grid(row=5, column=1)
e6.grid(row=6, column=1)
e7.grid(row=7, column=1)
e8.grid(row=8, column=1)
e9.grid(row=9, column=1)
e10.grid(row=10, column=1)
e11.grid(row=11, column=1)

# Button for prediction
Button(master, text="Predict", command=show_entry).grid(row=12, columnspan=2)

# Label to display result
result_label = Label(master, text="")
result_label.grid(row=13, columnspan=2)

# Start the main loop
master.mainloop()

