In [35]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer

In [36]:
# Load the raw loan-application table from the working directory and preview the first rows.
data = pd.read_csv("train1.csv")
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [38]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [39]:
data.isnull().sum()/data.shape[0]

Loan_ID              0.000000
Gender               0.021173
Married              0.004886
Dependents           0.024430
Education            0.000000
Self_Employed        0.052117
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.035831
Loan_Amount_Term     0.022801
Credit_History       0.081433
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [40]:
# Columns removed before modelling; the same list is handed to Prediction later
# so real-time records are stripped of them as well.
dropped_cols = ['Loan_ID']
# Rebind instead of mutating in place, and tolerate already-dropped columns,
# so re-running this cell does not raise KeyError.
data = data.drop(columns = dropped_cols, errors = 'ignore')

In [41]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [42]:
data.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [43]:
data.Dependents.value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [44]:
# Columns that contain missing values (see the null counts above); each gets a
# "<column>_na" indicator feature, and the list is later passed to Prediction.
nan_cols = ['Gender', 'Married','Dependents','Self_Employed','LoanAmount','Loan_Amount_Term','Credit_History']

In [45]:
# Add one 0/1 "<column>_na" indicator per column in nan_cols, recording where the
# original value was missing before any filling or imputation happens.
for name in nan_cols:
    data[name + '_na'] = data[name].isnull().astype(int)

In [46]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Gender_na,Married_na,Dependents_na,Self_Employed_na,LoanAmount_na,Loan_Amount_Term_na,Credit_History_na
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,0,0,0,0,1,0,0
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,0,0,0,0,0,0,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,0,0,0,0,0,0,0
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,0,0,0,0,0,0,0
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,0,0,0,0,0,0,0


In [47]:
# Maps a categorical column name to the value used to replace its missing entries;
# populated in the next cell and reused at prediction time.
missing_value_filler = {}

In [48]:
# Replace missing categorical values with each column's most frequent value and
# remember that value so the same fill can be applied to new records.
# Edge case: mode()[0] fails for a column that has no non-null values at all.
categorical_cols = ['Gender', 'Married','Dependents','Education','Self_Employed','Property_Area']
for name in categorical_cols:
    most_common = data[name].mode()[0]
    missing_value_filler[name] = most_common
    data[name] = data[name].fillna(value = most_common)

In [49]:
missing_value_filler

{'Gender': 'Male',
 'Married': 'Yes',
 'Dependents': '0',
 'Education': 'Graduate',
 'Self_Employed': 'No',
 'Property_Area': 'Semiurban'}

In [50]:
#data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].mean())
#data['Credit_History'] = data['Credit_History'].fillna(data['Credit_History'].median())
#categorical
#data['Gender'].fillna(data['Gender'].mode()[0], inplace=True)
#data['Married'].fillna(data['Married'].mode()[0], inplace=True)
#data['Dependents'].fillna(data['Dependents'].mode()[0], inplace=True)
#data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mode()[0], inplace=True)
#data['Credit_History'].fillna(data['Credit_History'].mode()[0], inplace=True)
#data['Self_Employed'].fillna(data['Self_Employed'].mode()[0], inplace=True)

In [51]:
#data.isnull().sum()

In [52]:

# Fit one LabelEncoder per text-valued feature column (the target is excluded),
# overwrite the column with its integer codes, and keep the fitted encoder so
# new records can be encoded the same way.
label_encoder_collection = {}
candidate_cols = data.columns.difference(['Loan_Status'])
for name in candidate_cols:
    column = data[name]
    if str(column.dtype) != 'object':
        continue  # numeric columns need no encoding
    encoder = LabelEncoder()
    data[name] = encoder.fit_transform(column)
    label_encoder_collection[name] = encoder

print(len(label_encoder_collection))

6


In [53]:
label_encoder_collection

{'Dependents': LabelEncoder(),
 'Education': LabelEncoder(),
 'Gender': LabelEncoder(),
 'Married': LabelEncoder(),
 'Property_Area': LabelEncoder(),
 'Self_Employed': LabelEncoder()}

In [54]:
#col = "Loan_Status"
#X = data.loc[:,data.columns != col].values
#Y = data.loc[:,"Loan_Status"].values

# Name the target explicitly rather than by column position, so adding,
# removing or reordering columns cannot silently select a different target.
target = 'Loan_Status'
# Every other column is a feature; Index.difference returns them sorted, and
# this order defines the layout of the feature vector used for fit and predict.
features = data.columns.difference([target])
#X = data.drop(columns = ['Loan_Status'], axis =1)
#Y = data['Loan_Status']
#X.shape,Y.shape, type(X)

In [55]:
type(features)

pandas.core.indexes.base.Index

In [105]:
#data.loc[:, X.columns]

In [56]:
# Fill the NaNs still left in the feature columns (categoricals were already
# mode-filled above) using KNNImputer with its default settings, and keep the
# fitted imputer for prediction time.
# NOTE(review): the imputer is fit on every row, before the train/test split in a
# later cell, so held-out rows influence the imputation — consider fitting on the
# training split only.
# NOTE(review): features are not scaled before the neighbour search — confirm
# that is intended.
imputer = KNNImputer()
data.loc[:, features] = imputer.fit_transform(data.loc[:, features].values)

In [57]:
data.isnull().sum()

Gender                 0
Married                0
Dependents             0
Education              0
Self_Employed          0
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount             0
Loan_Amount_Term       0
Credit_History         0
Property_Area          0
Loan_Status            0
Gender_na              0
Married_na             0
Dependents_na          0
Self_Employed_na       0
LoanAmount_na          0
Loan_Amount_Term_na    0
Credit_History_na      0
dtype: int64

In [58]:
# Feature matrix (columns in `features` order) and target vector as arrays.
X = data[features].values
Y = data[target].values

In [59]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size = 0.3, random_state = 0)

In [60]:
#from sklearn.model_selection import cross_val_score
#def classify(model, x, y):
    #Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=0)
    #model.fit(Xtrain, Ytrain)
    #print("Accuracy is", model.score(Xtest, Ytest)*100)
    #score = cross_val_score(model, X, Y, cv=5)
    #print("Cross validation is",np.mean(score)*100)

In [61]:
# Seed the forest (same seed as the train/test split) so a fresh
# Restart & Run All reproduces the same fitted model and predictions.
model = RandomForestClassifier(random_state = 0)
model.fit(Xtrain, Ytrain)
#classify(model_rf, X, Y)

RandomForestClassifier()

In [112]:
#Xtrain.shape, Ytrain.shape

In [113]:
#parameters = {'n_estimators':(100, 200, 300,400,500), 'min_samples_split':[2,3,4,5,7], 'max_features' : ['auto', 0.6]}
#model_rf = RandomForestClassifier()

#grid_search = GridSearchCV(model_rf, parameters)  


In [114]:
#grid_search.fit(Xtrain, Ytrain)

In [115]:
#print(" Testing Accuracy ", model.score(Xtest, Ytest))

In [116]:
#grid_search.best_score_

In [117]:
#grid_search.best_estimator_

In [118]:
#model_rf = RandomForestClassifier(min_samples_split=3, n_estimators=500)
#model_rf.fit(Xtest, Ytest)

In [119]:
#model_rf.score(Xtest, Ytest)

In [120]:
#from sklearn.svm import SVC

In [121]:
#model_svc = SVC()
#model_svc.fit(Xtest, Ytest)

In [122]:
#  model_svc.score(Xtest, Ytest)

## Real-time prediction

In [62]:
# Re-read the raw CSV to obtain an unprocessed example record.
# Note: this rebinds `data`, replacing the encoded/imputed training frame with the raw one.
data = pd.read_csv("train1.csv")
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [63]:
# Take the second raw row as a sample customer record (column name -> value).
customer_data = data.iloc[1].to_dict()
customer_data

{'Loan_ID': 'LP001003',
 'Gender': 'Male',
 'Married': 'Yes',
 'Dependents': '1',
 'Education': 'Graduate',
 'Self_Employed': 'No',
 'ApplicantIncome': 4583,
 'CoapplicantIncome': 1508.0,
 'LoanAmount': 128.0,
 'Loan_Amount_Term': 360.0,
 'Credit_History': 1.0,
 'Property_Area': 'Rural',
 'Loan_Status': 'N'}

In [64]:
# A real-time record carries no label; pop with a default so re-running this
# cell does not raise KeyError once the key is already gone.
customer_data.pop('Loan_Status', None)

In [65]:
customer_data

{'Loan_ID': 'LP001003',
 'Gender': 'Male',
 'Married': 'Yes',
 'Dependents': '1',
 'Education': 'Graduate',
 'Self_Employed': 'No',
 'ApplicantIncome': 4583,
 'CoapplicantIncome': 1508.0,
 'LoanAmount': 128.0,
 'Loan_Amount_Term': 360.0,
 'Credit_History': 1.0,
 'Property_Area': 'Rural'}

In [66]:
class Prediction:
    """Bundle the fitted preprocessing artifacts and model to score one raw customer record.

    DoPrediction replays the training-time steps in order: drop unused columns,
    add "<col>_na" missing indicators, fill categorical gaps, label-encode,
    impute, then predict.
    """

    def __init__(self, model, imputer, missing_value_filler, label_encoders, features, target, nan_cols, dropped_cols):
        """Store the fitted artifacts.

        Args:
            model: fitted estimator exposing ``predict``.
            imputer: fitted transformer exposing ``transform``.
            missing_value_filler: dict of column -> replacement for a missing value.
            label_encoders: dict of column -> fitted encoder exposing ``transform``.
            features: ordered column names; defines the layout of the feature vector.
            target: name of the target column (stored; not used when predicting).
            nan_cols: columns that get a "<col>_na" missing-indicator feature.
            dropped_cols: columns removed from the record before processing.
        """
        self.model = model
        self.imputer = imputer
        self.missing_value_filler = missing_value_filler
        self.label_encoders = label_encoders
        self.features = features
        self.target = target
        self.nan_cols = nan_cols
        self.dropped_cols = dropped_cols

    @staticmethod
    def _is_missing(value):
        """Return True when value is None or NaN; non-numeric values count as present."""
        if value is None:
            return True
        try:
            return bool(np.isnan(value))
        except (TypeError, ValueError):
            # np.isnan rejects strings and other non-numeric values: they are not missing.
            return False

    def DoPrediction(self, customer_data_safe):
        """Predict the target for a single customer record.

        Args:
            customer_data_safe: mapping of raw column name -> value. It is not
                modified. A value of None, NaN, or an absent key is treated as
                missing for the columns in ``nan_cols`` and ``missing_value_filler``.

        Returns:
            The first element of ``model.predict`` for the assembled feature vector.

        Raises:
            KeyError: if a column required by ``label_encoders`` or ``features``
                is absent and no fill value or indicator rule covers it.
        """
        # Work on a plain dict copy so the caller's record is left untouched.
        customer_data = dict(customer_data_safe)

        # 1. Drop columns (a real-time record may legitimately lack them already)
        for col in self.dropped_cols:
            customer_data.pop(col, None)

        # 2. Add Nan columns; normalise None/absent to NaN so later steps see one marker
        for col in self.nan_cols:
            missing = self._is_missing(customer_data.get(col))
            customer_data[col + "_na"] = int(missing)
            if missing:
                customer_data[col] = np.nan

        # 3. Handle Missing Values
        for col, value in self.missing_value_filler.items():
            if self._is_missing(customer_data.get(col)):
                customer_data[col] = value

        # 4. Label Encode
        for col, encoder in self.label_encoders.items():
            customer_data[col] = encoder.transform([customer_data[col]])[0]

        # 5. Imputer + Sequencing: the order of self.features must match training
        vector = np.array([customer_data[col] for col in self.features], dtype = float)
        vector = np.expand_dims(vector, axis = 0)  # single row -> shape (1, n_features)
        vector = self.imputer.transform(vector)

        # 6. Prediction and Return
        prediction = self.model.predict(vector)[0]
        return prediction

In [67]:
# Wire every artifact fitted above into one object that can score raw records.
prediction = Prediction(
    model = model,
    imputer = imputer,
    missing_value_filler = missing_value_filler,
    label_encoders = label_encoder_collection,
    features = features,
    target = target,
    nan_cols = nan_cols,
    dropped_cols = dropped_cols,
)

In [68]:
prediction.DoPrediction(customer_data)

'Y'

In [69]:
import pickle

# Persist the whole prediction pipeline; the context manager closes the file
# even if pickling raises part-way through.
with open("prediction.pck", "wb") as file:
    pickle.dump(prediction, file)

In [70]:
?pickle.dump
