In [73]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import sys
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from dataclasses import dataclass
import os 
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,roc_auc_score,confusion_matrix
from sklearn.model_selection import train_test_split


In [41]:
# Load the loan-approval CSV and keep only its first 1000 rows for this exploration.
df = pd.read_csv('../datasets/loan_approval_dataset.csv').head(1000)

In [42]:
# Show the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Id', 'Income', 'Age', 'Experience', 'Married/Single',
       'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE',
       'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Risk_Flag'],
      dtype='object')

In [43]:
# Display only: the result is not assigned, so df itself still contains 'Unnamed: 0'.
df.drop(columns=['Unnamed: 0'])

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,6907195,55,5,single,owned,no,Police_officer,Dharmavaram,Andhra_Pradesh,3,10,0
996,997,9253093,54,2,single,rented,yes,Secretary,Danapur,Bihar,2,12,1
997,998,9464966,67,14,single,norent_noown,yes,Statistician,Tiruppur,Tamil_Nadu,14,10,0
998,999,3346084,23,18,single,rented,no,Web_designer,Sambalpur,Odisha,6,13,0


In [44]:
# Target is the Risk_Flag column; the features are every other column
# except the leftover 'Unnamed: 0' column.
y = df['Risk_Flag']
X = df.drop(columns=['Unnamed: 0', 'Risk_Flag'])

In [45]:
# Display the feature frame and the target series together as a tuple.
X,y

(       Id   Income  Age  Experience Married/Single House_Ownership  \
 0       1  1303834   23           3         single          rented   
 1       2  7574516   40          10         single          rented   
 2       3  3991815   66           4        married          rented   
 3       4  6256451   41           2         single          rented   
 4       5  5768871   47          11         single          rented   
 ..    ...      ...  ...         ...            ...             ...   
 995   996  6907195   55           5         single           owned   
 996   997  9253093   54           2         single          rented   
 997   998  9464966   67          14         single    norent_noown   
 998   999  3346084   23          18         single          rented   
 999  1000  5472585   63          17        married          rented   
 
     Car_Ownership           Profession                 CITY           STATE  \
 0              no  Mechanical_engineer                 Rewa  Madh

In [59]:
@dataclass
class DataTransformationConfig:
    """Settings for the data-transformation step.

    Attributes:
        preprocessor_obj_path: File path associated with the preprocessor
            object; defaults to ``artifacts/preprocessor.pkl``.
    """

    # The type annotation is required: without it @dataclass treats the name
    # as a plain class attribute, not a field, so it would be missing from
    # __init__/__repr__ and could not be overridden per instance.
    preprocessor_obj_path: str = os.path.join("artifacts", "preprocessor.pkl")


class DataTransformation:
    """Builds the feature preprocessor and applies it to train/test CSV files."""

    def __init__(self):
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformer_object(self):
        """Build the unfitted preprocessor used to transform the feature columns.

        Returns:
            ColumnTransformer: numeric columns go through median imputation and
            standard scaling; categorical columns go through most-frequent
            imputation, one-hot encoding (unknown categories ignored) and
            scaling without centering. Columns not listed in either group are
            not passed to any pipeline.
        """
        numeric_features = ['Income', 'Age', 'Experience', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']
        categorical_features = ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession',
                                'CITY', 'STATE']

        num_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]
        )

        cat_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                # with_mean=False: centering is not supported on sparse one-hot output.
                ('scaler', StandardScaler(with_mean=False))
            ]
        )

        preprocessor = ColumnTransformer(
            [
                ('num_pipeline', num_pipeline, numeric_features),
                ('cat_pipeline', cat_pipeline, categorical_features)
            ]
        )
        return preprocessor

    @staticmethod
    def _to_dense(features):
        """Return ``features`` as a dense ndarray, densifying sparse matrices."""
        if hasattr(features, "toarray"):
            return features.toarray()
        return np.asarray(features)

    def initiate_data_transformation(self, train_path, test_path):
        """Fit the preprocessor on the training CSV and transform both CSVs.

        Args:
            train_path: Path of the training CSV; must contain a ``Risk_Flag`` column.
            test_path: Path of the test CSV; must contain a ``Risk_Flag`` column.

        Returns:
            tuple: ``(train_arr, test_arr, preprocessor_obj_path)``. Each array
            holds the transformed features with the target appended as the last
            column. The path is taken from the config; this method does not
            write the fitted preprocessor to it.
        """
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        preprocessor_obj = self.get_data_transformer_object()
        target_column_name = "Risk_Flag"

        input_feature_train_df = train_df.drop(columns=[target_column_name])
        target_feature_train_df = train_df[target_column_name]

        input_feature_test_df = test_df.drop(columns=[target_column_name])
        target_feature_test_df = test_df[target_column_name]

        # The ColumnTransformer may return a scipy sparse matrix (the one-hot
        # block is mostly zeros). np.c_ cannot concatenate a sparse matrix with
        # a dense column, so densify before stacking the target on.
        input_feature_train_arr = self._to_dense(
            preprocessor_obj.fit_transform(input_feature_train_df)
        )
        # Test data is only transformed, never fitted on.
        input_feature_test_arr = self._to_dense(
            preprocessor_obj.transform(input_feature_test_df)
        )

        print("Train transformed features shape:", input_feature_train_arr.shape)
        print("Train target shape:", np.array(target_feature_train_df).shape)
        print("Test transformed features shape:", input_feature_test_arr.shape)
        print("Test target shape:", np.array(target_feature_test_df).shape)

        train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df).reshape(-1, 1)]
        test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df).reshape(-1, 1)]

        return (train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_path)


In [60]:
# Column groups routed to the two sub-pipelines below.
numeric_features = [
    'Income',
    'Age',
    'Experience',
    'CURRENT_JOB_YRS',
    'CURRENT_HOUSE_YRS',
]
categorical_features = [
    'Married/Single',
    'House_Ownership',
    'Car_Ownership',
    'Profession',
    'CITY',
    'STATE',
]

# Numeric columns: fill gaps with the median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the mode, one-hot encode (unknown
# categories are ignored), then scale without centering.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('scaler', StandardScaler(with_mean=False)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numeric_features),
        ('cat_pipeline', cat_pipeline, categorical_features),
    ]
)

In [71]:
import pickle

# Fit before serialising: in top-to-bottom execution this cell runs before the
# fit_transform cell, and a pickled unfitted ColumnTransformer would make the
# reloaded copy raise NotFittedError on transform().
preprocessor.fit(X)

with open('preprocessor.pkl','wb') as file:
    pickle.dump(preprocessor,file)

# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook, never files from an untrusted source.
with open('preprocessor.pkl','rb') as file:
    loaded_preprocessor = pickle.load(file)

In [61]:
# Fit the preprocessor on all of X and keep the transformed feature matrix.
X_transform = preprocessor.fit_transform(X)

In [72]:
# Transform X with the preprocessor that was reloaded from preprocessor.pkl.
loaded_preprocessor.transform(X)

<1000x399 sparse matrix of type '<class 'numpy.float64'>'
	with 11000 stored elements in Compressed Sparse Row format>

In [49]:
# Show the repr of the transformed feature matrix.
X_transform

<1000x399 sparse matrix of type '<class 'numpy.float64'>'
	with 11000 stored elements in Compressed Sparse Row format>

In [50]:
# Shape of the untransformed feature frame.
X.shape

(1000, 12)

In [75]:
# Hold out 20% of the rows of df (target column still included) with a fixed seed,
# then show the shapes of the two resulting frames.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.shape,test_df.shape

((800, 14), (200, 14))

In [76]:
target_column_name = "Risk_Flag"

# Pull out the target series for each split first ...
target_feature_train_df = train_df[target_column_name]
target_feature_test_df = test_df[target_column_name]

# ... then build the feature frames by removing the target column.
input_feature_train_df = train_df.drop(columns=target_column_name)
input_feature_test_df = test_df.drop(columns=target_column_name)

In [80]:
# Shapes of the test-split feature frame and target series.
input_feature_test_df.shape,target_feature_test_df.shape

((200, 13), (200,))

In [82]:
# Shapes of the train-split feature frame and target series.
input_feature_train_df.shape,target_feature_train_df.shape

((800, 13), (800,))

In [87]:
# Display the train-split feature frame (target column removed).
input_feature_train_df

Unnamed: 0.1,Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS
29,29,30,4386333,31,16,single,rented,no,Physician,Shimoga,Karnataka,3,12
535,535,536,1171901,57,10,single,norent_noown,yes,Financial_Analyst,Buxar[37],Bihar,9,12
695,695,696,1512050,70,7,single,rented,no,Librarian,Sasaram[30],Bihar,7,14
557,557,558,9148621,43,9,single,rented,no,Police_officer,Bilaspur,Chhattisgarh,3,12
836,836,837,5272729,28,17,single,rented,no,Computer_operator,Guwahati,Assam,12,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,106,107,5165629,61,0,single,rented,no,Aviator,North_Dumdum,West_Bengal,0,12
270,270,271,2452563,75,3,single,rented,no,Hotel_Manager,Kulti,West_Bengal,3,12
860,860,861,4505937,52,6,single,rented,no,Petroleum_Engineer,Mangalore,Karnataka,6,14
435,435,436,8491491,61,20,single,rented,no,Comedian,Haldia,West_Bengal,8,11


In [92]:
# Separate the transformed features and the target into train and test sets.
# train_test_split is already imported in the notebook's import cell, so it is
# not re-imported here.
# NOTE(review): X_transform was produced by fitting the preprocessor on all
# rows, so the held-out rows influenced the scaling/encoding statistics.
X_train, X_test, y_train, y_test = train_test_split(
    X_transform, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 399), (200, 399), (800,), (200,))

In [86]:
# Show the repr of the training feature matrix.
X_train

<800x399 sparse matrix of type '<class 'numpy.float64'>'
	with 8800 stored elements in Compressed Sparse Row format>

In [55]:
from sklearn.tree import DecisionTreeClassifier

In [93]:
# Fix the seed: DecisionTreeClassifier permutes features at each split, so
# without random_state the fitted tree and the metrics below can change from
# run to run.
ds = DecisionTreeClassifier(random_state=42)
ds.fit(X_train, y_train)
predictions = ds.predict(X_test)

In [57]:
# Show the predicted labels for the test split.
predictions

array([0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0], dtype=int64)

In [63]:
# Print each evaluation metric for the test-split predictions, one per line,
# finishing with the confusion matrix.
for metric_fn in (accuracy_score, f1_score, recall_score, precision_score, confusion_matrix):
    print(metric_fn(y_test, predictions))

0.74
0.21212121212121213
0.22580645161290322
0.2
[[141  28]
 [ 24   7]]


In [64]:
import pickle

# Persist the fitted tree, then read it straight back.
model_file = 'model.pkl'
with open(model_file, 'wb') as sink:
    pickle.dump(ds, sink)

with open(model_file, 'rb') as source:
    loaded_model = pickle.load(source)

# Score the test split again, this time with the reloaded model.
predictions = loaded_model.predict(X_test)



In [65]:
# Show the labels predicted by the reloaded model.
predictions

array([0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0], dtype=int64)

In [67]:
# Take the first row of df as a sample input (a one-row DataFrame, despite the name).
lst = df.head(1)

In [70]:
# Preprocess the sample row with the in-memory preprocessor, then score it with the reloaded model.
loaded_model.predict(preprocessor.transform(lst))

array([0], dtype=int64)

In [96]:
# Scratch example: a column-oriented dict rendered as a one-row DataFrame.
dct = dict(name=['ehtesham'], rollno=[31])
pd.DataFrame(data=dct)

Unnamed: 0,name,rollno
0,ehtesham,31


In [101]:
# Look up the list stored under the 'name' key.
dct['name']

['ehtesham']