In [37]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [38]:
# read the raw train / test CSVs from the current working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [39]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [40]:
train.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [41]:
# keep the raw frames intact: all cleaning below operates on these copies
train_df, test_df = train.copy(), test.copy()

In [42]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


## Data Cleaning

In [43]:
# pull the target out into its own Series (an independent copy of the column)
train_y = train_df.loc[:, 'Loan_Status'].copy()

In [44]:
# remove the target column so train_df holds features only
train_df.drop(columns=['Loan_Status'], inplace=True)

In [45]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [46]:
# drop unnecessary columns: Loan_ID is a row identifier
# (614 distinct values over 614 training rows)
train_df.drop(columns=['Loan_ID'], inplace=True)
test_df.drop(columns=['Loan_ID'], inplace=True)

In [47]:
# check duplicate values
train_df.duplicated().sum()

0

In [48]:
# there is a duplicate value in test set
test_df.duplicated().sum()

1

In [49]:
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
192,Male,No,0,Graduate,Yes,5833,0,116,360.0,1.0,Urban


In [50]:
# remove the repeated test row, keeping its first occurrence
# (the remaining rows keep their original index labels)
test_df.drop_duplicates(keep='first', inplace=True)

In [51]:
test_df.duplicated().sum()

0

### Missing value analysis

In [52]:
train_df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

In [53]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 52.9+ KB


In [54]:
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [55]:
# imputation plan: numeric columns -> mean, categorical columns -> mode
# (Credit_History is stored as float but has only 2 distinct values,
#  so it is handled together with the categorical columns)

num_cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]
cat_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
]

In [56]:
# categorical: fill gaps with the most frequent value, learned from the
# training frame only and then applied to both frames
cat_imputer = SimpleImputer(strategy='most_frequent').fit(train_df[cat_cols])

train_df[cat_cols] = cat_imputer.transform(train_df[cat_cols])
test_df[cat_cols] = cat_imputer.transform(test_df[cat_cols])

In [57]:
# numerical: fill gaps with the column mean, learned from the training
# frame only and then applied to both frames
num_imputer = SimpleImputer(strategy='mean').fit(train_df[num_cols])

train_df[num_cols] = num_imputer.transform(train_df[num_cols])
test_df[num_cols] = num_imputer.transform(test_df[num_cols])

In [58]:
train_df.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [59]:
# feature engineering
# fold the co-applicant income into ApplicantIncome, which from here on
# holds the combined income of both applicants
train_df['ApplicantIncome'] = train_df['ApplicantIncome'].add(train_df['CoapplicantIncome'])
test_df['ApplicantIncome'] = test_df['ApplicantIncome'].add(test_df['CoapplicantIncome'])

# the separate CoapplicantIncome column is no longer needed
train_df.drop(columns=['CoapplicantIncome'], inplace=True)
test_df.drop(columns=['CoapplicantIncome'], inplace=True)

In [60]:
# application of Label encoder
train_df.nunique()

Gender                2
Married               2
Dependents            4
Education             2
Self_Employed         2
ApplicantIncome     554
LoanAmount          204
Loan_Amount_Term     11
Credit_History        2
Property_Area         3
dtype: int64

In [61]:
# ordinal data --> we can apply label encoder
train_df.Dependents.unique()

array(['0', '1', '2', '3+'], dtype=object)

In [62]:
# we use label encoder for this also
train_df.Property_Area.unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [63]:
# LabelEncoder works on one column at a time, so a separate encoder is fitted
# per categorical column on the training frame and reused for the test frame.
# Every fitted encoder is kept in `label_encoders` (column name -> encoder) so
# the exact same mapping can be re-applied to new data or inverted later;
# without this each encoder is overwritten by the next loop iteration.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    # transform() raises ValueError if the test column holds a label that was
    # not present in the training column
    test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le
    

In [64]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849.0,146.412162,360.0,1,2
1,1,1,1,0,0,6091.0,128.0,360.0,1,0
2,1,1,0,0,1,3000.0,66.0,360.0,1,2
3,1,1,0,1,0,4941.0,120.0,360.0,1,2
4,1,0,0,0,0,6000.0,141.0,360.0,1,2


In [65]:
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
       'Property_Area'],
      dtype='object')

In [66]:
# CoapplicantIncome has been merged into ApplicantIncome and dropped from the
# frames, so take it out of the numeric column list as well. Filtering keeps
# this cell re-runnable: list.remove() raises ValueError once the name is
# already gone.
num_cols = [col for col in num_cols if col != 'CoapplicantIncome']

In [84]:
num_cols

['ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

In [85]:
cat_cols

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Credit_History',
 'Property_Area']

In [67]:
# natural-log transform of the numeric columns in both frames
# NOTE(review): np.log maps 0 to -inf; confirm no zero incomes / loan amounts
train_df[num_cols] = np.log(train_df[num_cols].to_numpy())
test_df[num_cols] = np.log(test_df[num_cols].to_numpy())

In [68]:
# scaling: learn the per-column min/max on the training frame, then apply the
# same scaling to both frames (train_df / test_df are rebound to the scaler output)
minmax = MinMaxScaler()
minmax.fit(train_df)
train_df = minmax.transform(train_df)
test_df = minmax.transform(test_df)

## Model building

In [69]:
# hold out 30% of the labelled rows for evaluation (fixed seed for repeatability)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    train_y,
    test_size=0.3,
    random_state=0,
)

In [70]:
from sklearn.linear_model import LogisticRegression

# logistic regression with scikit-learn's default settings, fitted on the training split
log = LogisticRegression()
log.fit(X=X_train, y=y_train)

In [71]:
y_pred_test = log.predict(X_test)

In [72]:
from sklearn.metrics import accuracy_score

# accuracy of the predictions on the held-out split
acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(acc)

0.827027027027027


## Serialization and deserialization

In [73]:
# serialization
import joblib
joblib.dump(log, 'model_v1.pkl')

['model_v1.pkl']

In [74]:
# deserialization
final_model = joblib.load('model_v1.pkl')

In [75]:
final_model.intercept_, final_model.coef_

(array([-2.05782461]),
 array([[ 0.02382467,  0.35479553,  0.39528663, -0.442547  , -0.04913972,
          0.04022649, -0.81819713,  0.23511056,  3.18026394,  0.22583306]]))

In [76]:
log.intercept_, log.coef_

(array([-2.05782461]),
 array([[ 0.02382467,  0.35479553,  0.39528663, -0.442547  , -0.04913972,
          0.04022649, -0.81819713,  0.23511056,  3.18026394,  0.22583306]]))

## modular programming

In [77]:
from packageA import f1

In [78]:
f1.print_fuction()

'This is function f1'

In [79]:
from packageA.sub_packageA import  f3

In [80]:
f3.print_fuction()

'This is function f3'

## Getting the parent directory

In [81]:
import packageA

In [82]:
packageA.__file__

'd:\\MLOps_udemy\\Packaging_ML_models\\Experiments\\packageA\\__init__.py'

In [83]:
# get the parent file path
import pathlib
pathlib.Path(packageA.__file__).resolve().parent

WindowsPath('D:/MLOps_udemy/Packaging_ML_models/Experiments/packageA')

## create custom data transformers

In [86]:
# key points --> inherit from BaseEstimator and TransformerMixin
# implement the fit and transform methods
# accept parameters through the __init__ method

from sklearn.base import BaseEstimator, TransformerMixin

class DemoTransformer(BaseEstimator, TransformerMixin):
    """Minimal pass-through transformer showing the fit/transform skeleton."""

    def __init__(self):
        """Take no parameters and store nothing."""

    def fit(self, X, y=None):
        """Learn nothing from ``X`` / ``y``; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X

In [87]:
# numerical imputation - mean

class MeanImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with the column mean seen in ``fit``.

    Parameters
    ----------
    variables : list of str, str or None, default=None
        Columns to impute. A single column name may be given as a plain
        string. ``None`` selects every numeric column of the frame passed
        to ``fit``.

    Attributes
    ----------
    mean_dict : dict
        Created by ``fit``; maps each imputed column name to the mean of
        that column in the frame given to ``fit``.
    """

    def __init__(self, variables=None):
        # stored untouched; it is interpreted in fit()
        self.variables = variables

    def _resolve_variables(self, X):
        """Return the list of column names to impute for frame ``X``."""
        if self.variables is None:
            return list(X.select_dtypes(include='number').columns)
        if isinstance(self.variables, str):
            # a bare string would otherwise be iterated character by character
            return [self.variables]
        return list(self.variables)

    def fit(self, X, y=None):
        """Record the mean of every selected column of ``X``; ``y`` is ignored.

        Returns
        -------
        self
        """
        self.mean_dict = {col: X[col].mean() for col in self._resolve_variables(X)}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with gaps in the fitted columns filled.

        The input frame is not modified. Raises ``AttributeError`` if called
        before ``fit`` (``mean_dict`` does not exist yet).
        """
        X = X.copy()
        for col, mean_value in self.mean_dict.items():
            # assign the result back instead of fillna(inplace=True) on X[col]:
            # the chained in-place form acts on a temporary Series and does not
            # update X when pandas Copy-on-Write is active
            X[col] = X[col].fillna(mean_value)
        return X

In [88]:
# trial: small random frame with a few gaps to try the imputer on.
# The generator is seeded so re-running the cell rebuilds the same frame, and
# the values are cast to float up front: writing NaN into an integer column
# relies on silent upcasting, which newer pandas versions deprecate.
df = pd.DataFrame(
    np.random.default_rng(0).integers(0, 100, (10, 2)).astype(float),
    columns=['A', 'B'],
)
df.iloc[[1, 4], 0] = np.nan  # gaps in column A
df.iloc[[2, 3], 1] = np.nan  # gaps in column B
df

Unnamed: 0,A,B
0,78.0,21.0
1,,82.0
2,1.0,
3,59.0,
4,,32.0
5,37.0,46.0
6,76.0,88.0
7,83.0,33.0
8,67.0,49.0
9,88.0,46.0


In [89]:
mean_imputer = MeanImputer(variables=['A','B'])

In [90]:
mean_imputer.fit(df)

In [91]:
mean_imputer.mean_dict

{'A': 61.125, 'B': 49.625}

In [92]:
mean_imputer.transform(df)

Unnamed: 0,A,B
0,78.0,21.0
1,61.125,82.0
2,1.0,49.625
3,59.0,49.625
4,61.125,32.0
5,37.0,46.0
6,76.0,88.0
7,83.0,33.0
8,67.0,49.0
9,88.0,46.0


In [97]:
a = enumerate(range(10,20),0)

In [99]:
# print each (position, value) pair as "position:value"
# (`a` is a one-shot iterator: a second pass over it yields nothing)
for i, j in a:
    print(i, j, sep=':')

0:10
1:11
2:12
3:13
4:14
5:15
6:16
7:17
8:18
9:19


## check versions of packages

In [1]:
import numpy as np
np.__version__

'1.24.3'

In [2]:
import pandas as pd
pd.__version__

'1.5.3'

In [3]:
import joblib
joblib.__version__

'1.2.0'

In [4]:
import sklearn
sklearn.__version__

'1.3.0'

In [5]:
import scipy
scipy.__version__

'1.10.1'

In [6]:
import setuptools
setuptools.__version__

'68.0.0'

In [7]:
import wheel
wheel.__version__

'0.38.4'