Load data

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the loan training set from the CSV file into a DataFrame.
train = pd.read_csv(filepath_or_buffer='data/loan_train.csv')
# Display the dtype pandas inferred for each column; the later cells split the
# columns into numeric and categorical groups based on these dtypes.
train.dtypes

Create training and test data

In [None]:
from sklearn.model_selection import train_test_split
# Separate the feature columns from the 'Loan_Status' target column.
X = train.drop(columns='Loan_Status')
y = train['Loan_Status']
# Hold out 20% of the rows for evaluation. The fixed random_state makes the split,
# and therefore the accuracy reported at the end of the notebook, reproducible
# across kernel restarts (the classifier is already seeded the same way).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

Define a pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# Numeric columns: fill missing values with the column median, then standardize.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)
# Categorical columns: fill missing values with the literal string 'missing',
# then one-hot encode. handle_unknown='ignore' keeps transform from raising on
# categories that were not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

Divide the columns into numeric and categorical features, then combine the two preprocessors

In [None]:
from sklearn.compose import ColumnTransformer

# Derive the column groups from X (the features only) rather than from `train`.
# X no longer contains the 'Loan_Status' target, so the target can never end up in
# a feature list and no manual drop (which fails if the target is not object-dtype)
# is needed.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
# Route each column group through its matching preprocessing pipeline. Columns whose
# dtype is in neither group are not listed here, so they are handled by
# ColumnTransformer's `remainder` setting (left at its default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

Create a pipeline with the preprocessor and a random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Chain the preprocessing step and the estimator into a single pipeline object.
# The forest is seeded with random_state=0.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=0)),
    ]
)

Train the classifier and print the accuracy score

In [None]:
from sklearn.metrics import accuracy_score
# Fit the whole pipeline on the training split, then predict the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)
# Accuracy of the predictions against the true held-out labels; as the last
# expression in the cell it is displayed as the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred)