In [1]:
import pandas as pd

#Config for Displaying Pipeline
from sklearn import set_config
set_config(display='diagram')

# Make and Compose Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Preprocessing 
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer

# Train / validation split
from sklearn.model_selection import train_test_split

#Model
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Load the labelled training data and the unlabelled test data from the
# working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

### Divide Categorical and Numerical Features

In [4]:
# Target vector: the column the model is asked to predict.
y = train['Survived']

# Feature matrix: everything except the target and the columns that are not
# used as predictors in this notebook.
X = train.drop(columns=['Survived', 'Name', 'PassengerId', 'Ticket', 'Cabin'])

In [5]:
# Split the feature columns by dtype kind: 'i' (signed integer) and 'f' (float)
# count as numerical, every other kind is treated as categorical.
numerical_features = [
    name for name, col_dtype in X.dtypes.items()
    if name != 'PassengerId' and col_dtype.kind in {'i', 'f'}
]
categorical_features = [
    name for name, col_dtype in X.dtypes.items()
    if col_dtype.kind not in {'i', 'f'}
]

In [6]:
# Show which columns ended up in each group.
print(f'Numerical : {numerical_features}')
print(f'Categorical : {categorical_features}')

Numerical : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical : ['Sex', 'Embarked']


### Divide Data into Train and Test

In [7]:
# Hold out 30% of the labelled rows for validation. random_state pins the
# shuffle so the same rows land in each split on every run.
# train_test_split is imported in the first cell together with the other
# dependencies, so a fresh kernel only needs that one cell for all imports.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

### Make Pipeline with Column Transformer

In [8]:
# Numerical columns: fill missing values with the column median, then
# discretize each column into (at most) 3 bins.
# NOTE(review): the fit output in this notebook shows "decreasing the number
# of bins" warnings, presumably raised by KBinsDiscretizer on columns with few
# distinct values -- confirm and consider excluding those columns from binning.
numeric_steps = make_pipeline(
    SimpleImputer(strategy='median'),
    KBinsDiscretizer(n_bins=3),
)

# Categorical columns: replace missing values with the literal 'missing', then
# one-hot encode; handle_unknown='ignore' is set so categories not seen during
# fit do not raise at transform time.
categorical_steps = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(categories='auto', handle_unknown='ignore'),
)

# Route each group of columns through its own sub-pipeline.
preprocessor = make_column_transformer(
    (numeric_steps, numerical_features),
    (categorical_steps, categorical_features),
)

### Make Pipeline for Building Random Forest Model

In [12]:
# Full model: preprocessing followed by a 100-tree random forest.
# random_state is fixed (same seed as the train/validation split) so that the
# scores below and the submission file are reproducible across runs; without
# it every fit builds a different forest. Previously recorded scores may shift
# slightly after re-running with the seed.
RF_Model = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, random_state=0),
)

In [13]:
# Fit the whole pipeline (preprocessing + forest) on the training split only;
# the held-out split is kept for scoring.
RF_Model.fit(X_train, y_train)

  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


In [11]:
# Report the pipeline's score on the held-out split and on the training split
# (held-out first, matching the recorded output).
print(
    f'Test : {RF_Model.score(X_test, y_test):.3f}',
    f'Train : {RF_Model.score(X_train, y_train):.3f}',
    sep='\n',
)

Test : 0.761
Train : 0.803


### Predict on the Test Set

In [45]:
# Feed the model exactly the columns it was trained on, in the same order.
# Dropping only 'PassengerId' would leave 'Name', 'Ticket' and 'Cabin' in the
# frame, which the training features (X) never contained; selecting X.columns
# keeps the two schemas in sync and fails loudly if a column is missing.
sub_test = test[X.columns]

In [46]:
# Predict a label for every row of the test features and cast the result to
# plain integers for the submission file.
sub_test_pred = RF_Model.predict(sub_test).astype(int)

In [47]:
# Build the submission table: one row per test passenger with its id and the
# predicted label, then write it out without the index column.
AllSub = test[['PassengerId']].assign(Survived=sub_test_pred)

AllSub.to_csv("Solution_Pipeline_RF.csv", index=False)

In [48]:
#Kaggle LB Score - 0.77033