In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRFClassifier

In [2]:
# Locations of the raw CSV files, relative to the notebook's working directory.
train_path, test_path = './train.csv', './test.csv'

# Load both splits into DataFrames; later cells rebind these two names.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Show the first rows of the training split as the cell output.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Report which columns contain at least one missing value, for each split.
train_null_cols = train.columns[train.isna().any()].tolist()
print(f'features with null values in train {train_null_cols}')
test_null_cols = test.columns[test.isna().any()].tolist()
print(f'features with null values in test {test_null_cols}')

# Per-column summary of the training split: the Python type of the value in
# row 0 (so a column whose first entry is NaN shows up as float) and the
# number of distinct non-null values.
col_stats = pd.DataFrame(
    [
        {
            'feature': name,
            'dtype': type(train.loc[0, name]),
            'nunique': train[name].nunique(),
        }
        for name in train.columns
    ],
    columns=['feature', 'dtype', 'nunique'],
)
print(col_stats)

features with null values in train ['Age', 'Cabin', 'Embarked']
features with null values in test ['Age', 'Fare', 'Cabin']
        feature                    dtype  nunique
0   PassengerId    <class 'numpy.int64'>      891
1      Survived    <class 'numpy.int64'>        2
2        Pclass    <class 'numpy.int64'>        3
3          Name            <class 'str'>      891
4           Sex            <class 'str'>        2
5           Age  <class 'numpy.float64'>       88
6         SibSp    <class 'numpy.int64'>        7
7         Parch    <class 'numpy.int64'>        7
8        Ticket            <class 'str'>      681
9          Fare  <class 'numpy.float64'>      248
10        Cabin          <class 'float'>      147
11     Embarked            <class 'str'>        3


In [4]:
def _encode_categoricals(frame):
    """Return a copy of ``frame`` with text columns dropped or encoded.

    - ``Name``, ``Ticket`` and ``Cabin`` are removed.
    - ``Sex`` becomes 0 for ``'female'`` and 1 for every other value.
    - ``Embarked`` is replaced by ``Embarked_*`` indicator columns appended
      at the end of the frame.
    """
    trimmed = frame.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'])
    trimmed['Sex'] = frame['Sex'].ne('female').astype('int64')
    embarked_dummies = pd.get_dummies(frame['Embarked'], prefix='Embarked')
    return pd.concat([trimmed, embarked_dummies], axis=1)


# Apply the same encoding to both splits and rebind the notebook-level names.
train, test = _encode_categoricals(train), _encode_categoricals(test)

In [5]:
# Impute missing values in the non-object columns of each split, in place.
#
# The fill has to be done per column: calling DataFrame.fillna with a scalar
# fills every NaN in the whole frame with that one value, so filling with the
# median of the first column would overwrite the gaps in all other columns
# with an unrelated number.
for data in [train, test]:
    numeric_cols = data.select_dtypes(exclude=['object']).columns
    print(numeric_cols)
    for col in numeric_cols:
        # Skip complete columns so their dtype is left untouched.
        if data[col].isna().any():
            # Each column is filled with its own median, computed on the
            # split currently being processed.
            data[col] = data[col].fillna(data[col].median())

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [6]:
# Separate the target column from the feature matrix.
y_train = train['Survived']
X_train = train.drop(columns='Survived')

# Print the Python type of the row-0 value of every feature as a quick check
# that no text columns remain.
first_row_types = [(name, type(X_train.loc[0, name])) for name in X_train.columns]
for name, scalar_type in first_row_types:
    print(f'type of {name} is {scalar_type}')

type of PassengerId is <class 'numpy.int64'>
type of Pclass is <class 'numpy.int64'>
type of Sex is <class 'numpy.int64'>
type of Age is <class 'numpy.float64'>
type of SibSp is <class 'numpy.int64'>
type of Parch is <class 'numpy.int64'>
type of Fare is <class 'numpy.float64'>
type of Embarked_C is <class 'numpy.bool_'>
type of Embarked_Q is <class 'numpy.bool_'>
type of Embarked_S is <class 'numpy.bool_'>


In [9]:
# Hyper-parameter grid for the random forest.
# 'log_loss' is omitted from 'criterion': scikit-learn documents it as the
# same Shannon information gain as 'entropy', so including both only repeats
# identical fits.
grid = {
    'n_estimators': [50, 75, 100, 150, 200, 250, 300, 350, 400, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 8, 10, 15, 20, 25, 30, 35, 40],
}

# Exhaustive search using GridSearchCV's default cross-validation.
# - random_state pins the forest's bootstrap/feature sampling so that
#   best_params_ is reproducible across kernel restarts.
# - n_jobs=-1 runs the candidate fits in parallel on all available cores.
searcher = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=grid,
    n_jobs=-1,
)
searcher.fit(X_train, y_train)
print(searcher.best_params_)

{'criterion': 'gini', 'max_depth': 5, 'n_estimators': 300}


In [9]:
# Final model: an XGBoost random forest. The earlier candidate was
# RandomForestClassifier(n_estimators=300, max_depth=5, random_state=42).
xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=1,
    subsample=0.8,
    colsample_bynode=0.8,
    random_state=42,
)
model = XGBRFClassifier(**xgb_params)
model.fit(X_train, y_train)

# Predict on the test split and write the two-column output file
# (PassengerId, Survived) without the index.
preds = model.predict(test)
output_df = test[['PassengerId']].assign(Survived=preds)
output_df.to_csv('./out.csv', index=False)