In [1]:
import pandas as pd

# Load the training split; the path is relative to the notebook's working directory.
passengers = pd.read_csv(filepath_or_buffer="Data/train.csv")
# Show the first five rows as the cell's output.
passengers.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Separate the label column from the remaining feature columns.
y = passengers["Survived"]
x = passengers.drop(labels=["Survived"], axis="columns")

In [3]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer


class SumTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that emits ``column1 + column2 + 1`` as one feature.

    Parameters
    ----------
    column1, column2 : str
        Names of the two columns to add together.

    Notes
    -----
    The output feature is always reported as ``"FamilySize"``. The ``+ 1``
    presumably counts the passenger themselves -- confirm against the
    intended definition of the feature.
    """

    def __init__(self, column1, column2):
        self.column1 = column1
        self.column2 = column2

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a one-column DataFrame named ``"FamilySize"``.

        ``X`` may be a DataFrame containing ``column1`` and ``column2``, or a
        two-column ndarray whose columns are taken to be ``column1`` and
        ``column2`` in that order.
        """
        # Accept plain arrays the same way CutTransformer does, so this
        # transformer also works behind steps that strip column labels.
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=[self.column1, self.column2])

        total = X[self.column1] + X[self.column2] + 1
        # Label the column so the frame agrees with get_feature_names_out();
        # a bare to_frame() would leave it labelled 0.
        return total.to_frame(name="FamilySize")

    def get_feature_names_out(self, input_features=None):
        """Return the single output feature name as an ndarray of str objects."""
        return np.array(["FamilySize"], dtype=object)
    

class CutTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that bins one column into labelled intervals.

    Parameters
    ----------
    column : str
        Name of the column to bin. When ``transform`` receives an ndarray it
        is used as the label for that array's single column.
    bins, labels
        Forwarded unchanged to ``pd.cut``.
    name : str
        Name of the produced feature, as reported by
        ``get_feature_names_out``.
    """

    def __init__(self, column, bins, labels, name):
        self.bins = bins
        self.labels = labels
        self.column = column
        self.name = name

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an ``(n_samples, 1)`` array of bin labels as strings.

        ``X`` may be a DataFrame containing ``column`` or a single-column
        ndarray. The input is never modified.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=[self.column])

        # Build the binned column as a standalone Series rather than assigning
        # it onto X: writing X[self.name] would add a column to the caller's
        # DataFrame as a side effect.
        binned = pd.cut(X[self.column], bins=self.bins, labels=self.labels)
        # astype(str) also stringifies values that fell outside every bin
        # (pd.cut leaves those missing), so they surface as their own text
        # value (typically "nan") instead of a float NaN.
        return binned.to_frame(name=self.name).astype(str).to_numpy()

    def get_feature_names_out(self, input_features=None):
        """Return the single output feature name as an ndarray."""
        return np.array([self.name])

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" makes a category that was absent at fit time (for
# example in a cross-validation fold or the test file) encode as all zeros
# instead of raising during transform/predict.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

# Bin edges and the label for each interval between consecutive edges.
age_bins = [0, 3, 12, 20, 60, 200]
age_labels = ["toddler", "child", "teen", "adult", "senior"]

# Age branch: fill missing ages with the mean, bucket them into the labelled
# groups above, then one-hot encode the group label.
age_group_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    CutTransformer(column="Age", bins=age_bins, labels=age_labels, name="AgeGroup"),
    OneHotEncoder(handle_unknown="ignore")
)



In [5]:
from sklearn.compose import ColumnTransformer

# Column groups routed to each preprocessing branch.
categorical_columns = ["Sex", "Embarked", "Pclass"]
family_columns = ["SibSp", "Parch"]
# SibSp and Parch are listed here as well as in the family_size branch.
dropped_columns = ["Name", "Ticket", "SibSp", "Parch", "PassengerId", "Fare", "Cabin"]

family_size_step = SumTransformer(column1="SibSp", column2="Parch")

# Any column not named in a branch is passed through untouched.
preprocessing = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, categorical_columns),
        ("age_group", age_group_pipeline, ["Age"]),
        ("family_size", family_size_step, family_columns),
        ("dropper", "drop", dropped_columns),
    ],
    remainder="passthrough",
)

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest's random draws -- and therefore the scores reported
# below -- are repeatable across kernel restarts.
RANDOM_STATE = 42

rfc = RandomForestClassifier(
    max_depth=10,
    max_leaf_nodes=10,
    max_features=20,
    n_estimators=400,
    random_state=RANDOM_STATE,
)

# Chain preprocessing and the classifier so both are fitted together.
model = make_pipeline(preprocessing, rfc)
model.fit(x, y)
# Scored on the same rows used for fitting, so this is an optimistic figure.
model.score(x, y)

0.8327721661054994

In [7]:
from sklearn.model_selection import cross_val_score

# Out-of-sample estimate: one score per fold, with the fold count left at the
# library default.
cross_val_score(estimator=model, X=x, y=y)

array([0.7877095 , 0.8258427 , 0.81460674, 0.78651685, 0.84831461])

In [8]:
# Predict on the competition test file with the fitted pipeline.
test = pd.read_csv("Data/test.csv")
prediction = model.predict(test)

# Two-column submission frame: passenger identifier and predicted label.
prediction_df = pd.DataFrame(
    {
        "PassengerId": test["PassengerId"],
        "Survived": prediction,
    }
)

# index=False keeps the row index out of the written file.
prediction_df.to_csv("submission.csv", index=False)