In [3]:
import polars as pl
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import sys
sys.path.append("/Users/leon/Desktop/kaggle/")
import src.titanic.transformer_impl as ti

# Data Loading & Transformation

In [4]:
# Read the labelled training split and the unlabelled test split with polars.
train_path, test_path = "./data/train.csv", "./data/test.csv"
df = pl.read_csv(train_path)
test_df = pl.read_csv(test_path)

## Training Data Transformations

In [5]:
# Build the project transformer and run the training frame through it.
tfr = ti.TitanicTransformer()
data = tfr.transform(df)

# The last column is split off as the target; every column before it forms
# the feature matrix.
X, y = data[:, :-1], data[:, -1].ravel()
X.shape

(891, 39)

## Test Data Transformations

In [6]:
# Run the test frame through the same transformer instance used for training.
# NOTE(review): X_test is not referenced again in the visible cells — the
# prediction cell re-transforms test_df one row at a time instead; confirm
# whether this batch result is meant to be used.
X_test = tfr.transform(test_df)

# Best Model

In [7]:
from sklearn.model_selection import train_test_split

# Quick baseline: hold out part of the training data (train_test_split's
# default test size) and score an untuned random forest on it.
# Both the split and the forest are seeded so the displayed score is the same
# on every Restart & Run All.
Xtr, Xte, ytr, yte = train_test_split(X, y, shuffle=True, random_state=42)
rf_clf2 = RandomForestClassifier(random_state=42)
rf_clf2.fit(Xtr, ytr)
rf_clf2.score(Xte, yte)  # mean accuracy on the held-out rows

0.8295964125560538

In [8]:
from sklearn.model_selection import GridSearchCV

# Seeded so every candidate is evaluated under the same forest randomness and
# the search picks the same winner on each run.
rf_clf = RandomForestClassifier(random_state=42)

# Narrow grid around 100 trees and 11 features considered per split.
params = [
	{"n_estimators": [99, 100, 101], "max_features": [10, 11, 12]}
]

# 5-fold cross-validated accuracy, fitted on the training split only; the
# held-out Xte/yte are not seen by the search.
res = GridSearchCV(
	estimator=rf_clf,
	param_grid=params,
	cv=5,
	scoring="accuracy"
).fit(Xtr, ytr)

# Display the winning combination and its mean cross-validated accuracy.
res.best_params_, res.best_score_

In [9]:
# Refit on the full training set with the hyper-parameters selected by the
# grid search (`res`), rather than hard-coded values: the previous
# n_estimators=11 was never part of the searched grid (99, 100, 101).
# Seeded so the submitted predictions are reproducible.
final_model = RandomForestClassifier(
	**res.best_params_,
	random_state=42
).fit(X, y)

In [10]:
# Map each PassengerId to the model's prediction for that passenger.
# Each value is the array returned by `predict` for a one-row frame, so
# consumers read the label with `[0]`.
# Built as a comprehension so no loop variables leak into the kernel
# namespace (the previous loop rebound the builtin `id`).
# NOTE(review): rows are transformed one at a time; `X_test` computed earlier
# holds the whole-frame transform — confirm that a single batched
# `final_model.predict(X_test)` gives the same result before switching to it.
solutions = {
	row["PassengerId"]: final_model.predict(tfr.transform(pl.DataFrame(row)))
	for row in test_df.iter_rows(named=True)
}

In [None]:
# Write the submission file: a header line, then one "<id>,<label>" line per
# entry in `solutions`, in insertion order.
with open("./submission.csv", "w") as out:
	out.write("PassengerId,Survived\n")
	for k in solutions:
		# Each stored prediction is a one-element array; emit it as a plain int.
		out.write(f"{k},{int(solutions[k][0])}\n")