In [1]:
import sys

import numpy as np
import pandas as pd
import polars as pl
import tensorflow as tf

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from xgboost import XGBClassifier
from skopt import BayesSearchCV
import skopt.space as ss

# NOTE(review): absolute, machine-specific path; the local import below only
# resolves on a machine where this directory exists.
sys.path.append("/Users/leon/Desktop/kaggle/")
import src.titanic.transformer_impl as ti

# Data Loading & Transformation

In [2]:
# Read the train and test CSVs into polars frames.
# Both paths are relative to the notebook's working directory.
df = pl.read_csv("./data/train.csv")
test_df = pl.read_csv("./data/test.csv")

## Training Data Transformations

In [3]:
# Run the project transformer over the training frame. The result is indexed
# as a 2-D array below: every column but the last becomes the feature matrix
# and the last column becomes the flattened target vector.
# NOTE(review): assumes `transform` puts the label in the final column - confirm
# against TitanicTransformer.
tfr = ti.TitanicTransformer()
data = tfr.transform(df)
X, y = data[:, :-1], data[:, -1].ravel()
X.shape

(891, 27)

## Test Data Transformations

In [6]:
# Transform the test frame with the same transformer instance used for training.
# NOTE(review): later cells reassign `X_test` to a held-out slice of `X`, so
# this value is overwritten as soon as one of those cells runs.
X_test = tfr.transform(test_df)

# Best Model

In [4]:
# Baseline: hold out part of the training data and score a default random forest.
# The split and the forest are both seeded so the displayed accuracy can be
# reproduced on a fresh kernel; the split is stratified on `y` to keep the class
# balance the same in both parts, matching the stratified splits used further down.
Xtr, Xte, ytr, yte = train_test_split(
	X,
	y,
	shuffle=True,
	stratify=y,
	random_state=42,
)
rf_clf2 = RandomForestClassifier(random_state=42)
rf_clf2.fit(Xtr, ytr)
rf_clf2.score(Xte, yte)

0.820627802690583

In [8]:
# 5-fold grid search over the random-forest size and the number of features
# considered per split, fitted on the training part of the hold-out split.
# GridSearchCV is already imported in the notebook's first cell.
# The estimator is seeded so that differences between candidates come from the
# hyper-parameters rather than from run-to-run randomness.
rf_clf = RandomForestClassifier(random_state=42)
params = [
	{"n_estimators": [99, 100, 101], "max_features": [10, 11, 12]}
]

res = GridSearchCV(
	estimator=rf_clf,
	param_grid=params,
	cv=5,
	scoring="accuracy"
).fit(Xtr, ytr)

# Show the winning combination so the next step does not have to guess it.
res.best_params_

In [9]:
# Refit on every labelled row using the hyper-parameters the grid search selected.
# The previous hard-coded n_estimators=11 was not one of the searched values
# (99, 100, 101), so the final model did not correspond to any evaluated candidate.
final_model = RandomForestClassifier(
	**res.best_params_,
	random_state=42,
).fit(X, y)

In [None]:
# Predict one label per test passenger with the final random forest and write
# the result as a two-column CSV (PassengerId, Survived).
solutions = {}

for row in test_df.iter_rows(named=True):
	# Named `passenger_id` rather than `id` so the builtin `id()` is not
	# shadowed for the rest of the kernel session.
	passenger_id = row["PassengerId"]
	# Each passenger is transformed on its own as a one-row frame.
	# NOTE(review): a single batch call `tfr.transform(test_df)` would avoid the
	# per-row overhead, but only if the transformer is row-independent - confirm.
	features = tfr.transform(pl.DataFrame(row))
	solutions[passenger_id] = final_model.predict(features)

with open(file="./submission.csv", mode="w") as f:
	f.write("PassengerId,Survived\n")
	for passenger_id, prediction in solutions.items():
		# `predict` returns a length-1 array here; write its single value as an int.
		f.write(f"{passenger_id},{int(prediction[0])}\n")

In [None]:
# Bayesian hyper-parameter search for XGBoost on the training part of a single
# stratified shuffle split (the splitter's default test fraction is held out).
model = XGBClassifier()

spaces = {
	"n_estimators": ss.Integer(1000, 1500),
	"learning_rate": ss.Real(0.01, 0.2)
}

# n_splits=1 produces exactly one (train, held-out) index pair, so take it
# directly instead of looping over a one-element generator.
sss = StratifiedShuffleSplit(n_splits=1, random_state=42)
train_index, test_index = next(sss.split(X, y))
# NOTE: `X_test` / `y_test` here are the held-out rows of the labelled data,
# not the features of the test CSV.
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# The search is seeded like the splitter so the sampled candidates (and hence
# the selected parameters) are the same on every run.
opt = BayesSearchCV(
	estimator=model,
	search_spaces=spaces,
	cv=5,
	random_state=42,
).fit(X_train, y_train)

In [4]:
# XGBoost with early stopping, trained on one stratified shuffle split.
# Requires `X` and `y` from the data-transformation cell; the saved output of
# this cell is a NameError because it was run before they were defined.
sss = StratifiedShuffleSplit(n_splits=1, random_state=42)
train_index, test_index = next(sss.split(X, y))  # the single split
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# NOTE(review): the held-out rows are passed as an evaluation set for early
# stopping and are also what the score below is computed on.
model2 = XGBClassifier(
	learning_rate=0.01,
	n_estimators=1100,
	early_stopping_rounds=50,
).fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
model2.score(X_test, y_test)

NameError: name 'X' is not defined

In [6]:
# Same recipe as the previous cell, stored as `model3`: XGBoost with early
# stopping on one stratified shuffle split, followed by held-out accuracy.
sss = StratifiedShuffleSplit(n_splits=1, random_state=42)
train_index, test_index = next(sss.split(X, y))  # the single split
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# Both the training and the held-out rows are logged per boosting round.
# NOTE(review): the held-out rows are used both for early stopping and for the
# score below.
model3 = XGBClassifier(
	learning_rate=0.01,
	n_estimators=1100,
	early_stopping_rounds=50,
).fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
model3.score(X_test, y_test)

[0]	validation_0-logloss:0.66081	validation_1-logloss:0.66417
[1]	validation_0-logloss:0.65609	validation_1-logloss:0.66012
[2]	validation_0-logloss:0.65146	validation_1-logloss:0.65615
[3]	validation_0-logloss:0.64692	validation_1-logloss:0.65227
[4]	validation_0-logloss:0.64248	validation_1-logloss:0.64847
[5]	validation_0-logloss:0.63813	validation_1-logloss:0.64475
[6]	validation_0-logloss:0.63386	validation_1-logloss:0.64110
[7]	validation_0-logloss:0.62968	validation_1-logloss:0.63753
[8]	validation_0-logloss:0.62558	validation_1-logloss:0.63404
[9]	validation_0-logloss:0.62156	validation_1-logloss:0.63061
[10]	validation_0-logloss:0.61761	validation_1-logloss:0.62726
[11]	validation_0-logloss:0.61374	validation_1-logloss:0.62397
[12]	validation_0-logloss:0.60995	validation_1-logloss:0.62075
[13]	validation_0-logloss:0.60623	validation_1-logloss:0.61759
[14]	validation_0-logloss:0.60257	validation_1-logloss:0.61449
[15]	validation_0-logloss:0.59898	validation_1-logloss:0.61143
[1

0.7777777777777778

In [7]:
# Predict one label per test passenger with `model3` and write the result as a
# two-column CSV (PassengerId, Survived).
solutions = {}

for row in test_df.iter_rows(named=True):
	# Named `passenger_id` rather than `id` so the builtin `id()` is not
	# shadowed for the rest of the kernel session.
	passenger_id = row["PassengerId"]
	# Each passenger is transformed on its own as a one-row frame.
	# NOTE(review): a single batch call `tfr.transform(test_df)` would avoid the
	# per-row overhead, but only if the transformer is row-independent - confirm.
	features = tfr.transform(pl.DataFrame(row))
	solutions[passenger_id] = model3.predict(features)

with open(file="./submission.csv", mode="w") as f:
	f.write("PassengerId,Survived\n")
	for passenger_id, prediction in solutions.items():
		# `predict` returns a length-1 array here; write its single value as an int.
		f.write(f"{passenger_id},{int(prediction[0])}\n")

In [8]:
# Exhaustive 5-fold grid search for XGBoost on the training part of a single
# stratified shuffle split, then accuracy of the refitted best estimator on the
# held-out rows.
sss = StratifiedShuffleSplit(n_splits=1, random_state=42)

param_grid = {
	"n_estimators": [1400, 1500, 1700],
	"learning_rate": [0.01, 0.02]
}

# n_splits=1 produces exactly one (train, held-out) index pair.
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

opt = GridSearchCV(
	# Build the estimator here instead of reusing `model` from the Bayesian
	# search cell, so this cell runs without that (expensive) cell having been
	# executed first.
	estimator=XGBClassifier(),
	param_grid=param_grid,
	cv=5,
	verbose=True
).fit(X_train, y_train)
print(opt.best_estimator_.score(X_test, y_test))
print(opt.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
0.7777777777777778
{'learning_rate': 0.01, 'n_estimators': 1500}


In [22]:
# Predict one label per test passenger with the grid search's best estimator
# and write the result as a two-column CSV (PassengerId, Survived).
solutions = {}

for row in test_df.iter_rows(named=True):
	# Named `passenger_id` rather than `id` so the builtin `id()` is not
	# shadowed for the rest of the kernel session.
	passenger_id = row["PassengerId"]
	# Each passenger is transformed on its own as a one-row frame.
	# NOTE(review): a single batch call `tfr.transform(test_df)` would avoid the
	# per-row overhead, but only if the transformer is row-independent - confirm.
	features = tfr.transform(pl.DataFrame(row))
	solutions[passenger_id] = opt.best_estimator_.predict(features)

with open(file="./submission.csv", mode="w") as f:
	f.write("PassengerId,Survived\n")
	for passenger_id, prediction in solutions.items():
		# `predict` returns a length-1 array here; write its single value as an int.
		f.write(f"{passenger_id},{int(prediction[0])}\n")

0.8222222222222222
{'learning_rate': 0.01, 'n_estimators': 1500}
