In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input mount,
# then print them one per line.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

In [None]:
# Remove rows with missing target, separate target from predictors.
# The frame is rebound rather than mutated with inplace=True, and the whole step
# is guarded on the presence of 'Survived': once the target has been split off,
# re-running this cell leaves `train_df` and `y` untouched instead of raising a
# KeyError on the missing column.
if 'Survived' in train_df.columns:
    train_df = train_df.dropna(axis=0, subset=['Survived'])
    y = train_df['Survived']
    train_df = train_df.drop(columns=['Survived'])

In [None]:
# Hold out 20% of the rows as a validation set; the fixed random_state keeps the
# split identical between runs.
split_options = dict(train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(train_df, y, **split_options)

In [None]:

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
# Walk the training columns once and sort them into two buckets:
#   * object-dtype columns with fewer than 10 distinct values -> categorical
#   * int64 / float64 columns                                  -> numerical
# Anything else (e.g. high-cardinality text columns) is left out.
categorical_cols = []
numerical_cols = []
for column in X_train.columns:
    column_dtype = X_train[column].dtype
    if column_dtype == "object":
        if X_train[column].nunique() < 10:
            categorical_cols.append(column)
    elif column_dtype in ['int64', 'float64']:
        numerical_cols.append(column)

# Restrict every split (and the test data) to the selected columns, working on copies
my_cols = categorical_cols + numerical_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy() for frame in (X_train, X_valid, test_df)
)

In [None]:
# Preprocessing for numerical data: fill missing entries with the column's most frequent value
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Preprocessing for categorical data: impute the most frequent category, then one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category that
# was not present when it was fitted; the default ('error') would raise at transform
# time as soon as the validation or test data contains such a value.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [None]:
from xgboost import XGBRegressor, XGBClassifier

# Fit the preprocessor on the training split only, then push both the training
# and the validation split through the fitted transformer.
preprocessor.fit(X_train)
X_train_transformed, X_valid_transformed = (
    preprocessor.transform(split) for split in (X_train, X_valid)
)

def xgb_model(n_est, lr, pred):
    """Fit an XGBClassifier and score its predictions against ``y_valid``.

    The classifier is trained on the module-level ``X_train_transformed`` /
    ``y_train`` and receives ``(X_valid_transformed, y_valid)`` as its
    evaluation set.

    Args:
        n_est: Forwarded to the classifier as ``n_estimators``.
        lr: Forwarded to the classifier as ``learning_rate``.
        pred: Feature matrix to predict on. The predictions are compared with
            ``y_valid``, so its rows must line up with the validation labels.

    Returns:
        A tuple ``(fitted_classifier, mae, acc)`` where ``mae`` is the mean
        absolute error and ``acc`` the accuracy of the predictions with
        respect to ``y_valid``.
    """
    classifier = XGBClassifier(n_estimators=n_est, learning_rate=lr)
    classifier.fit(
        X_train_transformed,
        y_train,
        eval_set=[(X_valid_transformed, y_valid)],
        verbose=False,
    )
    predicted = classifier.predict(pred)
    return (
        classifier,
        mean_absolute_error(y_valid, predicted),
        accuracy_score(y_valid, predicted),
    )

# Best candidate seen so far, selected by validation accuracy.
best_mae = float('inf')
best_model = None
best_acc = 0
best_params = None

# Search grid. n_estimators has to be an integer, so the linspace values are
# truncated to ints up front (same truncation as int(n)); this way the value
# that is logged is exactly the value the model is trained with.
n_values = np.linspace(180, 220, 25).astype(int)
l_values = np.linspace(0.09, 0.11, 20)

# Hyperparameter tuning: exhaustive grid search scored on the validation split.
# NOTE: because the validation split drives the selection, the accuracy of the
# winning model on that same split is an optimistic estimate.
for n in n_values:
    for l in l_values:
        model, mae, acc = xgb_model(int(n), l, X_valid_transformed)
        print(f"N est: {n}, lr: {l}, ACC: {acc}")
        if acc > best_acc:
            # Keep every "best_*" value in sync with the winning model.
            best_acc = acc
            best_mae = mae
            best_model = model
            best_params = {'n_estimators': int(n), 'learning_rate': float(l)}

best_model

In [None]:
# Re-score the selected model on the validation split.
best_preds = best_model.predict(X_valid_transformed)
best_model_accuracy = accuracy_score(y_true=y_valid, y_pred=best_preds)

# Bare expression so the notebook displays the accuracy as the cell output
best_model_accuracy

In [None]:
# Make predictions on test data using the best model
X_test_transformed = preprocessor.transform(X_test)
test_preds = best_model.predict(X_test_transformed)

# Build the submission frame directly from the raw test data. X_test is a column
# subset of test_df, so the rows are in the same order as the predictions.
# Taking the ids from test_df keeps this step working even if 'PassengerId' is
# not among the model features, and avoids a dict round-trip that would
# silently collapse duplicate ids.
assert len(test_preds) == len(test_df), "one prediction per test row expected"
df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].to_numpy(),
    'Survived': test_preds,
})
df.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
