In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

print("Setup complete")

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
Setup complete


In [2]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
from sklearn.model_selection import train_test_split

# drop rows missing Survived predictor data
train_data.dropna(axis = 0, subset=['Survived'], inplace = True)

# split predictor from features in train data
# test data already excludes predictor
y_train_full = train_data.Survived
X_train_full = train_data.drop(['Survived'], axis = 1)

# split train test data
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, 
                                                        train_size = 0.8, test_size = 0.2, random_state = 0)

# Select low cardinality cols as categorical cols and numeric cols, concat both
categorical_cols = [cname for cname in X_train_full.columns 
                        if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtypes == 'object']

numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtypes in ['int64','float64']]

my_cols = categorical_cols + numeric_cols

In [4]:
# Pipeline building
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# preprocessing pipeline
numerical_transformer = SimpleImputer(strategy = 'mean')
categorical_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'most_frequent')),
                                            ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

preprocessing_pipeline = ColumnTransformer(transformers =
                                           [('numerical', numerical_transformer, numeric_cols),
                                            ('categorical', categorical_transformer, categorical_cols)])

In [5]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error as mae

# Apply XGBoost model
model = XGBRegressor(random_state = 0, n_estimators = 500, learning_rate = 0.03)

# model pipeline
model_pipeline = Pipeline(steps = [('preprocessing', preprocessing_pipeline),
                                  ('model', model)])

# model fitting & predicting
model_pipeline.fit(X_train, y_train)
pred = model_pipeline.predict(X_valid)

# mean absolute error
MAE = mae(pred, y_valid)

print(MAE)

0.22488391689332488


In [6]:
# Final prediction
test_data['Survived'] = abs(model_pipeline.predict(test_data).round()).astype('int64')

# Submission

final_submission = test_data[['PassengerId','Survived']]
final_submission.to_csv('submission.csv', index=False)