In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [4]:
# Read the labelled training rows and the unlabelled test rows in one pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

# Keep the test-set ids aside; they are needed for the submission file.
test_ids = test_df['PassengerId']

In [5]:
# Stack the training rows (target removed) on top of the test rows so that
# engineered columns are added to both sets in a single pass. The first
# len(train_df) rows of data_all are therefore the training rows.
data_all = pd.concat([train_df.drop('Survived', axis=1), test_df], sort=False)

# FamSize = SibSp + Parch + 1 (the +1 presumably counts the passenger; confirm
# against the dataset description).
data_all['FamSize'] = data_all['SibSp'] + data_all['Parch'] + 1

# Compute the fill values from the training rows only. Using statistics of the
# combined train+test frame would leak information about the test
# distribution into the features the model is trained on.
train_rows = data_all.iloc[:len(train_df)]
fare_fill = train_rows['Fare'].median()
embarked_fill = train_rows['Embarked'].mode()[0]
age_fill = train_rows['Age'].median()

data_all['Fare'] = data_all['Fare'].fillna(fare_fill)
data_all['Embarked'] = data_all['Embarked'].fillna(embarked_fill)
data_all['Age'] = data_all['Age'].fillna(age_fill)

In [6]:
# Remove the free-text columns that are not used as features. The drop is
# written as a reassignment with errors='ignore' so the cell can be re-run
# without raising KeyError once the columns are already gone.
data_all = data_all.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

# Split by position: the first len(train_df) rows are the training rows, the
# remainder are the test rows. Copies keep later edits from touching data_all.
n_train = len(train_df)
X_train = data_all.iloc[:n_train].copy()
X_test = data_all.iloc[n_train:].copy()
y_train = train_df['Survived']

In [7]:
# Strip the identifier column from the training features. Reassigning (instead
# of dropping in place) with errors='ignore' keeps the cell re-runnable: a
# second execution is a no-op rather than a KeyError.
X_train = X_train.drop(columns='PassengerId', errors='ignore')
# X_test itself is left untouched; the id-free view gets its own name.
X_test_features = X_test.drop(columns='PassengerId')

# Columns routed to each preprocessing branch; any column not listed here is
# not passed to the model (ColumnTransformer drops the remainder by default).
num_cols = ['Age', 'Fare', 'FamSize']
cat_cols = ['Sex', 'Embarked', 'Pclass']

# Numeric branch: median imputation followed by standardisation.
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Categorical branch: one-hot encoding; handle_unknown='ignore' means a
# category not seen during fit does not raise at transform time.
cat_pipe = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Apply each branch to its own column list and concatenate the results.
preproc = ColumnTransformer(transformers=[
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
])


In [9]:
# Random-forest settings: 150 trees (a few more for stability), depth capped
# at 6 (slightly deeper for more signal), fixed seed, all CPU cores.
rf_params = dict(n_estimators=150, max_depth=6, random_state=42, n_jobs=-1)
rf_clf = RandomForestClassifier(**rf_params)

# Chain preprocessing and the classifier so both are fitted together.
pipe = Pipeline(steps=[('prep', preproc), ('model', rf_clf)])

pipe.fit(X_train, y_train)

In [10]:
# Metrics on the rows the model was fitted on. These are optimistic by
# construction and mainly confirm that the pipeline runs end to end.
y_pred = pipe.predict(X_train)
print(f"Training Accuracy: {accuracy_score(y_train, y_pred):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred))
print(f"F1 Score: {f1_score(y_train, y_pred):.4f}")

# Held-out estimate: 5-fold cross-validation fits a fresh clone of the whole
# pipeline on each training fold and scores it on the remaining fold, so the
# already-fitted `pipe` is left unchanged and preprocessing is refit per fold.
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
print(f"CV Accuracy (5-fold): {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")

# Predict labels for the test rows with the fitted pipeline.
y_test_pred = pipe.predict(X_test_features)

# Pair each test PassengerId with its prediction and write the CSV without
# the index column.
submission_df = pd.DataFrame({'PassengerId': test_ids, 'Survived': y_test_pred})
submission_df.to_csv('submission_pipeline_rf.csv', index=False)
print("\n Submission file saved as 'submission_pipeline_rf.csv'")

Training Accuracy: 0.8732
Confusion Matrix:
 [[526  23]
 [ 90 252]]
F1 Score: 0.8169

 Submission file saved as 'submission_pipeline_rf.csv'
