In [None]:
# Authenticate this session with Kaggle through kagglehub before any download.
import kagglehub
kagglehub.login()


In [None]:
# Fetch the 'titanic' competition files; the call's return value is kept in
# `titanic_path` (presumably the local download location - confirm against
# the kagglehub documentation).
titanic_path = kagglehub.competition_download('titanic')

print('Data source import complete.')


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))



In [None]:
import matplotlib.pyplot as plt

In [None]:
# Read the CSVs from the directory kagglehub reported instead of a hard-coded
# Kaggle mount point, so the notebook also runs where the files are downloaded
# to a local cache rather than to /kaggle/input.
train_data = pd.read_csv(os.path.join(titanic_path, "train.csv"))
test_data = pd.read_csv(os.path.join(titanic_path, "test.csv"))

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Give the test frame a placeholder label column (all zeros) when it has none.
# The guard makes the cell safe to re-run. Note that this mutates `test_data`
# in place.
if 'Survived' not in test_data.columns:
    test_data['Survived'] = 0

In [None]:
def clean_and_prepare_data(train, test):
    """Clean the train and test frames together and add engineered features.

    The two frames are stacked so that imputation and one-hot encoding yield
    the same columns for both, then split back apart by row position.

    Parameters
    ----------
    train : pandas.DataFrame
        Labelled rows. Must provide the columns used below: ``Name``,
        ``Ticket``, ``Age``, ``Cabin``, ``Embarked``, ``Fare``, ``Sex`` and
        ``Pclass``, plus ``Survived``.
    test : pandas.DataFrame
        Rows to predict, with the same feature columns. ``Survived`` must be
        present in at least one of the two frames because it is dropped from
        the test half at the end.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_cleaned, test_cleaned)``. ``train_cleaned`` holds the first
        ``len(train)`` rows and keeps ``Survived``; ``test_cleaned`` holds the
        remaining rows without ``Survived``. Original row indices are kept.
    """
    combined = pd.concat([train, test], axis=0)

    # Free-text columns are not used as features.
    combined = combined.drop(['Name', 'Ticket'], axis=1)

    # NOTE(review): the means below are computed over train and test rows
    # together, so test-set values influence the imputed training values.
    combined['Age'] = combined['Age'].fillna(combined['Age'].mean())
    combined['Cabin'] = combined['Cabin'].fillna('X000')  # sentinel: deck 'X', number 0
    combined['Embarked'] = combined['Embarked'].fillna('X')  # sentinel category
    combined['Fare'] = combined['Fare'].fillna(combined['Fare'].mean())

    # Split the cabin code into its first run of letters and first run of digits.
    combined['CabinLetter'] = combined['Cabin'].str.extract(r'([a-zA-Z]+)', expand=False)
    combined['CabinNumber'] = combined['Cabin'].str.extract(r'(\d+)', expand=False)

    combined = combined.drop('Cabin', axis=1)
    combined = pd.get_dummies(combined, columns=['CabinLetter', 'Embarked', 'Sex'], prefix=['Cabin', 'Embarked', 'Sex'])
    # The sentinel dummies exist only when the input actually had missing
    # values, so tolerate their absence instead of raising KeyError.
    combined = combined.drop(['Cabin_X', 'Embarked_X'], axis=1, errors='ignore')

    # Convert to numbers first, then fill cabins without a digit part with 0;
    # this avoids holding strings and ints in the same column.
    combined['CabinNumber'] = pd.to_numeric(combined['CabinNumber']).fillna(0).astype('int64')
    # Floor division: the fractional part of the ratio is discarded.
    combined['FarePerClass'] = combined['Fare'] // combined['Pclass']
    combined['ClassSexInteraction'] = combined['Pclass'] - combined['Sex_female']

    # Undo the stacking by position: the first len(train) rows came from `train`.
    train_cleaned = combined[:len(train)]
    test_cleaned = combined[len(train):]
    test_cleaned = test_cleaned.drop('Survived', axis=1)
    return train_cleaned, test_cleaned

In [None]:
# Run the shared cleaning / feature-engineering step on both frames.
cleaned_train, cleaned_test = clean_and_prepare_data(train_data, test_data)

In [None]:
# Separate the label column from the predictor columns.
target = cleaned_train['Survived']
features = cleaned_train.drop(columns='Survived')

# Correlation of every column with the label.
correlation = cleaned_train.corr()['Survived']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

# Keep the labels one-dimensional: scikit-learn classifiers expect y with
# shape (n_samples,) and emit DataConversionWarning for a column vector.
y_train = y_train.to_numpy()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline linear model. The features are not scaled, and the solver's default
# budget of 100 iterations is typically too small to converge on such data,
# so the iteration limit is raised.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
log_reg_pred = log_reg.predict(X_val)
log_reg_accuracy = accuracy_score(y_val, log_reg_pred)
print(f"Logistic Regression Accuracy: {log_reg_accuracy}")

In [None]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Gradient-boosted trees: fit on the training split, score on the hold-out split.
xgb_clf = XGBClassifier(enable_categorical=True).fit(X_train, y_train)
xgb_pred = xgb_clf.predict(X_val)
xgb_accuracy = accuracy_score(y_true=y_val, y_pred=xgb_pred)
print(f"XGBoost Accuracy: {xgb_accuracy}")

In [None]:
# Random forest: seeded so the validation score and the predictions derived
# from this model are reproducible across runs (same seed as the split).
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_val)
rf_accuracy = accuracy_score(y_val, rf_pred)
print(f"Random Forest Accuracy: {rf_accuracy}")

In [None]:
# Predict labels for the cleaned test rows with the fitted random forest.
test_predictions = rf_clf.predict(cleaned_test)

In [None]:
# Assemble the two-column submission table and write it without the index.
submission = pd.DataFrame({
    'PassengerId': cleaned_test['PassengerId'],
    'Survived': test_predictions,
})
submission.to_csv('submission.csv', index=False)