In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Read the training and test CSV files (in that order) into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# Combine train and test so every preprocessing step is applied identically to both.
# The test rows get a NaN 'Survived' placeholder; that NaN is what later separates
# them from the training rows again.
# NOTE(review): the fill statistics below are therefore computed over train AND test rows.
test['Survived'] = np.nan
# ignore_index=True gives every row a unique label. Without it the row labels of
# `train` and `test` are kept as-is and can collide, which makes label-based
# selection/assignment on `combined` ambiguous.
combined = pd.concat([train, test], sort=False, ignore_index=True)

# Data Preprocessing

# Fill missing 'Embarked' values with the mode.
# The result is assigned back rather than using `combined[col].fillna(..., inplace=True)`:
# that chained in-place form is deprecated and, with pandas Copy-on-Write enabled,
# only modifies a temporary object and leaves `combined` unchanged.
combined['Embarked'] = combined['Embarked'].fillna(combined['Embarked'].mode()[0])

# Fill missing 'Fare' values with the median
combined['Fare'] = combined['Fare'].fillna(combined['Fare'].median())

# Extract the title: the text between the first comma and the next period
# (e.g. "Braund, Mr. Owen Harris" -> "Mr").
combined['Title'] = combined['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())

# Simplify titles into a few broad groups.
# The extraction above keeps every word before the period, so a name such as
# "Rothes, the Countess. of (...)" yields 'the Countess', not 'Countess'; both
# spellings are listed so that title is actually merged into 'Mrs'.
combined['Title'] = combined['Title'].replace(['Mlle', 'Ms'], 'Miss')
combined['Title'] = combined['Title'].replace(['Mme', 'Lady', 'Countess', 'the Countess', 'Dona'], 'Mrs')
combined['Title'] = combined['Title'].replace(['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'], 'Mr')

# Fill missing 'Age' values with the median age of the passenger's Title group
combined['Age'] = combined.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))
# Fallback: a Title group with no known ages has a NaN median and would stay
# unfilled, so any remaining gaps get the overall median age.
combined['Age'] = combined['Age'].fillna(combined['Age'].median())

In [None]:
# Remove the columns that are not passed on to the model
combined.drop(columns=['PassengerId', 'Ticket', 'Cabin', 'Name'], inplace=True)

# Integer-encode the categorical columns; a single encoder is re-fitted for each
# column in turn, so afterwards it only remembers the classes of the last one.
label_encoder = LabelEncoder()
for categorical_column in ('Sex', 'Embarked', 'Title'):
    combined[categorical_column] = label_encoder.fit_transform(combined[categorical_column])


In [None]:
# Rows with a known 'Survived' value are the training rows; rows where it is NaN
# are the test rows (their placeholder column is removed again).
has_label = combined['Survived'].notna()
train_cleaned = combined.loc[has_label]
test_cleaned = combined.loc[~has_label].drop(columns='Survived')

# Target and feature matrix for fitting, plus the feature matrix to predict on
y_train = train_cleaned['Survived']
X_train = train_cleaned.drop(columns='Survived')
X_test = test_cleaned

In [None]:
# Hyperparameter tuning: grid search over a random forest with 5-fold cross-validation

# Base estimator with a fixed random_state
rf = RandomForestClassifier(random_state=42)

# Candidate values for each hyperparameter
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Configure the search, then fit it on the training features and target
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)


In [None]:
# Take the best estimator found by the grid search
best_rf = grid_search.best_estimator_

# Predict on the test features
test_predictions = best_rf.predict(X_test)

# Build the submission table: the test set's PassengerId column plus the
# predictions cast to integers
submission = test[['PassengerId']].assign(Survived=test_predictions.astype(int))

# Write the submission to CSV without the index column
submission.to_csv('submission.csv', index=False)