In [1]:
import pandas as pd

# Load the datasets
train_data = pd.read_csv('/kaggle/input/rhns-titanic-dataset/train.csv')
test_data = pd.read_csv('/kaggle/input/rhns-titanic-dataset/test.csv')

# Display the first few rows of the training data
print(train_data.head())

# Display the first few rows of the test data
print(test_data.head())

# Check for missing values in the datasets
print(train_data.isnull().sum())
print(test_data.isnull().sum())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [2]:
from sklearn.preprocessing import LabelEncoder

# Fill missing values
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
train_data['Cabin'].fillna('U', inplace=True)

test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)
test_data['Cabin'].fillna('U', inplace=True)

# Encode categorical features
le = LabelEncoder()
train_data['Sex'] = le.fit_transform(train_data['Sex'])
test_data['Sex'] = le.transform(test_data['Sex'])

train_data['Embarked'] = le.fit_transform(train_data['Embarked'])
test_data['Embarked'] = le.transform(test_data['Embarked'])

# Feature Engineering: Extract Title from Name
train_data['Title'] = train_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test_data['Title'] = test_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Simplify Titles
title_mapping = {
    "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Col": 7, "Major": 7,
    "Mlle": 8, "Countess": 9, "Ms": 2, "Lady": 9, "Jonkheer": 10, "Don": 10, "Dona": 10,
    "Mme": 3, "Capt": 7, "Sir": 10
}
train_data['Title'] = train_data['Title'].map(title_mapping)
train_data['Title'].fillna(0, inplace=True)

test_data['Title'] = test_data['Title'].map(title_mapping)
test_data['Title'].fillna(0, inplace=True)

# Create FamilySize feature
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1

# Drop unnecessary columns
train_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# Final check
print(train_data.head())
print(test_data.head())


   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  \
0            1         0       3    1  22.0      1      0   7.2500         2   
1            2         1       1    0  38.0      1      0  71.2833         0   
2            3         1       3    0  26.0      0      0   7.9250         2   
3            4         1       1    0  35.0      1      0  53.1000         2   
4            5         0       3    1  35.0      0      0   8.0500         2   

   Title  FamilySize  
0      1           2  
1      3           2  
2      2           1  
3      3           2  
4      1           1  
   PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  Title  \
0          892       3    1  34.5      0      0   7.8292         1      1   
1          893       3    0  47.0      1      0   7.0000         2      3   
2          894       2    1  62.0      0      0   9.6875         1      1   
3          895       3    1  27.0      0      0   8.6625         2      1

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Use the entire training set for both training and evaluation
X = train_data.drop(['Survived', 'PassengerId'], axis=1)
y = train_data['Survived']

# Initialize a RandomForestClassifier with high complexity
model = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=64)
model.fit(X, y)

# Evaluate the model on the training data
y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)
print(f'Training Accuracy: {accuracy:.4f}')


Training Accuracy: 0.9820


In [25]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the datasets
train_data = pd.read_csv('/kaggle/input/rhns-titanic-dataset/train.csv')
test_data = pd.read_csv('/kaggle/input/rhns-titanic-dataset/test.csv')

# Fill missing values
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
train_data['Cabin'].fillna('U', inplace=True)

test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)
test_data['Cabin'].fillna('U', inplace=True)

# Encode categorical features
le = LabelEncoder()
train_data['Sex'] = le.fit_transform(train_data['Sex'])
test_data['Sex'] = le.transform(test_data['Sex'])

train_data['Embarked'] = le.fit_transform(train_data['Embarked'])
test_data['Embarked'] = le.transform(test_data['Embarked'])

# Feature Engineering: Extract Title from Name
train_data['Title'] = train_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test_data['Title'] = test_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Simplify Titles
title_mapping = {
    "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Col": 7, "Major": 7,
    "Mlle": 8, "Countess": 9, "Ms": 2, "Lady": 9, "Jonkheer": 10, "Don": 10, "Dona": 10,
    "Mme": 3, "Capt": 7, "Sir": 10
}
train_data['Title'] = train_data['Title'].map(title_mapping)
train_data['Title'].fillna(0, inplace=True)

test_data['Title'] = test_data['Title'].map(title_mapping)
test_data['Title'].fillna(0, inplace=True)

# Create FamilySize feature
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1

# Create IsAlone feature
train_data['IsAlone'] = (train_data['FamilySize'] == 1).astype(int)
test_data['IsAlone'] = (test_data['FamilySize'] == 1).astype(int)

# Create Age*Class feature
train_data['Age*Class'] = train_data['Age'] * train_data['Pclass']
test_data['Age*Class'] = test_data['Age'] * test_data['Pclass']

# Drop unnecessary columns
train_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the model and the parameter grid
model = RandomForestClassifier(random_state=64)
param_grid = {
    'n_estimators': [100, 300, 500, 800, 1200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Grid Search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X, y)

# Print the best parameters and score
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')


Fitting 5 folds for each of 360 candidates, totalling 1800 fits
[CV] END bootstrap=True, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END bootstrap=True, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.0s
[CV] END bootstrap=True, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   1.7s
[CV] END bootstrap=True, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   1.7s
[CV] END bootstrap=True, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   2.9s
[CV] END bootstrap=True, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=1200; total time=   4.3s
[CV] END bootstrap=True, max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] EN

In [None]:
# Train the model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X, y)

# Evaluate the model on the training data
y_pred = best_model.predict(X)
accuracy = accuracy_score(y, y_pred)
print(f'Training Accuracy: {accuracy:.4f}')

In [23]:
# Prepare the test data
X_test = test_data.drop('PassengerId', axis=1)

# Make predictions
test_predictions = best_model.predict(X_test)

# Prepare the submission file
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_predictions
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print('Submission file created successfully.')


Submission file created successfully.
