In [1]:
import pandas as pd

# Load the datasets
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
gender_submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

# Preview the first few rows of each dataset
train_preview = train_data.head()
test_preview = test_data.head()
gender_submission_preview = gender_submission.head()

train_preview, test_preview, gender_submission_preview


(   PassengerId  Survived  Pclass  \
 0            1         0       3   
 1            2         1       1   
 2            3         1       3   
 3            4         1       1   
 4            5         0       3   
 
                                                 Name     Sex   Age  SibSp  \
 0                            Braund, Mr. Owen Harris    male  22.0      1   
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
 2                             Heikkinen, Miss. Laina  female  26.0      0   
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
 4                           Allen, Mr. William Henry    male  35.0      0   
 
    Parch            Ticket     Fare Cabin Embarked  
 0      0         A/5 21171   7.2500   NaN        S  
 1      0          PC 17599  71.2833   C85        C  
 2      0  STON/O2. 3101282   7.9250   NaN        S  
 3      0            113803  53.1000  C123        S  
 4      0            373450   8.0500

In [2]:
# Check for missing values in the train and test datasets
train_data.isnull().sum(), test_data.isnull().sum()


(PassengerId      0
 Survived         0
 Pclass           0
 Name             0
 Sex              0
 Age            177
 SibSp            0
 Parch            0
 Ticket           0
 Fare             0
 Cabin          687
 Embarked         2
 dtype: int64,
 PassengerId      0
 Pclass           0
 Name             0
 Sex              0
 Age             86
 SibSp            0
 Parch            0
 Ticket           0
 Fare             1
 Cabin          327
 Embarked         0
 dtype: int64)

In [3]:
# Fill missing Age with median in both train and test datasets
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)

# Fill missing Embarked values with the most frequent value in train dataset
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)

# Fill missing Fare value in the test dataset with the median Fare
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)

# Drop the Cabin column as it has too many missing values
train_data.drop(columns=['Cabin'], inplace=True)
test_data.drop(columns=['Cabin'], inplace=True)

# Verify that missing values are handled
train_data.isnull().sum(), test_data.isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

(PassengerId    0
 Survived       0
 Pclass         0
 Name           0
 Sex            0
 Age            0
 SibSp          0
 Parch          0
 Ticket         0
 Fare           0
 Embarked       0
 dtype: int64,
 PassengerId    0
 Pclass         0
 Name           0
 Sex            0
 Age            0
 SibSp          0
 Parch          0
 Ticket         0
 Fare           0
 Embarked       0
 dtype: int64)

In [4]:
# Convert categorical features 'Sex' and 'Embarked' into numerical values using one-hot encoding
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'], drop_first=True)

# Drop 'Name' and 'Ticket' columns, as they don't directly contribute to the prediction
train_data.drop(columns=['Name', 'Ticket'], inplace=True)
test_data.drop(columns=['Name', 'Ticket'], inplace=True)

# Check the structure of the cleaned datasets
train_data.head(), test_data.head()


(   PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare  Sex_male  \
 0            1         0       3  22.0      1      0   7.2500      True   
 1            2         1       1  38.0      1      0  71.2833     False   
 2            3         1       3  26.0      0      0   7.9250     False   
 3            4         1       1  35.0      1      0  53.1000     False   
 4            5         0       3  35.0      0      0   8.0500      True   
 
    Embarked_Q  Embarked_S  
 0       False        True  
 1       False       False  
 2       False        True  
 3       False        True  
 4       False        True  ,
    PassengerId  Pclass   Age  SibSp  Parch     Fare  Sex_male  Embarked_Q  \
 0          892       3  34.5      0      0   7.8292      True        True   
 1          893       3  47.0      1      0   7.0000     False       False   
 2          894       2  62.0      0      0   9.6875      True        True   
 3          895       3  27.0      0      0   8.6625   

In [5]:
# Split the train data into features and target
X_train = train_data.drop(columns=['Survived', 'PassengerId'])
y_train = train_data['Survived']

# The test set for prediction
X_test = test_data.drop(columns=['PassengerId'])


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split train set further for validation
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_part, y_train_part)

# Predict on validation set and test set
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation accuracy: {val_accuracy}')

# Make predictions on the test set
y_test_pred = model.predict(X_test)


Validation accuracy: 0.8212290502793296


In [7]:
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': y_test_pred})
submission.to_csv('titanic_predictions.csv', index=False)
