In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [4]:
# In[0.2] Importing the Data

# Read the training CSV from the Kaggle input directory and print its
# column summary (dtypes and non-null counts).
train_csv_path = '/kaggle/input/titanic/train.csv'
df_train_data = pd.read_csv(train_csv_path)
df_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# In[0.3] Handling Missing Data

# Encode Cabin as a 1/0 indicator (1 = a cabin value is present, 0 = missing)
has_cabin = df_train_data['Cabin'].notna()
df_train_data['Cabin'] = has_cabin.astype(int)

# Impute missing ages with the mean of the observed ages
mean_age = df_train_data['Age'].mean()
df_train_data['Age'] = df_train_data['Age'].fillna(mean_age).astype(float)

# Discard the rows that have no Embarked value
df_train_data = df_train_data.dropna(subset=['Embarked'])

df_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        889 non-null    int64  
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 90.3+ KB


In [6]:
# In[0.4] Transforming Features into Dummies

# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, and dtype=int produces integer 0/1 indicators.
categorical_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

df_train_data = pd.get_dummies(
    df_train_data,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)

In [7]:
# In[0.5] Dropping PassengerId, Name, and Ticket Columns

# Columns that are excluded from the model
list_columns = ['Name', 'PassengerId', 'Ticket']

df_train_data = df_train_data.drop(columns=list_columns)

df_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    889 non-null    int64  
 1   Age         889 non-null    float64
 2   Fare        889 non-null    float64
 3   Cabin       889 non-null    int64  
 4   Pclass_2    889 non-null    int64  
 5   Pclass_3    889 non-null    int64  
 6   Sex_male    889 non-null    int64  
 7   SibSp_1     889 non-null    int64  
 8   SibSp_2     889 non-null    int64  
 9   SibSp_3     889 non-null    int64  
 10  SibSp_4     889 non-null    int64  
 11  SibSp_5     889 non-null    int64  
 12  SibSp_8     889 non-null    int64  
 13  Parch_1     889 non-null    int64  
 14  Parch_2     889 non-null    int64  
 15  Parch_3     889 non-null    int64  
 16  Parch_4     889 non-null    int64  
 17  Parch_5     889 non-null    int64  
 18  Parch_6     889 non-null    int64  
 19  Embarked_Q  889 non-null    int64 

In [8]:
# In[0.6] Separating Dependent (Y) and Independent (X) Variables

# Target: the 'Survived' column
train_y = df_train_data['Survived']

# Features: every column except the target
train_x = df_train_data.drop(columns=['Survived'])

# Summaries are printed features first, then target
train_x.info()
train_y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         889 non-null    float64
 1   Fare        889 non-null    float64
 2   Cabin       889 non-null    int64  
 3   Pclass_2    889 non-null    int64  
 4   Pclass_3    889 non-null    int64  
 5   Sex_male    889 non-null    int64  
 6   SibSp_1     889 non-null    int64  
 7   SibSp_2     889 non-null    int64  
 8   SibSp_3     889 non-null    int64  
 9   SibSp_4     889 non-null    int64  
 10  SibSp_5     889 non-null    int64  
 11  SibSp_8     889 non-null    int64  
 12  Parch_1     889 non-null    int64  
 13  Parch_2     889 non-null    int64  
 14  Parch_3     889 non-null    int64  
 15  Parch_4     889 non-null    int64  
 16  Parch_5     889 non-null    int64  
 17  Parch_6     889 non-null    int64  
 18  Embarked_Q  889 non-null    int64  
 19  Embarked_S  889 non-null    int64 

In [9]:
# In[0.7] Splitting the Data with random_state = 42
# 20% of the rows are held out for evaluation; the fixed seed keeps the split repeatable
X_train, x_test, Y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

# Checking the size of the DataFrames (training parts first, then held-out parts)
for split_part in (X_train, Y_train, x_test, y_test):
    split_part.info()


<class 'pandas.core.frame.DataFrame'>
Index: 711 entries, 708 to 103
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         711 non-null    float64
 1   Fare        711 non-null    float64
 2   Cabin       711 non-null    int64  
 3   Pclass_2    711 non-null    int64  
 4   Pclass_3    711 non-null    int64  
 5   Sex_male    711 non-null    int64  
 6   SibSp_1     711 non-null    int64  
 7   SibSp_2     711 non-null    int64  
 8   SibSp_3     711 non-null    int64  
 9   SibSp_4     711 non-null    int64  
 10  SibSp_5     711 non-null    int64  
 11  SibSp_8     711 non-null    int64  
 12  Parch_1     711 non-null    int64  
 13  Parch_2     711 non-null    int64  
 14  Parch_3     711 non-null    int64  
 15  Parch_4     711 non-null    int64  
 16  Parch_5     711 non-null    int64  
 17  Parch_6     711 non-null    int64  
 18  Embarked_Q  711 non-null    int64  
 19  Embarked_S  711 non-null    int6

In [10]:
# In[0.8] Creating the Model with Random Forest

# Baseline forest: 100 decision trees, seeded for repeatable results
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit on the training part of the split
rf_classifier.fit(X=X_train, y=Y_train)

In [11]:
# In[0.9] Making Predictions

# Predict on the held-out rows and report the share of correct predictions
y_pred = rf_classifier.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 76.97%


In [13]:
# In[1.0] Checking Feature Importance

# Pair each training column with the importance reported by the fitted forest
importances = rf_classifier.feature_importances_
df_importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances}
)

# Keep only the features whose importance is greater than 0.01
is_important = df_importances['Importance'] > 0.01
df_importances_n = df_importances.loc[is_important]

# Names of the retained features; shown as the cell output
features_name = df_importances_n['Feature'].tolist()
features_name

['Age',
 'Fare',
 'Cabin',
 'Pclass_2',
 'Pclass_3',
 'Sex_male',
 'SibSp_1',
 'Parch_1',
 'Parch_2',
 'Embarked_Q',
 'Embarked_S']

In [14]:
# In[1.1] Rebuilding the Model

# Restrict the feature matrix to the features selected by importance
train_x = df_train_data.loc[:, features_name]
train_x.info()

# The target is still the 'Survived' column
train_y = df_train_data['Survived']
train_y.info()

# Same split settings as before (20% held out, seed 42), now on the reduced feature set
X_train_n, x_test_n, Y_train_n, y_test_n = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

# Checking the size of the DataFrames (training parts first, then held-out parts)
for split_part_n in (X_train_n, Y_train_n, x_test_n, y_test_n):
    split_part_n.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         889 non-null    float64
 1   Fare        889 non-null    float64
 2   Cabin       889 non-null    int64  
 3   Pclass_2    889 non-null    int64  
 4   Pclass_3    889 non-null    int64  
 5   Sex_male    889 non-null    int64  
 6   SibSp_1     889 non-null    int64  
 7   Parch_1     889 non-null    int64  
 8   Parch_2     889 non-null    int64  
 9   Embarked_Q  889 non-null    int64  
 10  Embarked_S  889 non-null    int64  
dtypes: float64(2), int64(9)
memory usage: 83.3 KB
<class 'pandas.core.series.Series'>
Index: 889 entries, 0 to 890
Series name: Survived
Non-Null Count  Dtype
--------------  -----
889 non-null    int64
dtypes: int64(1)
memory usage: 13.9 KB
<class 'pandas.core.frame.DataFrame'>
Index: 711 entries, 708 to 103
Data columns (total 11 columns):
 #   Column      Non-N

In [16]:
# In[1.2] Rebuilding the Model
# Second forest: 1000 decision trees with explicit depth, split-size,
# leaf-size, and per-split feature settings; seeded for repeatable results
rf_params_n = {
    'n_estimators': 1000,
    'max_depth': 200,
    'min_samples_split': 40,
    'min_samples_leaf': 20,
    'max_features': 'sqrt',
    'random_state': 42,
}
rf_classifier_n = RandomForestClassifier(**rf_params_n)

In [17]:
# In[1.3] Rebuilding the Model
# Fit on the reduced training split, then predict the held-out rows
# (fit returns the estimator itself, so the two calls can be chained)
y_pred_n = rf_classifier_n.fit(X_train_n, Y_train_n).predict(x_test_n)

# Share of held-out rows predicted correctly
accuracy_n = accuracy_score(y_true=y_test_n, y_pred=y_pred_n)
print(f'Accuracy: {accuracy_n * 100:.2f}%')

Accuracy: 81.46%


In [21]:
# In[1.4] Applying the Model to Unseen Data

# Read the test CSV from the Kaggle input directory and print its
# column summary (dtypes and non-null counts).
test_csv_path = '/kaggle/input/titanic/test.csv'
df_test_data = pd.read_csv(test_csv_path)
df_test_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [23]:
# In[1.5] Handling Missing Data

# Encode Cabin as a 1/0 indicator (1 = a cabin value is present, 0 = missing)
df_test_data['Cabin'] = df_test_data['Cabin'].notna().astype(int)

# Fill missing Age and Fare values with the means of the training data, so the
# test set is imputed with statistics taken from the data the model was fitted
# on rather than from the test set itself.
# (df_train_data['Age'] has already been imputed by this point, so this is the
# mean of the imputed training column.)
df_test_data['Age'] = df_test_data['Age'].fillna(df_train_data['Age'].mean()).astype(float)
df_test_data['Fare'] = df_test_data['Fare'].fillna(df_train_data['Fare'].mean()).astype(float)

# Rows are not dropped from the test set: dropping would leave those
# passengers without a prediction. A missing Embarked value is filled with the
# most frequent value of the column instead (the training frame no longer has
# the raw Embarked column at this point, so the test column's mode is used).
most_common_embarked = df_test_data['Embarked'].mode().iloc[0]
df_test_data['Embarked'] = df_test_data['Embarked'].fillna(most_common_embarked)

df_test_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        418 non-null    int64  
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 36.0+ KB


In [24]:
# In[1.6] Transforming Features into Dummies

# One-hot encode the same categorical columns as for the training data.
# drop_first=True omits the first level of each column, and dtype=int
# produces integer 0/1 indicators.
# NOTE(review): the dummy columns created here depend on the levels present in
# the test set, which may differ from those in the training set.
test_categorical_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

df_test_data = pd.get_dummies(
    df_test_data,
    columns=test_categorical_columns,
    drop_first=True,
    dtype=int,
)

In [27]:
# In[1.7] Selecting the Model Features and Predicting on the Test Data

# Align the test frame with the exact feature list (and column order) the
# second model was fitted on. reindex adds any selected dummy column that is
# absent from the test set as an all-zero column instead of raising a KeyError;
# columns that are not in features_name (PassengerId, Name, Ticket, unused
# dummies) are left out.
df_test_data_model = df_test_data.reindex(columns=features_name, fill_value=0)

df_test_data_model.info()

# Note: this reuses the name y_pred_n, replacing the held-out predictions
# computed earlier with the predictions for the test file.
y_pred_n = rf_classifier_n.predict(df_test_data_model)

# Creating a DataFrame to save predictions (one row per test passenger)
df_predictions = pd.DataFrame({
    'PassengerId': df_test_data['PassengerId'],
    'Survived': y_pred_n
})

df_predictions

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         418 non-null    float64
 1   Fare        418 non-null    float64
 2   Cabin       418 non-null    int64  
 3   Pclass_2    418 non-null    int64  
 4   Pclass_3    418 non-null    int64  
 5   Sex_male    418 non-null    int64  
 6   SibSp_1     418 non-null    int64  
 7   Parch_1     418 non-null    int64  
 8   Parch_2     418 non-null    int64  
 9   Embarked_Q  418 non-null    int64  
 10  Embarked_S  418 non-null    int64  
dtypes: float64(2), int64(9)
memory usage: 36.0 KB


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [28]:
# Write the predictions to a CSV file in the working directory,
# leaving out the DataFrame index column
predictions_csv_path = 'titanic_predictions.csv'
df_predictions.to_csv(predictions_csv_path, index=False)