In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic training and test CSV files from the Kaggle input directory.
trainData, testData = [
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
]

* Read Train & Test Data Files

In [None]:
# Show pandas' summary statistics for the training data.
trainData.describe()

In [None]:
# Preview the first rows of the training data.
trainData.head()

# **1. Visualizing, Analyzing and Investigating the Data**

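* As a conclusion from the 5 diagrams below, we find that:
    1. Survivors were far fewer than the passengers who did not survive.
    2. Female survivors far outnumbered male survivors.
    3. The third diagram merges diagrams 1 and 2 (survival counts split by Sex) for better visualization.
    4. Pclass 3 had the lowest survival rate.
    5. The fifth diagram merges diagrams 1 and 4 (survival counts split by Pclass) for better visualization.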
    

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Number of passengers per Survived value, drawn on a 5x5-inch figure.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=trainData, x='Survived', ax=ax)
ax.set_title('Survivals Diagram')
plt.show()

In [None]:
# Bar height is seaborn's default estimate (the mean) of Survived for each Sex;
# ci=None turns the error bars off.
fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=trainData, x='Sex', y='Survived', ci=None, ax=ax)
ax.set_title('Survivals based on Sex Diagram')
plt.show()

In [None]:
# Count passengers per (Survived, Sex) pair, pivot Sex into columns and draw
# one stacked bar per Survived value.
# `stacked` expects a boolean: the original string "True" only worked because
# any non-empty string is truthy (the string "False" would have stacked too).
grouped = trainData.groupby('Survived')['Sex']
grouped.value_counts().unstack().plot(
    kind="bar",
    stacked=True,
    title='Survival counts split by Sex',
)

In [None]:
# Bar height is seaborn's default estimate (the mean) of Survived for each
# Pclass; ci=None turns the error bars off.
fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=trainData, x='Pclass', y='Survived', ci=None, ax=ax)
ax.set_title('Survivals based on Pclass Diagram')
plt.show()

In [None]:
# Count passengers per (Survived, Pclass) pair, pivot Pclass into columns and
# draw one stacked bar per Survived value.
# `stacked` expects a boolean: the original string "True" only worked because
# any non-empty string is truthy (the string "False" would have stacked too).
grouped = trainData.groupby('Survived')['Pclass']
grouped.value_counts().unstack().plot(
    kind="bar",
    stacked=True,
    title='Survival counts split by Pclass',
)

# **2. Preprocessing the Data**

* Check for Null values in train and test data.

In [None]:
# Number of missing (NaN) values in each column of the training data.
trainData.isna().sum()

In [None]:
# Number of missing (NaN) values in each column of the test data.
testData.isna().sum()

**Handling NaN Column Cells**
1. First: Drop Cabin Column as most of the values are lacking from trainData and testData (around 76% of values NaN).
2. Second: Drop the rows having NaN cells in Embarked Column from trainData as they are only 2, not so effective.
3. Third: Deal with the Fare empty cell in the testData.
4. Fourth: Deal with the missing "Age" Column.

In [None]:
# Drop the Cabin column from the train and test data.
# The frames are re-assigned instead of modified with inplace=True, and
# errors='ignore' tolerates an already-missing column, so this cell can be
# re-run safely (the original raised KeyError on a second run because Cabin
# had already been removed).
trainData = trainData.drop(columns=['Cabin'], errors='ignore')
testData = testData.drop(columns=['Cabin'], errors='ignore')

# Drop the training rows whose Embarked value is NaN.
trainData = trainData.dropna(subset=['Embarked'])

In [None]:

# Summary statistics of the training data after the Cabin column and the
# NaN-Embarked rows were dropped.
trainData.describe()

1. The empty Fare cell found in the test data has to be filled.
2. One option is to group the passengers by Pclass, as each class has a different fare, and use the median fare of each class.
3. Below, the Fare distribution is plotted first, and the NaN value is then set to the mode of the whole Fare column.

In [None]:
# Histogram of the Fare column of the test data on a 10x8-inch figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(testData['Fare'], ax=ax)

* Visualizing the Diagram above, we find that using the Mode of values could be a good option to replace NaN Values of Fare Column.

In [None]:
# Fill the missing Fare values of the test data with the column's mode
# (the first entry of the Series returned by mode()).
fare_fill = testData['Fare'].mode()[0]
testData['Fare'] = testData['Fare'].fillna(fare_fill)

* In the below cells, we check the NaN values after removing those in the Cabin, Embarked and Fare Columns.

In [None]:
# Remaining missing-value count per column of the training data.
trainData.isna().sum()

In [None]:
# Remaining missing-value count per column of the test data.
testData.isna().sum()

**The NaN values in the Age column need to be filled as accurately as possible, as I believe Age is an important feature to be considered.**
* First: Plot the histogram of the ages to decide which imputation method is best to apply

In [None]:
# Histogram of the Age column of the training data on a 10x8-inch figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(trainData['Age'], ax=ax)

* Visualizing the diagram above,
* Since the Age distribution is positively skewed, imputing the NaN values with the mode or the median would possibly give the best values.
* In my model, I used mode imputation for the missing values.

In [None]:
# Mode imputation for the missing ages.
# The fill value is computed once, from the training data, and applied to
# both frames so that the test data is pre-processed exactly like the data the
# model is trained on. (The original filled the test data with the test set's
# own mode, which can differ from the value used for the training data.)
age_mode = trainData['Age'].mode()[0]
trainData['Age'] = trainData['Age'].fillna(age_mode)
testData['Age'] = testData['Age'].fillna(age_mode)

* Testing our preprocessing on the Train Data

In [None]:
# Missing-value count per column of the training data after the Age fill.
trainData.isna().sum()

In [None]:
# Missing-value count per column of the test data after the Age fill.
testData.isna().sum()

# 3. Model Building & Evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# List the column labels of the training data.
trainData.columns

**Using get_dummies to convert the categorical values of the "Sex" column to 0's and 1's.**

In [None]:
# Target: the Survived column of the training data.
y = trainData["Survived"]

# Create X: the selected feature columns, with the categorical "Sex" column
# one-hot encoded by get_dummies.
features = ['Pclass', 'Sex','Fare', 'Age']
X = pd.get_dummies(trainData[features])

# Encode the test data the same way, then align it to the training columns.
# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding the two frames independently could otherwise produce
# different columns (or a different column order) for training and prediction.
# Columns missing from the test data are added and filled with 0.
test_X = pd.get_dummies(testData[features]).reindex(columns=X.columns, fill_value=0)

**Random Initialization for the Model**

In [None]:
# Hold out 20% of the rows for evaluation; random_state=20 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

# Random forest with 50 trees of depth at most 5 and a fixed random_state.
randomForest_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    random_state=50,
)

# Fit on the training part.
randomForest_model.fit(X_train, y_train)

# Model accuracy on the held-out part.
randomForest_model.score(X_test, y_test)

**Using the Grid Search Cross Validation Technique to Improve the Model.**

In [None]:
from sklearn.model_selection import GridSearchCV

**Hyperparameters Tuning**
* Setting Parameters for the Grid Search to Try.
* Applying Cross Validation to avoid overfitting.

In [None]:
# Candidate hyperparameter values for the grid search, keyed by parameter name.
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[2, 5, 7, 9, 11],
)


In [None]:
from sklearn.model_selection import GridSearchCV
# 5 folds for the cross validation (cv=5), searching over param_grid.
grid_clf = GridSearchCV(estimator=randomForest_model, param_grid=param_grid, cv=5)

# The search is fitted on the full X and y; fit() returns the search object
# itself, so validated_model and grid_clf refer to the same fitted object.
grid_clf.fit(X, y)
validated_model = grid_clf

In [None]:
# Parameter combination selected by the grid search.
validated_model.best_params_

In [None]:
# Show the estimator that the grid search reports as best.
validated_model.best_estimator_

In [None]:
# Show the score that the grid search reports for its best parameter combination.
validated_model.best_score_

In [None]:
# Model Prediction: predict Survived for the encoded test features using the
# fitted grid-search object.
model_predictions = validated_model.predict(test_X)

In [None]:
# Pair every test PassengerId with its predicted Survived value and write the
# table, without the index, to the current working directory.
submission_output = pd.DataFrame(
    {
        'PassengerId': testData['PassengerId'],
        'Survived': model_predictions,
    }
)
submission_output.to_csv('my_final_submission.csv', index=False)
print("Submission Output File saved Successfully")