In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Competition Overview: The competition is simple: use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.
About my project:
- We observed that several features in this project had missing values, so we handled the missing data by imputing the mean age, among other techniques. Preprocessing the data is an essential step. Using one-hot encoding, we transformed categorical variables (such as "Sex" and "Embarked") into numerical format.
- We selected 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked' as the subset of features that would be most useful to predict survival.
- The machine learning model that we employed was the Random Forest Classifier. For classification tasks, this approach is both robust and adaptable. To maximize the performance of the model, hyperparameter tuning is essential, and there are many other approaches left to investigate.
- We evaluated the model's performance using accuracy as a metric. Achieving 100% accuracy is often a sign of overfitting, where the model fits the training data too closely and may not generalize well to new data. It's important to strike a balance between model complexity and generalization.

# **Step 1: Import Necessary Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read the Titanic competition CSVs from the Kaggle input directory:
# train.csv -> train_data, test.csv -> test_data.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

**Step 2: Data Exploration and Preprocessing**

In [None]:
# Explore the data: preview the first rows of the training set.
# Leaving the expression as the last line of the cell lets Jupyter render the
# DataFrame as a formatted table instead of the plain-text dump print() gives.
train_data.head()

In [None]:
# Summarise column dtypes and non-null counts of the training set.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
train_data.info()

**Step 3: Cleaning and Preprocessing the data**

In [None]:
# Handle missing data in the training set:
#   - 'Age'      -> filled with the column mean
#   - 'Embarked' -> filled with the most frequent value (first mode)
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection. That chained in-place form acts
# on an intermediate object: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled it leaves train_data unchanged.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

In [None]:
# One-hot encode the categorical 'Sex' and 'Embarked' columns of the training
# set; the original columns are replaced by their indicator columns.
train_data = pd.get_dummies(
    data=train_data,
    columns=['Sex', 'Embarked'],
)

In [None]:
# Feature selection: the columns fed to the model, in a fixed order.
features = [
    # passenger attributes used as-is
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
    # indicator columns expected from one-hot encoding 'Sex'
    'Sex_female', 'Sex_male',
    # indicator columns expected from one-hot encoding 'Embarked'
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]

**Step 4: Splitting the dataset into training and validation sets**

In [None]:
# Build the feature matrix (X) and the 'Survived' target (y), then hold out
# 20% of the rows as a validation set. The fixed random_state keeps the split
# the same on every run.
X = train_data[features]
y = train_data['Survived']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Step 5: Training the Model**

In [None]:
# Build a 100-tree random forest (seeded so runs are repeatable) and fit it on
# the training split.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict the held-out validation split and report accuracy as a percentage.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy: {:.2f}%".format(accuracy * 100))


**Step 6: Model Testing and Prediction**

In [None]:
# Preprocess the test data: fill missing 'Age' and 'Fare' values with the mean
# of the respective test-set column.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection. That chained in-place form acts
# on an intermediate object: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled it leaves test_data unchanged.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

In [None]:
# One-hot encode the categorical 'Sex' and 'Embarked' columns of the test set;
# the original columns are replaced by their indicator columns.
test_data = pd.get_dummies(
    data=test_data,
    columns=['Sex', 'Embarked'],
)

In [None]:
# Select the model's feature columns from the test data, in the same order the
# model was trained on.
# reindex() is used instead of test_data[features] so that an indicator column
# missing from the test set (a category that never occurs there) is created and
# filled with 0 rather than raising a KeyError. When every column is present
# the result is the same selection as before.
X_test = test_data.reindex(columns=features, fill_value=0)

In [None]:
# Use the fitted model to predict a 'Survived' label for every row of X_test.
test_predictions = model.predict(X_test)


**Step 7: Prepare the Submission File**

In [None]:
# Pair each test passenger's id with its predicted 'Survived' label and write
# the two-column table to submission.csv without the index column.
submission = test_data[['PassengerId']].assign(Survived=test_predictions)
submission.to_csv('submission.csv', index=False)