In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for path in (
    os.path.join(root, entry)
    for root, _dirs, entries in os.walk('/kaggle/input')
    for entry in entries
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Checking the Training Set

In [None]:
# Read the training split from the Kaggle Titanic input directory.
train_csv_path = "/kaggle/input/titanic/train.csv"
df_train = pd.read_csv(train_csv_path)

# Display the row(s) whose PassengerId equals 892.
# NOTE(review): in the Kaggle Titanic data, ID 892 appears to belong to
# test.csv rather than train.csv — confirm this lookup is intended.
df_train.loc[df_train["PassengerId"].eq(892)]

In [None]:
# Replace missing ages with the column mean.
age_mean = df_train["Age"].mean()
df_train["Age"] = df_train["Age"].fillna(age_mean)

# Replace missing embarkation ports with the most frequent value.
embarked_mode = df_train["Embarked"].mode().iloc[0]
df_train["Embarked"] = df_train["Embarked"].fillna(embarked_mode)

In [None]:
# Frequency of each embarkation port after imputation.
embarked_counts = df_train["Embarked"].value_counts()
embarked_counts

#### Understanding the shape

In [None]:
# Dimensions of the training frame as (rows, columns).
train_shape = df_train.shape
train_shape

In [None]:
# Number of non-null Survived entries per sex (i.e. passengers per sex).
passengers_by_sex = df_train.groupby("Sex")["Survived"].count()
passengers_by_sex

In [None]:
# Keep only the rows with Survived == 1, then count them per sex.
survivors = df_train.loc[df_train["Survived"].eq(1)]
gpr_ = survivors.groupby("Sex")["Survived"].count()
gpr_


In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

#### Dropping Columns

In [None]:
# Columns removed from the training frame before modelling.
# (`columns=` already selects the column axis, so no `axis` argument is needed.)
columns_to_drop = ["PassengerId", "Parch", "Cabin", "SibSp"]
df_train = df_train.drop(columns=columns_to_drop)
df_train

##### Checking for Null values

#### Importing necessary Libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score
from sklearn import model_selection

#### Preprocessing


In [None]:
# Categorical columns to expand into one indicator column per category.
cat_var = ["Sex", "Embarked"]

# Dense output; categories unseen at fit time are encoded as all zeros.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(df_train[cat_var])
encoded_data = encoder.transform(df_train[cat_var])

# Wrap the encoded array in a frame aligned to the training index.
encoded_df = pd.DataFrame(
    encoded_data,
    index=df_train.index,
    columns=encoder.get_feature_names_out(cat_var),
)

# Swap the raw categorical columns for their indicator columns.
remaining_columns = df_train.drop(columns=cat_var)
df_train = pd.concat([remaining_columns, encoded_df], axis=1)

#### Standardizing the numeric columns

In [None]:
# Standardize the two continuous columns in place.
# NOTE(review): the scaler is fit on the whole training frame, before the
# hold-out split performed later — confirm that is acceptable.
numeric_cols = ["Age", "Fare"]
scaler = StandardScaler()
scaler.fit(df_train[numeric_cols])
df_train[numeric_cols] = scaler.transform(df_train[numeric_cols])


#### Initializing several models in a list

In [None]:
# (label, estimator) pairs compared in the evaluation loop.
# The tree and the forest are randomized estimators; a fixed random_state
# makes the comparison reproducible across Restart & Run All.
models = [
        ("Logistic Regression:", LogisticRegression(class_weight= 'balanced')),
        ("Trees:", DecisionTreeClassifier(random_state = 42)),
        ("SVM:", SVC(kernel= "linear")),
        ("RF:", RandomForestClassifier(class_weight= 'balanced', n_estimators = 100, random_state = 42))
    ]

In [None]:
# Class balance of the target column.
survived_counts = df_train["Survived"].value_counts()
survived_counts

In [None]:
# Features: every column except the target and the Name / Ticket columns.
x = df_train.drop(columns = ["Name", "Survived", "Ticket"])
y = df_train["Survived"]

# The hold-out split and the CV splitter do not depend on the model,
# so they are built once instead of on every loop iteration.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state = 42)
kfold = model_selection.KFold(n_splits = 5)

results = []  # per-model arrays of cross-validation scores
names = []    # model labels, in the same order as `results`
for name, model in models:
    print("Evaluating model, ", name)
    cv_res = model_selection.cross_val_score(model, x_train, y_train, cv = kfold, scoring = 'accuracy')
    results.append(cv_res)
    names.append(name)
    print(f"Cross Val accuracy: {cv_res.mean():.2f}  {cv_res.std():.2f}")
    # Refit on the full training split and score on the hold-out split.
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    accuracy = accuracy_score(y_test, preds)
    # Printed inside the loop so every model reports its own hold-out
    # accuracy (previously only the last model's value was shown).
    print("The Test set acccuracy of the model: ", accuracy)


In [None]:
# RandomForest model performs best
# Re-evaluate it with 10-fold cross-validation on the training split, then
# fit it on that split and compare train vs. hold-out accuracy.
kfold = model_selection.KFold(n_splits = 10)
rf  = RandomForestClassifier(n_estimators= 100, class_weight = 'balanced', random_state= 42)
cv_res = model_selection.cross_val_score(rf, x_train, y_train, cv = kfold, scoring = 'accuracy')
# Report the cross-validation result instead of discarding it.
print(f"Cross Val accuracy: {cv_res.mean():.2f}  {cv_res.std():.2f}")
rf.fit(x_train, y_train)
preds = rf.predict(x_test)
train = rf.score(x_train, y_train)
print(f"The score for the train set: {train:.2f}")
print(f"The accuracy for the validation set: {accuracy_score(y_test, preds):.2f}")

#### Handling the Test Set

In [None]:
# Read the Kaggle test split and preview its first rows.
test_csv_path = "/kaggle/input/titanic/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

In [None]:
# Fill missing values in the columns the model consumes.
# NOTE(review): the fill values come from the test data itself rather than
# from the training data — confirm this is intended.
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].mean())
test_data["Embarked"] = test_data["Embarked"].fillna(test_data["Embarked"].mode()[0])
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].mean())

# Kept as the last expression so the notebook actually displays the
# remaining null counts, and so the check runs after every fill.
test_data.isnull().sum()

In [None]:
cat_var = ["Sex", "Embarked"]
# Reuse the encoder already fitted on the training data (do not refit):
# this guarantees the same indicator columns, in the same order, as the
# model was trained on. Categories unseen in training become all zeros
# because the encoder was created with handle_unknown='ignore'.
encoded_data = encoder.transform(test_data[cat_var])
encoded_df = pd.DataFrame(encoded_data, columns = encoder.get_feature_names_out(cat_var), index = test_data.index)
test_data = pd.concat([test_data.drop(columns = cat_var), encoded_df], axis= 1)
# Preview only; the full frame is not needed to verify the new columns.
test_data.head()

In [None]:
# Columns fed to the model, in the order used for training.
features = ["Pclass","Age", "Fare","Sex_female","Sex_male","Embarked_C","Embarked_Q", "Embarked_S"]

# The forest was trained on standardized Age and Fare, so the test values
# must go through the same fitted scaler before prediction. Work on a copy
# so re-running this cell does not scale the columns twice.
test_features = test_data[features].copy()
test_features[["Age", "Fare"]] = scaler.transform(test_features[["Age", "Fare"]])

rf_pred = rf.predict(test_features)

# Submission file: one predicted Survived value per PassengerId.
output_data = pd.DataFrame(
    {
        "PassengerId" : test_data.PassengerId,
        "Survived" : rf_pred
    }
)
output_data.to_csv("Submission.csv", index = False)
print("Submitted successfully")