In [None]:
import re
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
from sklearn.model_selection import train_test_split


pd.set_option('display.max_columns', None)

## Data Analysis

In [None]:
# Load the Titanic train and test splits from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')
# Keep the test IDs now: 'PassengerId' is dropped from test_df during feature
# engineering, but the submission frame built at the end needs it.
passenger_ids = test_df['PassengerId']

In [None]:
# Preview the first rows of the raw training frame.
train_df.head()

In [None]:
# Preview the first rows of the raw test frame.
test_df.head()

In [None]:
# Class balance of the target: number of rows per 'Survived' value.
train_df['Survived'].value_counts()

In [None]:
# Column dtypes and non-null counts for the training frame (shows which columns have missing values).
train_df.info()

In [None]:
# Column dtypes and non-null counts for the test frame (shows which columns have missing values).
test_df.info()

In [None]:
# Summary statistics of the numeric training columns.
train_df.describe()

In [None]:
# Summary statistics of the numeric test columns.
test_df.describe()

## Visualization

In [None]:
def hist_survived_vs_feature(feature, df=train_df, labels=None):
    """Plot passenger counts per survival outcome, coloured by ``feature``.

    Parameters
    ----------
    feature : str or array-like
        Passed to Plotly Express as ``color``: either a column name of ``df``
        or a sequence of values aligned with the rows of ``df``.
    df : pandas.DataFrame
        Frame with a 'Survived' column coded 0/1. The default is the
        ``train_df`` object that exists when this definition is executed;
        rebinding ``train_df`` later does not change the default.
    labels : dict, optional
        Display-name overrides forwarded to ``px.histogram``. Defaults to an
        empty mapping.

    Returns
    -------
    The Plotly figure, so that it renders when it is the last expression of a cell.
    """
    # A fresh dict per call avoids sharing one mutable default across calls.
    if labels is None:
        labels = {}

    # Human-readable x-axis categories instead of the raw 0/1 codes.
    survived_mapping = df['Survived'].map({0: 'Dead', 1: 'Survived'})

    fig = px.histogram(df, x=survived_mapping, width=800, color=feature, labels=labels)
    fig.update_layout(
        bargap=0.2,
        xaxis_title_text='Survived',
        yaxis_title_text='Survived count'
    )

    return fig

# Survival counts split by passenger class.
hist_survived_vs_feature('Pclass')

In [None]:
# Survival counts split by sex.
hist_survived_vs_feature('Sex')

In [None]:
# Age distribution, one overlaid histogram per 'Survived' value.
fig = px.histogram(
    data_frame=train_df,
    x='Age',
    color='Survived',
    barmode='overlay',
)
fig

In [None]:
# Survival counts split by the 'SibSp' column.
hist_survived_vs_feature('SibSp')

In [None]:
# Survival counts split by the 'Parch' column.
hist_survived_vs_feature('Parch')

In [None]:
# Fare distribution, one overlaid histogram per 'Survived' value.
fig = px.histogram(
    data_frame=train_df,
    x='Fare',
    color='Survived',
    barmode='overlay',
)
fig

In [None]:
# Only rows with a recorded cabin can contribute a cabin initial.
df = train_df.dropna(subset=['Cabin'])
# First character of each cabin string.
cabin_initials = df['Cabin'].str[0]

# A Series is passed as the colour dimension, so Plotly's default legend
# title is 'color'; relabel it to 'cabin'.
hist_survived_vs_feature(cabin_initials, df=df, labels={'color': 'cabin'})

In [None]:
# Survival counts split by embarkation port, ignoring rows with no port recorded.
df = train_df.dropna(subset=['Embarked'])
hist_survived_vs_feature('Embarked', df=df)

## Feature Engineering

In [None]:
# Raw columns available before any feature engineering.
train_df.columns

In [None]:
# Both frames get the same feature engineering; the loops below mutate them in place.
dataset = [train_df, test_df]

# Preprocessing feature 'Name'

# Pull the honorific out of each name: the first run of letters that follows a
# space and is terminated by a period. str.extract is vectorised and yields
# NaN (instead of raising AttributeError) for a name that has no such token.
for df in dataset:
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Distinct titles in the training split, most frequent first.
train_df['Title'].value_counts().index

In [None]:
# Distinct titles in the test split, most frequent first (compare with the training titles).
test_df['Title'].value_counts().index

In [None]:
# Numeric code per title: the four frequent titles get their own code and
# every other listed title shares the catch-all code 5.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 5, 'Col': 5,
                 'Major': 5, 'Mlle': 5, 'Ms': 5, 'Countess': 5, 'Lady': 5, 'Capt': 5,
                 'Jonkheer': 5, 'Don': 5, 'Sir': 5, 'Mme': 5}
rare_title_code = 5

# Series.map turns any title missing from the dict into NaN, which would make
# the column float and leave that row without any Title_* dummy later on.
# Send such titles to the catch-all code and keep the column integer so both
# splits produce identically named dummy columns.
# NOTE: this cell is not idempotent; re-run from the data-loading cell.
for df in dataset:
    df['Title'] = df['Title'].map(title_mapping).fillna(rare_title_code).astype(int)

train_df.head()

In [None]:
def concat_dummies(feature_name):
    """Append one-hot columns for ``feature_name`` to the train and test frames.

    Reads the module-level ``train_df`` and ``test_df``, builds a new frame
    for each with ``<feature_name>_<value>`` indicator columns appended, and
    rebinds the module-level ``dataset`` list to the two new frames. The
    original frames are not modified.

    Returns
    -------
    tuple
        ``(new_train_df, new_test_df)``.
    """
    global dataset

    def _with_indicators(frame):
        # Indicator columns are derived from this frame's own values only.
        indicators = pd.get_dummies(frame[feature_name], prefix=feature_name)
        return pd.concat([frame, indicators], axis=1)

    new_train_df = _with_indicators(train_df)
    new_test_df = _with_indicators(test_df)

    # Keep `dataset` pointing at the frames the caller is about to adopt.
    dataset = [new_train_df, new_test_df]

    return new_train_df, new_test_df

In [None]:
# Adopt the frames that carry the one-hot Title_* columns; concat_dummies has
# already repointed `dataset` at these same two objects.
train_df, test_df = concat_dummies('Title')

In [None]:
# Check that the Title_* indicator columns were appended.
train_df.head()

In [None]:
# Preprocessing feature 'Sex'

# Binary encoding; Series.map turns any value not in the dict into NaN.
sex_mapping = {'male': 0, 'female': 1}

# Mutates the frames in `dataset` in place, so re-running this cell would map
# the already-encoded 0/1 values to NaN.
for df in dataset:
    df['Sex'] = df['Sex'].map(sex_mapping)
    
train_df.head()

Age groups (in years): 0–5, 6–11, 12–17, 18–25, 26–47, 48–61, 62–80.

In [None]:
# Preprocessing feature 'Age'

# Bin edges in years; pd.cut intervals are right-closed, giving seven groups
# labelled 1..7.
age_bins = [0, 5.99, 11.9, 17.9, 25.9, 47.9, 61.9, 80]
age_labels = list(range(1, 8))

for df in dataset:
    # Impute a missing age with the median age of passengers sharing the title.
    title_median_age = df.groupby('Title')['Age'].transform('median')
    # A title group with no known ages has a NaN median; fall back to the
    # frame-wide median so no row is left without an age group.
    df['Age'] = df['Age'].fillna(title_median_age).fillna(df['Age'].median())
    # include_lowest=True keeps an age of exactly 0 inside the first bin.
    df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, include_lowest=True)

train_df.head()

In [None]:
# Adopt the frames that carry the one-hot AgeGroup_* columns (`dataset` is repointed by the helper).
train_df, test_df = concat_dummies('AgeGroup')

train_df.head()

In [None]:
# Preprocessing features 'Parch' and 'SibSp'

def scale_feature(feature):
    """Min-max scale ``feature`` in every frame of the module-level ``dataset``.

    The minimum and range are taken from the first frame (the training split)
    and reused for every other frame, so the same raw value maps to the same
    scaled value in both splits. Values in later frames that lie outside the
    training range therefore scale to below 0 or above 1.

    Parameters
    ----------
    feature : str
        Name of a numeric column present in every frame of ``dataset``.

    Returns
    -------
    list of pandas.Series
        One scaled Series per frame, in ``dataset`` order. The frames
        themselves are not modified.
    """
    reference = dataset[0][feature]
    min_val = reference.min()
    value_range = reference.max() - min_val
    # A constant training column would otherwise divide by zero; with a unit
    # range it scales to all zeros instead of NaN.
    if value_range == 0:
        value_range = 1

    return [(df[feature] - min_val) / value_range for df in dataset]

# scale_feature returns one scaled Series per frame in `dataset`, i.e. in [train, test] order.
train_df['SibSp'], test_df['SibSp'] = scale_feature('SibSp')
train_df['Parch'], test_df['Parch'] = scale_feature('Parch')

In [None]:
# Check the scaled 'SibSp' and 'Parch' columns.
train_df.head()

In [None]:
# Preprocessing feature 'Fare'

# Missing test fares are filled with the median fare of the same passenger
# class (computed on the test split); the training column is not filled here.
pclass_median_fare = test_df.groupby('Pclass')['Fare'].transform('median')
test_df['Fare'] = test_df['Fare'].fillna(pclass_median_fare)

# Scale after imputation so the filled values are scaled too.
train_df['Fare'], test_df['Fare'] = scale_feature('Fare')

train_df.head()

In [None]:
# Preprocessing feature 'Embarked'

# Visualizing the count of passenger's embarkment across different classes using bar chart


df = train_df[train_df['Embarked'].notnull()]
# Rows are passenger classes, columns are embarkation ports. A class/port
# pair with no passengers becomes 0 instead of being absent, so every bar
# trace has one value per class.
class_count = (
    df.groupby(['Pclass', 'Embarked'])['Embarked']
    .count()
    .unstack(fill_value=0)
)

p_class = class_count.index.tolist()
fig = go.Figure()
# One stacked bar trace per port, in column order.
for port in class_count.columns:
    fig.add_trace(go.Bar(x=p_class, y=class_count[port].tolist(), name=port))
fig.update_layout(
    barmode='stack',
    xaxis_title_text='Passenger class',
    yaxis_title_text='Embarked station count'
)
fig.show()

# Getting the same figure using histogram

fig = px.histogram(df, x='Pclass', color='Embarked')
fig.update_layout(
    bargap=0.2,
    xaxis_title_text='Passenger class',
    yaxis_title_text='Embarked station count'
)

In [None]:
# Fill missing embarkation ports in the training frame with 'S' before one-hot encoding.
# NOTE(review): 'S' is presumably the most frequent port; confirm against the chart above.
# test_df['Embarked'] is not filled here; this assumes the test split has no missing ports (TODO confirm).
train_df['Embarked'] = train_df['Embarked'].fillna('S')
train_df, test_df = concat_dummies('Embarked')

train_df.head()

In [None]:
# Dropping unwanted columns
 
# Identifiers, free-text fields, and raw columns that are already represented
# by their one-hot encodings.
unwanted_columns = ['PassengerId', 'Name', 'Age', 'Ticket', 'Cabin', 'Embarked',
                    'Title', 'AgeGroup']

# drop() returns new frames, so `dataset` still references the pre-drop
# frames after this cell.
train_df = train_df.drop(columns=unwanted_columns)
test_df = test_df.drop(columns=unwanted_columns)

In [None]:
# Final training frame: 'Survived' followed by the model features.
train_df.head()

In [None]:
# Final test frame: model features only.
test_df.head()

## Modelling

In [None]:
# Separate the target by name rather than by column position, and cast the
# features to one float dtype: indicator columns from get_dummies can be
# bool, and a mixed bool/float frame converts to an object array that Keras
# cannot turn into a tensor.
y = train_df['Survived'].values
X = train_df.drop(columns=['Survived']).to_numpy(dtype='float32')
X_test = test_df.to_numpy(dtype='float32')

# Features are matched between the splits purely by position, so at least the
# widths must agree.
assert X.shape[1] == X_test.shape[1], (
    f'train has {X.shape[1]} feature columns but test has {X_test.shape[1]}'
)

# Hold out 20% of the training rows for validation; fixed seed for a repeatable split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Report the train/validation split sizes. These are the validation arrays,
# not the Kaggle test set, so they are labelled as such.
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_val: {X_val.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_val: {y_val.shape}')

In [None]:
# Take the input width from the data so the network keeps matching the
# feature matrix when the engineered columns change.
n_features = X_train.shape[1]

# Small fully connected binary classifier; the sigmoid output is the
# predicted survival probability.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(10, input_shape=[n_features], activation='relu'))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(20, activation='relu'))
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

In [None]:
# batch_size=1 means one weight update per training row; validation metrics
# are evaluated on the held-out split after every epoch.
history = model.fit(X_train, y_train, epochs=10, batch_size=1, validation_data=(X_val, y_val))

In [None]:
# Predicted survival probabilities, one row per test passenger.
prediction = model.predict(X_test)
# Threshold at 0.5 and flatten to a 1-D vector of 0/1 labels.
rounded_prediction = (prediction >= 0.5).astype(int).ravel()

# Pair the labels with the IDs saved before 'PassengerId' was dropped.
submission_columns = {'PassengerId': passenger_ids, 'Survived': rounded_prediction}
submission_df = pd.DataFrame(submission_columns)

submission_df.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)