In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the Titanic training and test CSV files into two DataFrames.
train, test = map(
    pd.read_csv,
    ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv"),
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Keep both frames in one list so the same feature engineering can be applied to each in a loop.
full_data = [train, test]

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric columns.
train.describe(include='all')

# Data Analysis
<h2>Pclass</h2>

In [None]:
# Mean of Survived (i.e. survival rate) for each passenger class.
train.groupby("Pclass")[["Survived"]].mean()

<p>Passengers in Pclass 1 have the highest survival rate, i.e. upper-class passengers were more likely to survive than the others.</p>
<h2>Sex</h2>

In [None]:
# Mean of Survived (i.e. survival rate) for each sex.
train.groupby("Sex")[["Survived"]].mean()

<p>Per the analysis above, females were more likely to survive than males.</p>
<h2>Embarked</h2>

In [None]:
# Mean of Survived (i.e. survival rate) for each port of embarkation.
train.groupby("Embarked")[["Survived"]].mean()

<p>Passengers who embarked at Cherbourg were the most likely to survive.</p>
<h2>SibSp and Parch</h2>
<p>Create a new feature, FamilyMembers, by adding SibSp and Parch, plus 1 to count the passenger themselves.</p>

In [None]:
# Family size on board = the passenger (1) + siblings/spouses + parents/children.
for frame in full_data:
    frame["FamilyMembers"] = 1 + frame["SibSp"] + frame["Parch"]

In [None]:
# Mean of Survived (i.e. survival rate) for each family size.
train.groupby("FamilyMembers")[["Survived"]].mean()

<h2>Name</h2>
<p>Preprocess the Name column to extract each passenger's title.</p>

In [None]:
def get_title(name):
    """Extract the honorific (e.g. ``"Mr"``) from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g.
    ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Parameters
    ----------
    name : str
        Raw value of the ``Name`` column.

    Returns
    -------
    str
        The matched title, or ``""`` when ``name`` is not a string (e.g. a
        missing value) or contains no such pattern.
    """
    # A missing value would make re.search raise TypeError and abort the whole
    # column-wise apply; treat it like "no title found" instead.
    if not isinstance(name, str):
        return ""
    # Raw string: "\." inside a normal string literal is an invalid escape sequence.
    match = re.search(r" ([A-Za-z]+)\.", name)
    return match.group(1) if match else ""

In [None]:
# Derive a Title column in both frames by running get_title over every Name.
for frame in full_data:
    frame["Title"] = frame["Name"].map(get_title)

In [None]:
# Count of each raw title, split by sex.
pd.crosstab(index=train["Title"], columns=train["Sex"])

In [None]:
# Collapse infrequent titles into "Rare", and fold Mlle/Mme into Miss and Ms into Mrs.
# NOTE(review): any title not listed here (e.g. one that occurs in only one of the
# frames) is kept as its own category - verify that train and test end up with the
# same set of titles before they are one-hot encoded.
for frame in full_data:
    frame["Title"] = (
        frame["Title"]
        .replace(["Sir", "Rev", "Capt", "Col", "Countess", "Lady", "Don", "Dr", "Major",
                  "Jonkheer"], "Rare")
        .replace(["Mlle", "Mme"], "Miss")
        .replace("Ms", "Mrs")
    )

In [None]:
# Count of each consolidated title, split by sex.
pd.crosstab(index=train["Title"], columns=train["Sex"])

In [None]:
# Mean of Survived (i.e. survival rate) for each consolidated title.
train.groupby("Title")[["Survived"]].mean()

<h2>Age</h2>

In [None]:
# Fraction of missing Age values across train and test combined.
pd.concat([train["Age"], test["Age"]]).isnull().mean()

<p>About 20% of the Age values are missing from the dataset, so we first have to fill them in.</p>

In [None]:
# Fill missing ages with random integers drawn from [mean - std, mean + std) of the
# observed ages, computed separately for each frame.
# The generator is seeded so that "Restart & Run All" reproduces the same imputed ages
# (the unseeded global np.random state gave different values on every run).
age_rng = np.random.default_rng(42)

for dataset in full_data:
    missing_age = dataset["Age"].isnull()
    if not missing_age.any():
        continue  # nothing to impute; also keeps the cell safe to re-run

    age_mean = dataset["Age"].mean()
    age_std = dataset["Age"].std()
    # Convert the float bounds to int explicitly instead of relying on how the
    # random-integer routine handles float arguments.
    low, high = int(age_mean - age_std), int(age_mean + age_std)

    dataset.loc[missing_age, "Age"] = age_rng.integers(low, high, size=int(missing_age.sum()))

In [None]:
# Bin Age into five equal-frequency (quantile) buckets; bin edges are computed per frame.
for frame in full_data:
    frame["CategoricalAge"] = pd.qcut(frame["Age"], q=5)

In [None]:
# Mean of Survived (i.e. survival rate) for each age bucket.
train.groupby("CategoricalAge")[["Survived"]].mean()

<p>Checking Data Skewness</p>

In [None]:
# Print mean and median Fare for each frame to compare them (skewness check).
for frame in full_data:
    fare = frame["Fare"]
    print(fare.mean(), fare.median())

<p>The mean is greater than the median, which shows that the Fare data is right-skewed. That is why we use the median to impute missing values.</p>

In [None]:
# Replace missing fares with the frame's own median fare, then bin Fare into five
# equal-frequency (quantile) buckets.
for frame in full_data:
    frame["Fare"] = frame["Fare"].fillna(frame["Fare"].median())
    frame["CategoricalFare"] = pd.qcut(frame["Fare"], q=5)

In [None]:
# Mean of Survived (i.e. survival rate) for each fare bucket; the bucket is kept as a regular column.
train[["CategoricalFare", "Survived"]].groupby(["CategoricalFare"], as_index=False).mean()

<h2>Data Cleaning</h2>
<p>Converting categorical data to numerical value</p>

In [None]:
# Inspect the column dtypes to see which columns still need a numeric encoding.
train.dtypes

In [None]:
# Cast Sex to a categorical dtype, then replace every categorical column
# (Sex plus any other column that already has the category dtype) by its integer codes.
for frame in full_data:
    frame["Sex"] = frame["Sex"].astype("category")
    for column in frame.select_dtypes(include=["category"]).columns:
        frame[column] = frame[column].cat.codes

In [None]:
# Check the Sex column after encoding.
train.loc[:, "Sex"]

In [None]:
# One-hot encode Embarked and Title in the training frame; drop_first removes the
# first level of each variable.
train = pd.get_dummies(data=train, drop_first=True, columns=["Embarked", "Title"])

In [None]:
# One-hot encode Embarked in the test frame; drop_first removes the first level.
test = pd.get_dummies(data=test, drop_first=True, columns=["Embarked"])

In [None]:
# One-hot encode Title in the test frame; drop_first removes the first level.
# NOTE(review): the levels come from the test frame alone, so the resulting columns
# may not match the ones produced for train - confirm before modelling.
test = pd.get_dummies(data=test, drop_first=True, columns=["Title"])

In [None]:
# Preview the first five rows of the encoded test frame.
test.head(n=5)

<h2>Feature Selection</h2>

In [None]:
# Rebuild the list: `train` and `test` were rebound by pd.get_dummies, so the earlier
# `full_data` still refers to the frames from before one-hot encoding.
full_data = [train, test]

In [None]:
# List the available columns before choosing which ones to drop.
train.columns

In [None]:
# Columns removed from both frames before modelling.
drop_cols = [
    "PassengerId",
    "Name",
    "SibSp",
    "Parch",
    "Ticket",
    "Cabin",
    "CategoricalAge",
    "CategoricalFare",
]

In [None]:
# Remove the columns listed in `drop_cols` from both frames. This is done in place so
# that the frames referenced by `full_data` stay in sync. errors="ignore" lets the cell
# be re-run after the columns are already gone instead of raising KeyError.
train.drop(columns=drop_cols, inplace=True, errors="ignore")
test.drop(columns=drop_cols, inplace=True, errors="ignore")

# train and test were one-hot encoded independently, so their dummy columns can differ.
# Align test to train's feature columns (everything except the "Survived" target)
# instead of hard-coding a single column name such as "Title_Master".
feature_cols = [col for col in train.columns if col != "Survived"]

# Dummy columns that exist only in test have no counterpart in train.
test_only_cols = [col for col in test.columns if col not in feature_cols]
test.drop(columns=test_only_cols, inplace=True)

# Dummy columns that exist only in train are all-zero for test
# (they are appended at the end; reorder with test[feature_cols] where order matters).
for col in feature_cols:
    if col not in test.columns:
        test[col] = 0

In [None]:
# Preview the first five rows of the final training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the final test frame.
test.head(n=5)

In [None]:

# Number of columns in each frame (train additionally holds the Survived target).
train.shape[1], test.shape[1]