Dataset link - https://www.kaggle.com/c/titanic/data

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Modules

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the pandas/seaborn/sklearn calls below.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Loading Dataset

In [5]:
# Read the competition's train and test CSV files into two DataFrames.
train, test = map(
    pd.read_csv,
    ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv'),
)

In [6]:
# Preview the first rows of the training data.
train.head()

## Overview of the training data

In [7]:
# Statistical analysis: summary statistics for the training frame
# (shown as the cell output).

train.describe()

In [8]:
# Datatype info: per-column dtype and non-null count, used to spot missing values.
train.info()

# Inference - The Age and Cabin columns have null values (Embarked is also imputed during preprocessing below)

## Exploratory data analysis

#### Categorical Attributes

In [9]:
# Count of passengers per Survived value.
# Frame and column are passed by keyword: seaborn 0.12+ binds the first
# positional argument to `data`, so a bare Series no longer gives per-category bars.
sns.countplot(data=train, x='Survived')

# Inference - (Unbalanced) Number of people who died is more

In [10]:
# Count of passengers per ticket class.
# Keyword form is required for per-category bars on seaborn 0.12+ (first
# positional argument is `data`, not `x`).
sns.countplot(data=train, x='Pclass')

# Inference - (Unbalanced) Number of people in 3rd class is way more

In [11]:
# Count of passengers per sex.
# Keyword form is required for per-category bars on seaborn 0.12+ (first
# positional argument is `data`, not `x`).
sns.countplot(data=train, x='Sex')

# Inference - (Unbalanced) Number of males is more

In [12]:
# Count of passengers per SibSp value.
# Keyword form is required for per-category bars on seaborn 0.12+ (first
# positional argument is `data`, not `x`).
sns.countplot(data=train, x='SibSp')

# Inference - (Unbalanced) Most passengers travelled without a sibling or spouse

In [13]:
# Count of passengers per Parch value.
# Keyword form is required for per-category bars on seaborn 0.12+ (first
# positional argument is `data`, not `x`).
sns.countplot(data=train, x='Parch')

# Inference - (Unbalanced) Most passengers travelled without parents or children

In [14]:
# Count of passengers per embarkation port.
# Keyword form is required for per-category bars on seaborn 0.12+ (first
# positional argument is `data`, not `x`).
sns.countplot(data=train, x='Embarked')

# Inference - More people embarked from S = Southampton port

#### Numerical Attributes

In [15]:
# Distribution of passenger age.
# sns.distplot is deprecated (seaborn 0.11+) and scheduled for removal; histplot
# with a KDE overlay on a density scale draws the same histogram-plus-curve view.
sns.histplot(data=train, x='Age', kde=True, stat='density')

# Inference - (Roughly bell-shaped curve) Most passengers are between 20 and 35 years old

In [16]:
# Distribution of ticket fare.
# sns.distplot is deprecated (seaborn 0.11+) and scheduled for removal; histplot
# with a KDE overlay on a density scale draws the same histogram-plus-curve view.
sns.histplot(data=train, x='Fare', kde=True, stat='density')

# Inference - As number of people in 3rd class is more, fare in range of $10-50 has 
# high density 

In [17]:
# Average fare per passenger class (pivot_table aggregates with the mean by default),
# drawn as a bar chart with horizontal tick labels.
class_fare = train.pivot_table(values='Fare', index='Pclass')
ax = class_fare.plot.bar(rot=0)
ax.set_xlabel('Pclass')
ax.set_ylabel('Avg. Fare')
plt.show()

In [18]:
# Total fare collected per passenger class, drawn as a bar chart.
# The aggregation is named by string: passing the NumPy callable (np.sum) to
# pivot_table is deprecated in recent pandas releases.
class_fare = train.pivot_table(index=['Pclass'], values=['Fare'], aggfunc='sum')
class_fare.plot(kind='bar')
plt.xlabel('Pclass')
plt.ylabel('Total Fare')
plt.xticks(rotation=0)  # keep the class labels horizontal
plt.show()

In [19]:
# Fare per passenger class, with separate bars for each Survived value.
sns.barplot(data=train, x='Pclass', y='Fare', hue='Survived')

In [20]:
# Same data with the axes swapped: Fare per Survived value, with separate bars for each class.
sns.barplot(data=train, x='Survived', y='Fare', hue='Pclass')

## Data Preprocessing

In [21]:
# Remember how many rows came from the training set so the combined frame
# can be split back into train/test later.
train_len = len(train)
# Stack train on top of test (row-wise) and renumber the rows 0..n-1 in one step.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.head()

In [22]:
# Last rows of the combined frame; these come from the test set, which was appended after train.
df.tail()

In [25]:
# Count the missing entries in each column of the combined frame.
df.isna().sum()

# The Cabin column has a large number of null values, so we will drop it
# We will fill the null values in the other columns

In [26]:
# Remove the Cabin column (`columns=` already implies the column axis).
df = df.drop(columns='Cabin')

In [27]:
# Fill missing values of the numerical column with its mean.
# NOTE(review): the mean is taken over the combined train+test frame.

df = df.assign(Age=df['Age'].fillna(df['Age'].mean()))

In [28]:
# Replace missing fares with the mean fare.
# NOTE(review): the mean is taken over the combined train+test frame.
df = df.assign(Fare=df['Fare'].fillna(df['Fare'].mean()))

In [32]:
# Fill missing values of the categorical column with its mode.
# mode() returns a Series; its first entry is used as the fill value.

most_common_port = df['Embarked'].mode().iloc[0]
df['Embarked'] = df['Embarked'].fillna(most_common_port)

## Log Transformation to reduce the skew of the Fare distribution

In [38]:
# Fare distribution of the combined frame before the log transform.
# sns.distplot is deprecated (seaborn 0.11+) and scheduled for removal; histplot
# with a KDE overlay on a density scale draws the same histogram-plus-curve view.
sns.histplot(data=df, x='Fare', kde=True, stat='density')

In [39]:
# log(1 + Fare): the +1 keeps zero fares finite. np.log1p computes this directly
# and is more accurate than np.log(x + 1) for small x.
# NOTE: this overwrites Fare in place, so re-running the cell applies the
# transform a second time; re-run from the data-loading cell instead.
df['Fare'] = np.log1p(df['Fare'])

In [40]:
# Fare distribution of the combined frame after the log transform.
# sns.distplot is deprecated (seaborn 0.11+) and scheduled for removal; histplot
# with a KDE overlay on a density scale draws the same histogram-plus-curve view.
sns.histplot(data=df, x='Fare', kde=True, stat='density')

## Correlation Matrix

In [42]:
# Pairwise correlation of the numeric columns only. At this point df still holds
# string columns (Name, Sex, Ticket, Embarked), and pandas 2.0+ raises on them
# instead of silently skipping them as older versions did.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(15,9))
# Annotated heatmap with a diverging palette.
sns.heatmap(corr, annot=True, cmap='coolwarm')

# As seen previously, Pclass and Fare are strongly correlated (negatively: the fare drops as the class number rises)

In [43]:
# Inspect the combined frame before dropping further columns.
df.head()

In [44]:
# Drop the columns that are not used as model inputs
# (`columns=` already implies the column axis).

df = df.drop(columns=['Name', 'Ticket'])

In [45]:
# Confirm that Name and Ticket are gone.
df.head()

## Label Encoding

In [46]:
from sklearn.preprocessing import LabelEncoder

# Replace the string categories in Sex and Embarked with integer codes.
# A single encoder is re-fit for each column, so after the loop `le` only
# holds the mapping of the last column processed.
le = LabelEncoder()
cols = ['Sex','Embarked']

for col in cols:
    encoded_values = le.fit_transform(df[col])
    df[col] = encoded_values

df.head()

## Train-Test Split

In [50]:
# Undo the earlier concatenation: the first `train_len` rows are the training
# set, everything after them is the test set.
train, test = df.iloc[:train_len], df.iloc[train_len:]

In [51]:
# Preview the preprocessed training rows.
train.head()

In [54]:
# Preview the preprocessed test rows.
test.head()

In [55]:
# Input split: `y` is the label column, `x` is every remaining column except
# the PassengerId identifier.
# NOTE(review): Survived presumably became float when train and test were
# concatenated (test rows have no label) - confirm before writing predictions out.

y = train['Survived']
x = train.drop(columns=['Survived', 'PassengerId'])

In [56]:
# Preview the feature matrix passed to the models.
x.head()

## Model Training

In [57]:
from sklearn.model_selection import train_test_split, cross_val_score

def classify(model, features=None, target=None, *, test_size=0.25, random_state=42, cv=5):
    """Fit ``model`` on a hold-out split and print two scores.

    Prints ``model.score`` on the held-out part of a single train/test split,
    then the mean of ``cross_val_score`` over the full data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    features, target : optional inputs and labels. When omitted, the
        notebook-level ``x`` and ``y`` are used, so ``classify(model)``
        behaves exactly as before.
    test_size : fraction of rows held out for the single split.
    random_state : seed forwarded to ``train_test_split``.
    cv : number of cross-validation folds.

    Returns
    -------
    None. Both scores are printed.
    """
    # Fall back to the notebook globals only when no data is passed explicitly.
    if features is None:
        features = x
    if target is None:
        target = y

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    accuracy = model.score(x_test, y_test)
    print("Accuracy: ", accuracy)

    # Cross-validation runs on the full data, not just the training part of the split.
    score = cross_val_score(model, features, target, cv=cv)
    print("CV Score: ", np.mean(score))

### Logistic Regression

In [58]:
from sklearn.linear_model import LogisticRegression
# The default iteration cap (100) is easily hit on unscaled features, and the
# resulting ConvergenceWarning is hidden by the notebook-wide warning filter;
# a higher cap gives the solver room to converge.
model = LogisticRegression(max_iter=1000)
classify(model)

### Decision Tree

In [60]:
from sklearn.tree import DecisionTreeClassifier
# Seed the estimator so the printed scores are reproducible across runs
# (same seed as the train/test split inside classify).
model = DecisionTreeClassifier(random_state=42)
classify(model)

### Random Forest

In [61]:
from sklearn.ensemble import RandomForestClassifier
# Seed the estimator so the printed scores are reproducible across runs
# (same seed as the train/test split inside classify).
model = RandomForestClassifier(random_state=42)
classify(model)

### Extra Trees Classifier

In [62]:
from sklearn.ensemble import ExtraTreesClassifier
# Seed the estimator so the printed scores are reproducible across runs
# (same seed as the train/test split inside classify).
model = ExtraTreesClassifier(random_state=42)
classify(model)