In [1]:
# data analysis and wrangling
import numpy as np 
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# machine learning 
from sklearn.linear_model import LogisticRegression




In [2]:
# Load the Titanic train, test and submission CSVs.
# The input directory is named once so the notebook can be pointed at a
# different location by editing a single line.
DATA_DIR = '../input/titanic'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
gender_submission_df = pd.read_csv(f'{DATA_DIR}/gender_submission.csv')

In [4]:
# Preview the first ten rows of the training set.
train_df.head(n=10)

In [131]:
# Histogram of the Age column in the training set. The figure is titled and
# labelled so it stands alone, and the trailing ';' hides the Axes repr.
age_ax = train_df['Age'].plot.hist(title='Age distribution (training set)')
age_ax.set_xlabel('Age (years)');

In [15]:
# Preview the first ten rows of the test set.
test_df.head(n=10)

In [17]:
# Preview the first ten rows of the submission file.
gender_submission_df.head(n=10)

## Data Descriptions

**survival:** Whether the passenger survived, 0 = No, 1 = Yes

**pclass (Ticket class):** 1 = 1st, 2 = 2nd, 3 = 3rd

**sex:** Sex of the passenger (male or female)

**Age:** Age in years

**sibsp:** number of siblings/spouses aboard the Titanic

**parch:** number of parents/children aboard the Titanic

**ticket:** Ticket number

**fare:** Passenger fare

**cabin:** Cabin number

**embarked:** Port of Embarkation, C = Cherbourg, Q = Queenstown, S = Southampton

In [21]:
# Summary statistics for every training column; include='all' also covers
# the non-numeric columns.
train_df.describe(include='all')

In [55]:
# Summary statistics for every test column; include='all' also covers
# the non-numeric columns.
test_df.describe(include='all')

In [52]:
# Column dtypes and non-null counts for both datasets, followed by the
# shape of each loaded frame. (Printed labels corrected: 'tain_df' -> 'train_df'.)
print('train_df')
train_df.info()
print('='*50+'\n')
print('test_df')
test_df.info()
print('='*50+'\n')
print('the shape of train_df is :', train_df.shape)
print('the shape of test_df is :', test_df.shape)
print('the shape of gender_submission_df is :', gender_submission_df.shape)

In [54]:
# Missing values per column for each dataset.
# display() is needed for the first table because only the last expression
# of a cell is rendered automatically.
print('train_df')
display(train_df.isna().sum())
print('='*50+'\n')
print('test_df')
test_df.isna().sum()


# Analyze & Visualize data

In [240]:

# Balance of the target column: counts as text, then as horizontal bars.
print(train_df['Survived'].value_counts())
fig = plt.figure(figsize=(10, 5))
sns.countplot(y='Survived', data=train_df);

In [115]:
# Mean of Survived (i.e. the survival rate) for each ticket class.
(
    train_df
    .groupby('Pclass', as_index=False)[['Survived']]
    .mean()
)

In [141]:
# Passenger counts per ticket class, split by the Survived value.
fig = plt.figure(figsize=(10, 5))
sns.countplot(x='Pclass', hue='Survived', data=train_df);

In [241]:
# Mean of Survived (i.e. the survival rate) for each embarkation port.
(
    train_df
    .groupby('Embarked', as_index=False)[['Survived']]
    .mean()
)

In [238]:
# Passenger counts per embarkation port, split by the Survived value.
fig = plt.figure(figsize=(10, 5))
sns.countplot(x='Embarked', hue='Survived', data=train_df);

In [106]:
# Mean of Survived (i.e. the survival rate) for each sex.
(
    train_df
    .groupby('Sex', as_index=False)[['Survived']]
    .mean()
)

In [142]:
# Passenger counts per sex, split by the Survived value.
fig = plt.figure(figsize=(10, 5))
sns.countplot(x='Sex', hue='Survived', data=train_df);

In [124]:
# Mean of Survived per SibSp value, highest survival rate first.
(
    train_df
    .groupby('SibSp', as_index=False)[['Survived']]
    .mean()
    .sort_values('Survived', ascending=False)
)

In [143]:
# Passenger counts per SibSp value, split by the Survived value.
fig = plt.figure(figsize=(10, 5))
sns.countplot(x='SibSp', hue='Survived', data=train_df);

In [128]:
# Mean of Survived per Parch value, highest survival rate first.
(
    train_df
    .groupby('Parch', as_index=False)[['Survived']]
    .mean()
    .sort_values('Survived', ascending=False)
)

In [249]:
# Passenger counts per Parch value, split by the Survived value.
fig = plt.figure(figsize=(10, 5))
sns.countplot(x='Parch', hue='Survived', data=train_df)
# Place the legend at the top centre of the axes.
plt.legend(loc='upper center', title='Survived');

In [243]:
# Mean fare for each value of Survived.
(
    train_df
    .groupby('Survived', as_index=False)[['Fare']]
    .mean()
)

In [247]:
# Fare histograms (20 bins), one panel per Survived value.
grid = sns.FacetGrid(data=train_df, col='Survived', height=5)
grid.map(plt.hist, 'Fare', bins=20)
grid.set_ylabels('Count');

In [250]:
# Age histograms (30 bins), one panel per Survived value.
grid = sns.FacetGrid(data=train_df, col='Survived', height=5)
grid.map(plt.hist, 'Age', bins=30)
grid.set_ylabels('Count');

In [251]:
# Age histograms (30 bins) on a grid: rows are ticket classes, columns are
# Survived values.
grid = sns.FacetGrid(
    data=train_df,
    row='Pclass',
    col='Survived',
    height=3,
    aspect=1.7,
)
grid.map(plt.hist, 'Age', bins=30)
grid.set_ylabels('Count');

In [236]:
# Bar plot of Fare per sex on a grid: rows are embarkation ports, columns
# are Survived values. The bar order is fixed so every panel is comparable.
grid = sns.FacetGrid(
    data=train_df,
    row='Embarked',
    col='Survived',
    height=3,
    aspect=1.7,
)
grid.map(sns.barplot, 'Sex', 'Fare', order=['male', 'female'], palette='muted')
grid.add_legend();

In [219]:
# Violin plots of Survived against ticket class, coloured by sex, with one
# panel per embarkation port. Category orders are fixed so panels match.
grid = sns.FacetGrid(data=train_df, col='Embarked', height=3, aspect=1.7)
grid.map(
    sns.violinplot,
    'Pclass',
    'Survived',
    'Sex',
    order=[1, 2, 3],
    hue_order=['male', 'female'],
    palette='muted',
)
grid.add_legend();


# Wrangle data

In [253]:
# Show a single training row as a reminder of the available columns.
train_df.head(n=1)

In [259]:
# Derive train_df2 / test_df2 by removing the same five columns from each
# frame. drop() returns a new frame, so train_df and test_df are unchanged.
train_df2 = train_df.drop(columns=['Ticket', 'Cabin', 'Name', 'PassengerId', 'Age'])
test_df2 = test_df.drop(columns=['Ticket', 'Cabin', 'Name', 'PassengerId', 'Age'])

In [260]:
# Check which columns remain after the drop.
train_df2.head(n=1)