## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

# Set the default matplotlib figure size (width, height).
# The setting is written through the already-imported `mpl` module: `pylab`
# is never imported in this notebook, so `pylab.rcParams` raised a NameError.
mpl.rcParams['figure.figsize'] = (10.0, 8.0)

## Reading Data Set using Pandas

In [None]:
# Load the data set from 'train.csv' (path is relative to the working directory) into a DataFrame
titanic_df = pd.read_csv('train.csv')

## Analysis

In [None]:
# Preview the data: show the first 5 rows of the data frame
titanic_df.head()

In [None]:
# Names of all columns in the data frame
titanic_df.columns

In [None]:
# Summary information about the data set (column dtypes and non-null counts)
titanic_df.info()

In [None]:
# Number of passengers in each class (rows with a non-null Pclass, grouped by Pclass)
titanic_df.groupby('Pclass')['Pclass'].count()

In [None]:
# Instead of a group by, use seaborn to plot the count of passengers in each class.
# `catplot` is the current name of the removed `factorplot`; the column is passed
# as `x=` because seaborn no longer accepts it positionally.
fg = sns.catplot(x='Pclass', data=titanic_df, kind='count', aspect=1.5)
fg.set_xlabels('Class')

In [None]:
# Number of passengers of each sex
titanic_df.groupby('Sex')['Sex'].count()

In [None]:
# Instead of a group by, use seaborn to plot the number of males and females.
# `catplot` replaces the removed `factorplot`; `x` must be given by keyword.
sns.catplot(x='Sex', data=titanic_df, kind='count', aspect=1.5)

There were almost twice as many males as females.

In [None]:
# Number of men and women in each passenger class (one count per Sex/Pclass pair)
titanic_df.groupby(['Sex', 'Pclass'])['Sex'].count()

In [None]:
# Again use seaborn: passenger count per class, split by sex.
# `catplot` replaces the removed `factorplot`; `x` must be given by keyword.
g = sns.catplot(x='Pclass', data=titanic_df, hue='Sex', kind='count', aspect=1.75)
g.set_xlabels('Class')

As shown in the figure above, there are more than twice as many males as females in class 3. However, in classes 1
and 2, the ratio of males to females is almost 1.

In [None]:
# Number of passengers who survived, by sex (rows) and class (columns).
# Summing 'Survived' counts survivors (assumes the column is coded 0/1);
# margins=True adds the row and column totals.
# The aggregation is named as the string 'sum' because passing the numpy
# callable is deprecated in recent pandas.
titanic_df.pivot_table(values='Survived', index='Sex', columns='Pclass', aggfunc='sum', margins=True)

In [None]:
# Rows of the passengers whose 'Survived' flag equals 0
not_survived = titanic_df.loc[titanic_df['Survived'].eq(0)]

In [None]:
# Count plot of those who survived vs. those who didn't.
# `catplot` replaces the removed `factorplot`; `x` must be given by keyword.
sns.catplot(x='Survived', data=titanic_df, kind='count')

In [None]:
# Total number of passengers who didn't survive (row count of not_survived)
not_survived.shape[0]

In [None]:
# Number of passengers who did not survive, by sex (rows) and class (columns).
# aggfunc=len counts the rows in every cell; margins=True adds the totals.
not_survived.pivot_table(
    values='Survived',
    index='Sex',
    columns='Pclass',
    aggfunc=len,
    margins=True,
)

In [None]:
# Passenger counts cross-tabulated: rows are (Survived, Pclass), columns are (Sex, Embarked)
table = pd.crosstab(index=[titanic_df.Survived,titanic_df.Pclass], columns=[titanic_df.Sex,titanic_df.Embarked])

In [None]:
# Display the table with a row level pivoted into the columns (the result is not stored)
table.unstack()

In [None]:
# Inspect the column and row indexes of the cross-tabulation
table.columns, table.index

In [None]:
# Replace the coded column labels with readable names.
# set_levels returns a new MultiIndex that is assigned back to the table: its
# `inplace=True` option was removed in pandas 2.0. Labels are substituted
# positionally within each level.
table.columns = (
    table.columns
    .set_levels(['Female', 'Male'], level=0)
    .set_levels(['Cherbourg', 'Queenstown', 'Southampton'], level=1)
)
table

In [None]:
# Report the mean and median passenger age, each formatted without decimals
mean_age = titanic_df.Age.mean()
median_age = titanic_df.Age.median()
print('Average and median age of passengers are %0.f and %0.f years old, respectively' % (mean_age, median_age))

In [None]:
# Descriptive statistics of the Age column
titanic_df.Age.describe()

In [None]:
# Keep only the known ages: drop the records in which the passenger's age is missing
age = titanic_df['Age'].dropna()

In [None]:
# Distribution of age, with an overlay of a density plot.
# `histplot(kde=True)` replaces the deprecated `distplot`; stat='density'
# normalises the bars so they share the y scale of the density curve.
age_dist = sns.histplot(age, kde=True, stat='density')
age_dist.set_title("Distribution of Passengers' Ages")

In [None]:
# Another way to plot a histogram of ages: the pandas hist method, here with 50 bins
titanic_df['Age'].hist(bins=50)

In [None]:
# dtypes of Parch and SibSp, and the number of passengers with a non-missing Cabin
titanic_df['Parch'].dtype, titanic_df['SibSp'].dtype, len(titanic_df.Cabin.dropna())

In [None]:
# Classify a passenger as a child (younger than 16) or by sex
def male_female_child(passenger):
    """Return 'child' for a passenger younger than 16, otherwise the passenger's sex.

    passenger: a two-item sequence holding (age, sex), in that order.
    An age that does not compare as less than 16 (a missing NaN age included)
    falls through to the sex value.
    """
    passenger_age, passenger_sex = passenger
    return 'child' if passenger_age < 16 else passenger_sex

In [None]:
# Add a 'person' column: apply male_female_child row by row (axis=1) to the Age and Sex columns
titanic_df['person'] = titanic_df[['Age', 'Sex']].apply(male_female_child, axis=1)

In [None]:
# Let's have a look at the first 10 rows of the data frame
titanic_df.head(10)

In [None]:
# Count plot of the passengers split by class and by person type (child, female, male).
# `catplot` replaces the removed `factorplot`; `x` must be given by keyword.
sns.catplot(x='Pclass', data=titanic_df, kind='count', hue='person', order=[1, 2, 3],
            hue_order=['child', 'female', 'male'], aspect=2)

In [None]:
# Count the number of passengers for each value of the 'person' column
titanic_df['person'].value_counts()

In [None]:
# Same plot as above, with one panel per value of 'Survived'.
# `catplot` replaces the removed `factorplot`, and the facet size argument
# `size` is now called `height`.
sns.catplot(x='Pclass', data=titanic_df, kind='count', hue='person', col='Survived', order=[1, 2, 3],
            hue_order=['child', 'female', 'male'], aspect=1.25, height=5)

There are many more children in third class than there are in first and second class. However, one might have expected
that there would be more children in 1st and 2nd class than in 3rd class.

### kde plot, Distribution of Passengers' Ages

#### Grouped by Gender

In [None]:
# Kernel density estimate of age, one filled curve per sex.
# `fill=True` is the current spelling of the deprecated `shade=True`.
fig = sns.FacetGrid(titanic_df, hue='Sex', aspect=4)
fig.map(sns.kdeplot, 'Age', fill=True)
# Limit the x axis to the range from 0 to the oldest passenger's age
oldest = titanic_df['Age'].max()
fig.set(xlim=(0, oldest))
fig.set(title='Distribution of Age Grouped by Gender')
fig.add_legend()

In [None]:
# Kernel density estimate of age, one filled curve per value of 'person'.
# `fill=True` is the current spelling of the deprecated `shade=True`.
fig = sns.FacetGrid(titanic_df, hue='person', aspect=4)
fig.map(sns.kdeplot, 'Age', fill=True)
# Limit the x axis to the range from 0 to the oldest passenger's age
oldest = titanic_df['Age'].max()
fig.set(xlim=(0, oldest))
fig.add_legend()

#### Grouped by Class

In [None]:
# Kernel density estimate of age, one filled curve per passenger class.
# `fill=True` is the current spelling of the deprecated `shade=True`.
fig = sns.FacetGrid(titanic_df, hue='Pclass', aspect=4)
fig.map(sns.kdeplot, 'Age', fill=True)
# Limit the x axis to the range from 0 to the oldest passenger's age
oldest = titanic_df['Age'].max()
fig.set(xlim=(0, oldest))
fig.set(title='Distribution of Age Grouped by Class')
fig.add_legend()

From the plot above, the ages in class 1 look roughly normally distributed. However, the distributions for classes 2 and 3 are skewed towards
20- and 30-year-old passengers.

#### What cabins did the Passengers stay in?

In [None]:
# Cabin values of the passengers that have one (missing cabins dropped); preview the first rows
deck = titanic_df['Cabin'].dropna()
deck.head()

In [None]:
# Collect the first character of every cabin value into the list d
d = [cabin[0] for cabin in deck]

In [None]:
# First 10 entries of the list of cabin first letters
d[0:10]

In [None]:
# Tally how many times each first letter occurs in d
from collections import Counter
Counter(d)

In [None]:
# Now let's plot the cabin letter counts. First turn the list d into a data
# frame with a single column named 'Cabin', then draw the count plot.
# `catplot` replaces the removed `factorplot`; `x` must be given by keyword.
cabin_df = DataFrame(d, columns=['Cabin'])
sns.catplot(x='Cabin', data=cabin_df, kind='count', order=['A','B','C','D','E','F','G','T'], aspect=2,
            palette='winter_d')

In [None]:
# Drop the 'T' cabin: keep only the rows whose Cabin letter is not 'T'
cabin_df = cabin_df[cabin_df['Cabin'] != 'T']

In [None]:
# Replot the cabin letter counts as above, now without 'T'.
# `catplot` replaces the removed `factorplot`; `x` must be given by keyword.
sns.catplot(x='Cabin', data=cabin_df, kind='count', order=['A','B','C','D','E','F','G'], aspect=2,
            palette='Greens_d')

In [None]:
# Below is a link to the list of matplotlib colormaps; it is opened with the webbrowser module.
# NOTE(review): this cell has a side effect outside the notebook (it asks the
# system to open a browser) every time it runs, including on "Run All".
url = 'http://matplotlib.org/api/pyplot_summary.html?highlight=colormaps#matplotlib.pyplot.colormaps'
import webbrowser
webbrowser.open(url)

#### Where did the passengers come from, i.e. at which port did they board the ship?

In [None]:
# Passenger count per port of embarkation, split by class.
# `catplot` replaces the removed `factorplot`; `x` must be given by keyword.
sns.catplot(x='Embarked', data=titanic_df, kind='count', hue='Pclass', hue_order=[1, 2, 3], aspect=2,
            order=['C', 'Q', 'S'])

From the figure above, one may conclude that almost all of the passengers who boarded at Queenstown were in third 
class. On the other hand, many who boarded at Cherbourg were in first class. The largest group of passengers 
boarded at Southampton: 353 of them were in third class, 164 in second class and 
127 in first class. To understand why, for example, most passengers who boarded at Queenstown were in third class, one may need to look at the economic situation in these towns at that time.

In [None]:
# Number of passengers for each value of Embarked
titanic_df.Embarked.value_counts()

In [None]:
# For tabulated values, use the pandas crosstab method instead of a seaborn plot
port = pd.crosstab(index=[titanic_df.Pclass], columns=[titanic_df.Embarked])
# Readable column labels, assigned positionally. A flat list keeps a plain
# Index; the previous nested list ([[...]]) built a one-level MultiIndex.
port.columns = ['Cherbourg', 'Queenstown', 'Southampton']

In [None]:
# Display the class-by-port table
port

In [None]:
# Inspect the row index of the table
port.index

In [None]:
# Inspect the column index of the table
port.columns

In [None]:
# Readable row labels, assigned positionally. A flat list keeps a plain
# Index; the previous nested list ([[...]]) built a one-level MultiIndex.
port.index = ['First', 'Second', 'Third']

In [None]:
# Display the table again
port

#### Who was alone and who was with parents or siblings?

In [None]:
# Preview the two columns used below to decide whether a passenger travelled alone
titanic_df[['SibSp','Parch']].head()

In [None]:
# A passenger counts as alone when both SibSp and Parch are 0, i.e. the
# passenger has no siblings or parents aboard.
travels_alone = (titanic_df['SibSp'] == 0) & (titanic_df['Parch'] == 0)

# Alone data frame with its 'Alone' label. .assign returns a new frame, so the
# label is never written into a filtered slice of titanic_df (the previous
# column assignment on the slice triggers pandas' SettingWithCopyWarning).
alone_df = titanic_df[travels_alone].assign(Alone='Alone')
# Not-alone data frame: every remaining passenger, i.e. SibSp != 0 or Parch != 0.
not_alone_df = titanic_df[~travels_alone].assign(Alone='With family')

# Merge the above data frames
comb = [alone_df, not_alone_df]
# Merge and sort by index to restore the original row order
titanic_df = pd.concat(comb).sort_index()

In [None]:
# Number of rows in each group: [alone, not alone]
[len(alone_df), len(not_alone_df)]

In [None]:
# Show the first five records of the alone data frame
alone_df.head()

In [None]:
# Show the first five rows of the not-alone data frame
not_alone_df.head()

In [None]:
# First rows of the recombined data frame, which now carries the 'Alone' column
titanic_df.head()

In [None]:
""" Another way to perform the above
titanic_df['Alone'] = titanic_df.SibSp + titanic_df.Parch

titanic_df['Alone'].loc[titanic_df['Alone']>0] = 'With family'
titanic_df['Alone'].loc[titanic_df['Alone']==0] = 'Alone'"""

In [None]:
# Count of passengers travelling alone vs. with family, split by class, one panel per person type.
# `catplot` replaces the removed `factorplot`; `x` must be given by keyword.
fg = sns.catplot(x='Alone', data=titanic_df, kind='count', hue='Pclass', col='person', hue_order=[1, 2, 3],
                 palette='Blues')
fg.set_xlabels('Status')

From the figure above, it is clear that most children traveled with family in third class. For men, most traveled alone in third class. On the other hand, the number of female passengers who traveled either with family or alone among the second and third class is comparable. However, more women traveled with family than alone in first class. 

### Factors Affecting the Surviving

In [None]:
'''Now let's look at the factors that helped someone survive the sinking. We start this analysis by adding a new
column to the titanic data frame: the Survived column is mapped to the new column with the labels 0:no and 1:yes
using the map method'''
titanic_df['Survivor'] = titanic_df.Survived.map({0:'no', 1:'yes'})

In [None]:
# First rows of the data frame, now including the 'Survivor' column
titanic_df.head()

#### Class Factor

In [None]:
# Survived vs. class, grouped by person type.
# `catplot` replaces the removed `factorplot`. kind='point' is spelled out
# because it was factorplot's default but is not catplot's.
sns.catplot(x='Pclass', y='Survived', hue='person', data=titanic_df, kind='point', order=[1, 2, 3],
            hue_order=['child', 'female', 'male'])

From the figure above, being male or travelling in third class reduced a passenger's chance of survival. 

In [None]:
# Count of survivors / non-survivors split by class, one panel per person type.
# `catplot` replaces the removed `factorplot`; `x` must be given by keyword.
sns.catplot(x='Survivor', data=titanic_df, hue='Pclass', kind='count', palette='Pastel2', hue_order=[1, 2, 3],
            col='person')

### Age Factor

In [None]:
# Linear plot of age vs. survived.
# x and y are passed by keyword: current seaborn rejects them as positional arguments.
sns.lmplot(x='Age', y='Survived', data=titanic_df)

There seems to be a general linear trend between age and the survived field. The plot shows that the older the passenger was, the lower his/her chance of survival.

In [None]:
# Survived vs. age, grouped by sex.
# x and y are passed by keyword: current seaborn rejects them as positional arguments.
sns.lmplot(x='Age', y='Survived', data=titanic_df, hue='Sex')

Older women have a higher rate of survival than older men, as shown in the figure above. Also, older women have a higher
rate of survival than younger women; this is the opposite of the trend for the male passengers.

In [None]:
# Survived vs. age, grouped by class.
# x and y are passed by keyword: current seaborn rejects them as positional arguments.
sns.lmplot(x='Age', y='Survived', hue='Pclass', data=titanic_df, palette='winter', hue_order=[1, 2, 3])

In all three classes, the chance to survive reduced as the passengers got older.

In [None]:
# Ages used as bin centres for the x variable (passed to lmplot as x_bins)
generations = [10, 20, 40, 60, 80]
# x and y are passed by keyword: current seaborn rejects them as positional arguments.
sns.lmplot(x='Age', y='Survived', hue='Pclass', data=titanic_df, x_bins=generations, hue_order=[1, 2, 3])

#### Deck Factor

In [None]:
# Columns currently present in the data frame
titanic_df.columns

In [None]:
# Keep only the rows that have a Cabin value.
# NOTE(review): titanic_DF differs from titanic_df only by letter case - easy to confuse.
titanic_DF = titanic_df.dropna(subset=['Cabin'])

In [None]:
# First 10 entries of d, the list of cabin first letters built earlier in the notebook
d[0:10]

In [None]:
# Compare the number of rows in titanic_DF with the number of letters in d
len(titanic_DF), len(d)

In [None]:
# Deck = first letter of the cabin value. It is derived from this frame's own
# 'Cabin' column instead of the list `d` built many cells earlier, so the
# letters cannot fall out of row alignment and the cell does not depend on
# that earlier state. .assign returns a new frame rather than writing into
# the result of dropna.
titanic_DF = titanic_DF.assign(Deck=titanic_DF['Cabin'].str[0])

In [None]:
# Exclude the rows whose Deck is 'T'
titanic_DF = titanic_DF[titanic_DF.Deck != 'T']

In [None]:
# First rows of the cabin-only data frame, including the 'Deck' column
titanic_DF.head()

In [None]:
# Survived vs. deck.
# `catplot` replaces the removed `factorplot`. kind='point' is spelled out
# because it was factorplot's default but is not catplot's.
sns.catplot(x='Deck', y='Survived', data=titanic_DF, kind='point', order=['A','B','C','D','E','F','G'])

There does not seem to be any relation between deck and the survival rate as shown in the above figure!

#### Family Status Factor

In [None]:
# Survived vs. family status (alone / with family) for all passengers.
# `catplot` replaces the removed `factorplot`. kind='point' is spelled out
# because it was factorplot's default but is not catplot's. The dangling
# commented-out hue arguments were removed.
sns.catplot(x='Alone', y='Survived', data=titanic_df, kind='point', palette='winter')

It seems that the survival rate diminishes significantly for those who were alone. However, let's check whether
gender or age plays a role. From the figure below, one may conclude that the survival rate for women and children
is much higher than that of men, as was concluded previously and as anticipated. However, the difference in survival
rate between those who were with family and those who were alone is not significant for either gender or for children. Moreover, the survival 
rate for women and children increases for those who were alone. For men, the survival rate diminishes slightly 
for those who were alone versus those who were with family.

In [None]:
# Survived vs. family status, split by person type.
# `catplot` replaces the removed `factorplot`. kind='point' is spelled out
# because it was factorplot's default but is not catplot's.
sns.catplot(x='Alone', y='Survived', data=titanic_df, kind='point', palette='winter', hue='person',
            hue_order=['child', 'female', 'male'])

In [None]:
# Lets split it by class now!
sns.factorplot('Alone', 'Survived', data=titanic_df, palette='summer', hue='person', 
               hue_order=['child', 'female', 'male'], col='Pclass', col_order=[1,2,3])

### Predictive Modeling

In [None]:
import sklearn