# Imports 

In [13]:
# Basic pandas and numpy
import pandas as pd
import numpy as np
 
# Basic visualization tools
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
 
# Pandas display limits: show up to 500 columns and 500 rows before truncating.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
 
# Make jupyter bigger: stretch the notebook container to the full page width.
# ``display`` and ``HTML`` come from the public ``IPython.display`` module;
# importing them from ``IPython.core.display`` is deprecated.
from IPython.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))

In [14]:
# Load the working dataset; the path is relative to the notebook's folder.
categorical = pd.read_csv(filepath_or_buffer='../Data/df3.csv')

In [15]:
# Preview the first five rows of the freshly loaded frame.
categorical.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Cabin

We will keep only the first letter of each cabin code.

In [16]:
def cabinextractor(string):
    """
    Return the first ASCII letter found in a cabin code.

    Parameters
    ----------
    string : str
        Raw cabin value, e.g. ``"C85"`` or ``"B57 B59"``.

    Returns
    -------
    str or None
        The first character matching ``[a-zA-Z]``, an empty string when the
        value contains no letters, or ``None`` when the value is not a string
        (for example a missing-value NaN).
    """
    import re

    # Non-string input (NaN, None, numbers) used to be absorbed by a bare
    # ``except: pass``. Checking the type explicitly keeps the same ``None``
    # result without also hiding unrelated errors or interrupts.
    if not isinstance(string, str):
        return None

    first_letter = re.search("[a-zA-Z]", string)
    return first_letter.group(0) if first_letter else ""

In [17]:
# Collapse every cabin code to its first letter (missing cabins stay missing).
categorical["Cabin"] = categorical["Cabin"].apply(cabinextractor)

In [18]:
# Check the first five rows: Cabin should now hold a single letter.
categorical.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [19]:
# Number of rows per cabin letter (missing cabins are not counted).
categorical["Cabin"].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64

# Name

We will get the length of the string containing the name to check if it correlates with mortality:

In [8]:
# Replace each name with its character count. The vectorised ``.str.len()``
# gives the same result as ``apply(len)`` on strings, but yields NaN for a
# missing name instead of raising TypeError.
# Run once per load: afterwards the column is numeric and has no ``.str``.
categorical["Name"] = categorical["Name"].str.len()

# Ticket 

Similarly to what was done with the cabins, we will extract the letters from each ticket to check whether there is a pattern or a dominant group:

In [9]:
def ticketextractor(string):
    """
    Strip every character that is not an ASCII letter from a ticket code.

    Parameters
    ----------
    string : str
        Raw ticket value, e.g. ``"PC 17599"`` or ``"113803"``.

    Returns
    -------
    str or None
        The letters of ``string`` in their original order (``""`` when the
        ticket contains no letters), or ``None`` when the value is not a
        string.
    """
    import re

    # Non-string input used to be absorbed by a bare ``except: pass``.
    # Checking the type explicitly keeps the same ``None`` result without
    # also hiding unrelated errors or interrupts.
    if not isinstance(string, str):
        return None

    return re.sub("[^a-zA-Z]", "", string)

In [10]:
# Reduce every ticket to its letters only (purely numeric tickets become '').
categorical["Ticket"] = categorical["Ticket"].apply(ticketextractor)

# Five most frequent letter codes. The column was cleaned on the line above,
# so it is counted directly instead of running ticketextractor a second time.
top5 = categorical["Ticket"].value_counts().head()

# Display the counts that the following discussion refers to.
top5

There are two main groups of tickets containing letters, PC and CA. We will classify this variable as Numeric (tickets with no letters), PC, CA, or Other.

We replace the empty string with 'Numeric', keep the three most frequent categories (expected to be 'Numeric', 'PC' and 'CA'), and recode everything else as 'Other':

In [11]:
# Tickets with no letters at all were reduced to '' - label them 'Numeric'.
categorical.loc[categorical["Ticket"] == '', 'Ticket'] = 'Numeric'

# Frequency of each ticket label. The column already holds letters only, so
# ticketextractor is not re-applied. 'Other' is removed from the candidates so
# that re-running this cell cannot promote the catch-all bucket into the top 3
# and collapse a genuine category into it.
ticket_counts = categorical["Ticket"].value_counts().drop('Other', errors='ignore')
top5 = ticket_counts.head()

# Index holding the three most frequent labels.
top3 = top5.nlargest(3).index

# Every ticket outside the top 3 is relabelled 'Other'.
categorical["Ticket"] = categorical["Ticket"].where(categorical["Ticket"].isin(top3), other='Other')

In [12]:
# Final check of the first five rows after recoding Name, Ticket and Cabin.
categorical.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,23,male,22.0,1,0,Other,7.25,,S
1,1,1,51,female,38.0,1,0,PC,71.2833,C,C
2,1,3,22,female,26.0,0,0,Other,7.925,,S
3,1,1,44,female,35.0,1,0,Numeric,53.1,C,S
4,0,3,24,male,35.0,0,0,Numeric,8.05,,S
