# Refactored from 'Titanic Data Journey.ipynb'

In [1]:
from titanic_package.data_load import data_load, data_missing, data_frequent, data_unique
from titanic_package.data_vis import agg_data, calc_fam_size, age_interval, fare_interval, sex_pclass

## Read the Data

In [2]:
# Load the train and test sets via the project helper. No path is passed here,
# so the file locations are presumably resolved inside data_load — confirm there.
train_df, test_df = data_load()

## A few statistics on the data

### Missing data

In [3]:
# Per-column missing-value summary for the train set: count of missing
# entries ("Total"), percentage missing ("Percent"), and column dtype ("Types").
data_missing(train_df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,0,0,0,0,0,177,0,0,0,0,687,2
Percent,0.0,0.0,0.0,0.0,0.0,19.86532,0.0,0.0,0.0,0.0,77.104377,0.224467
Types,int64,int64,int64,object,object,float64,int64,int64,object,float64,object,object


In [4]:
# Same missing-value summary for the test set (which has no "Survived" column).
data_missing(test_df)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,0,0,0,0,86,0,0,0,1,327,0
Percent,0.0,0.0,0.0,0.0,20.574163,0.0,0.0,0.0,0.239234,78.229665,0.0
Types,int64,int64,object,object,float64,int64,int64,object,float64,object,object


### Most frequent data

In [5]:
# Per-column most frequent value for the train set, with its occurrence count
# and its share of the non-null entries (e.g. Age: 30 / 714 = 4.202%).
data_frequent(train_df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
Most frequent item,1.0,0.0,3.0,"Braund, Mr. Owen Harris",male,24.0,0.0,0.0,347082.0,8.05,B96 B98,S
Frequence,1.0,549.0,491.0,1,577,30.0,608.0,678.0,7.0,43.0,4,644
Percent from total,0.112,61.616,55.107,0.112,64.759,4.202,68.238,76.094,0.786,4.826,1.961,72.441


In [6]:
# Same most-frequent-value summary for the test set.
data_frequent(test_df)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
Most frequent item,892.0,3.0,"Kelly, Mr. James",male,21.0,0.0,0.0,PC 17608,7.75,B57 B59 B63 B66,S
Frequence,1.0,218.0,1,266,17.0,283.0,324.0,5,21.0,3,270
Percent from total,0.239,52.153,0.239,63.636,5.12,67.703,77.512,1.196,5.036,3.297,64.593


### Unique Values

In [7]:
# Per-column non-null count ("Total") and number of distinct values
# ("Uniques") for the train set.
data_unique(train_df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,891,891,891,891,891,714,891,891,891,891,204,889
Uniques,891,2,3,891,2,88,7,7,681,248,147,3


In [8]:
# Same non-null / distinct-value counts for the test set.
data_unique(test_df)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,418,418,418,418,332,418,418,418,417,91,418
Uniques,418,3,418,2,79,7,8,363,169,76,3


## Univariate analysis for all features


We show here two graphs in parallel:
* distribution of class values, split per Survived value
* comparison of class values, in train and test data


Let's first aggregate train and test data into one single dataframe, `all_df`.

In [9]:
# Combine train and test into a single frame. The result carries an extra
# "set" column ("train" for the rows previewed in the next cell); test rows
# are presumably tagged likewise — verify in agg_data.
all_df = agg_data(train_df, test_df)

In [10]:
# Preview the first rows of the combined frame.
all_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,set
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train


In [11]:
# Plot count pairs "Sex"
 

In [12]:
# Plot distribution pairs for "Sex" and hue as "Survived"


In [13]:
# Plot count pairs using all_df for the columns: Sex, Pclass, SibSp, Parch, Embarked

In [14]:
# Plot count pairs using all_df for the columns: Sex, Pclass, SibSp, Parch, Embarked and use "Survived" as hue.

In [15]:
# Plot distribution pairs for Age and Fare

In [16]:
# Plot distribution pairs for Age and Fare using "Survived" as hue

## Family size


Based on SibSp (siblings or spouse) and Parch (parents or children), we set the Family Size field.

In [17]:
# Called for its side effect: adds the "Family Size" column to all_df
# (the column shows up in the later all_df.head() output). Because the frame
# is modified rather than reassigned, re-running this cell re-applies the helper.
calc_fam_size(all_df)

In [18]:
# Apply the same helper to the train frame so it carries the feature as well
# (the effect on train_df is not displayed anywhere in this notebook).
calc_fam_size(train_df)

In [19]:
# Plot count pairs using all_df for the column "Family Size" and use "Survived" as hue.

## Age interval

In [20]:
# Called for its side effect: adds the "Age Interval" column to all_df
# (visible in the all_df.head() output two cells below). The bin edges are
# defined inside the helper and are not visible here.
age_interval(all_df)

In [21]:
# Apply the same helper to the train frame (result not displayed here).
age_interval(train_df)

In [22]:
# Preview the combined frame again to confirm the new "Family Size" and
# "Age Interval" columns.
all_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,set,Family Size,Age Interval
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,2,1.0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,2,2.0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,1,1.0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,2,2.0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,1,2.0


In [23]:
# Plot count pairs using all_df for the column "Age Interval" and use "Survived" as hue.

## Fare interval

In [24]:
# Called for its side effect on all_df; presumably adds a "Fare Interval"
# column (name taken from the plotting note below) — no output in this
# notebook shows it, so confirm against fare_interval.
fare_interval(all_df)

In [25]:
# Apply the same helper to the train frame (result not displayed here).
fare_interval(train_df)

In [26]:
# Plot count pairs using all_df for the column "Fare Interval"

Let's create a composed feature: Pclass + Sex.

In [27]:
# Called for its side effect on train_df; presumably adds the composed
# Pclass + Sex feature — the resulting column name is not shown in this
# notebook, so confirm against sex_pclass.
sex_pclass(train_df)

In [28]:
# Apply the same helper to the combined frame (result not displayed here).
sex_pclass(all_df)

In [29]:
# Plot count pairs using all_df for the column "Fare Interval" and "Fare (grouped by survival)" with "Survived" as hue