In [1]:
import pandas
import numpy

# LOAD TRAIN DATA

train_data = pandas.read_csv("../../data/titanic/train.csv")

train_data.head(25)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [2]:
# CLEAN TRAIN DATA I

print("PRE-CLEAN: TOTAL AMOUNT OF RECORDS: {} \n".format(len(train_data)))

for column in train_data.columns:
    
    print("{}: {} empty records".format(column, len(train_data.loc[pandas.isnull(train_data[column])])))
    
# DELETING ROWS WITHOUT AGE

train_data = train_data[pandas.notnull(train_data.Age)]

# DELETING CABIN COLUMN

del train_data['Cabin']

print("\nPOST-CLEAN: TOTAL AMOUNT OF RECORDS: {} \n".format(len(train_data)))

PRE-CLEAN: TOTAL AMOUNT OF RECORDS: 891 

PassengerId: 0 empty records
Survived: 0 empty records
Pclass: 0 empty records
Name: 0 empty records
Sex: 0 empty records
Age: 177 empty records
SibSp: 0 empty records
Parch: 0 empty records
Ticket: 0 empty records
Fare: 0 empty records
Cabin: 687 empty records
Embarked: 2 empty records

POST-CLEAN: TOTAL AMOUNT OF RECORDS: 714 



In [3]:
## SOMETHING ABOUT EMBARKED

train_data.loc[pandas.isnull(train_data.Embarked)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,


In [4]:
# CLEAN TRAIN DATA II

## ADD AGE CLASSIFICATION

train_data.insert(6, 'AgeClass', '')

ageClassifications = [
    {
        'name': 'Child',
        'min': 0,
        'max': 14
    },
    {
        'name': 'Young adult',
        'min': 15,
        'max': 24
    },
    {
        'name': 'Adult',
        'min': 25,
        'max': 50
    },
    {
        'name': 'Senior',
        'min': 51,
        'max': 80
    }
]

for index, row in train_data.iterrows():
    
    rowAge = row['Age']
    ageClass = 'Senior'
    
    for ageClassification in ageClassifications:
        
        if ageClassification['min'] <= rowAge and rowAge <= ageClassification['max']:
            
            ageClass = ageClassification['name']
            
    train_data.at[index, 'AgeClass'] = ageClass

## SORT DATA ON TICKET NUMBER AND PASSENGER ID

train_data = train_data.sort_values(by=['Ticket', 'PassengerId'])

train_data[train_data.duplicated(subset=['Ticket'], keep=False)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,AgeClass,SibSp,Parch,Ticket,Fare,Embarked
257,258,1,1,"Cherry, Miss. Gladys",female,30.00,Adult,0,0,110152,86.5000,S
504,505,1,1,"Maioni, Miss. Roberta",female,16.00,Young adult,0,0,110152,86.5000,S
759,760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.00,Adult,0,0,110152,86.5000,S
262,263,0,1,"Taussig, Mr. Emil",male,52.00,Senior,1,1,110413,79.6500,S
558,559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39.00,Adult,1,1,110413,79.6500,S
585,586,1,1,"Taussig, Miss. Ruth",female,18.00,Young adult,0,2,110413,79.6500,S
329,330,1,1,"Hippach, Miss. Jean Gertrude",female,16.00,Young adult,0,1,111361,57.9792,C
523,524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44.00,Adult,0,1,111361,57.9792,C
61,62,1,1,"Icard, Miss. Amelie",female,38.00,Adult,0,0,113572,80.0000,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.00,Senior,0,0,113572,80.0000,


In [5]:
train_data.groupby(["AgeClass"]).mean().sort_values(by='Survived')

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
AgeClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Senior,442.378788,0.333333,1.590909,57.530303,0.257576,0.287879,42.624433
Young adult,432.623116,0.366834,2.462312,20.190955,0.386935,0.321608,31.117734
Adult,462.52957,0.403226,2.150538,34.814516,0.352151,0.327957,35.773521
Child,427.766234,0.584416,2.623377,5.703506,1.831169,1.337662,31.928464
