In [1]:
import pandas as pd
import numpy as np
import re
import itertools

###1. Load train and test data

In [2]:
train = pd.read_csv("data/train.csv")
train["dataset"] = "train"
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,dataset
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,train
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,train
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,train
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,train
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,train


In [3]:
test = pd.read_csv("data/test.csv")
test["dataset"] = "test"
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,dataset
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,test
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,test
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,test
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,test
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,test


In [4]:
#Combine both datasets to predict families
train = train.append(test)
train.set_index(train["PassengerId"],inplace=True)

##2. Tokenize name into (surname, title, first name and maiden name)

In [5]:
name_tokenizer = re.compile(r"^(?P<surname>[^,]+), (?P<title>[A-Z a-z]+?)\. (?P<f_name>[A-Z a-z.]+)?(?P<maiden_name>\([A-Za-z .]+\))?")

In [6]:
name_tokens = ["surname","title","f_name","maiden_name"]
for name_tk in name_tokens:
    train[name_tk] = train.Name.apply(lambda x: name_tokenizer.match(x).group(name_tk))
    test[name_tk] = test.Name.apply(lambda x: name_tokenizer.match(x).group(name_tk))
train.head(n=5)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,dataset,surname,title,f_name,maiden_name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,22,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171,train,Braund,Mr,Owen Harris,
2,38,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599,train,Cumings,Mrs,John Bradley,(Florence Briggs Thayer)
3,26,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282,train,Heikkinen,Miss,Laina,
4,35,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803,train,Futrelle,Mrs,Jacques Heath,(Lily May Peel)
5,35,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0,373450,train,Allen,Mr,William Henry,


###2.1 Extract features from Title variable

In [7]:
print train.groupby(["title","Sex"]).size()

title         Sex   
Capt          male        1
Col           male        4
Don           male        1
Dona          female      1
Dr            female      1
              male        7
Jonkheer      male        1
Lady          female      1
Major         male        2
Master        male       61
Miss          female    260
Mlle          female      2
Mme           female      1
Mr            male      757
Mrs           female    197
Ms            female      2
Rev           male        8
Sir           male        1
the Countess  female      1
dtype: int64


It seems we can extract some info from title
1. Whether a woman is married Mme/Mrs vs Miss/Mlle  vs Ms(Undetermined or single :/? )
2. Master title apparently given to male kids
2. Nobility vs laypeople : (Dr, Col, Capt,  ...) vs (Mr,Master,Mrs,Miss). Ambiguous cases (Mlle,Mme,Ms,Don/Dona?)

In [8]:
#Encode special title following this logic
train.has_special_title = train.title.apply(lambda x: x not in ["Mr","Mrs","Miss","Mme","Mlle","Master"])

##3 Examine marriages / sibling relationships

In [14]:
def is_married(couple_rows):
    are_married=False
    if couple_rows.irow(0).Sex != couple_rows.irow(1).Sex:
        #Get who is the husband and whose the wife
        man = couple_rows.irow(0) if couple_rows.irow(0).Sex == "male" else couple_rows.irow(1)
        woman = couple_rows.irow(0) if couple_rows.irow(0).Sex == "female" else couple_rows.irow(1)

        #Marriage tests
        marriage_tests = {}
        marriage_tests["same_f_name"] = woman.f_name is not None and woman.f_name in man.f_name
        marriage_tests["consistent_title"] =  woman.title not in ("Miss","Mlle") and man.title != "Master"
        marriage_tests["same_ticket"] = woman.Ticket == man.Ticket
        marriage_tests["same_pclass"] = woman.Pclass == man.Pclass
        marriage_tests["legal_age"] = (woman.title in ("Mme","Mrs") or woman.Age >= 10) and man.Age > 10
        marriage_tests["consistent_SibSp"] = (woman.SibSp > 0 and man.SibSp > 0) or (woman.SibSp == man.SibSp)

        are_married = marriage_tests["same_f_name"] and marriage_tests["legal_age"] or ( )
        
        consistency_checks = ( marriage_tests["consistent_title"] and 
                               marriage_tests["legal_age"] and 
                              marriage_tests["same_pclass"] and 
                              marriage_tests["same_ticket"] and
                              marriage_tests["consistent_SibSp"])

        if are_married and not consistency_checks:
            failed_tests = ", ".join("{}:{}".format(x,marriage_tests[x]) for x in marriage_tests if not marriage_tests[x])
            print "WARNING: Sketchy marriage: {}".format(failed_tests)
            print couple_rows
            
            print

    return are_married

##Initialize data structures for algorithm

In [15]:
#Data structures - sets to keep track which ones have already been assigned
married_people = set()
people_with_parents = set()

In [16]:
links_to_assign = train[["SibSp","Parch"]]
#Matches a couple with the Max amount of kids they can have 
#Which is the min(husband.Parch, wife.Parch)
marriages_table = {}

### 1. Extract marriages in greedy fashion. Assume is_married has no fp ( might have actually :/ )

In [17]:
#Subset only people who have spouses/siblings on the boat
train_sibsp = train.ix[ train.SibSp > 0]
#People grouped by surname
surname_groups = train_sibsp.groupby("surname").groups

In [18]:
for surname in surname_groups:
    surname_rows = surname_groups[surname]
    couples = itertools.combinations(surname_rows,2)
    for cpl in couples:
        cpl_rows = train_sibsp.ix[list(cpl)]
        if is_married(cpl_rows):
            #Make sure we're not marrying somebody twice :p
            assert cpl[0] not in married_people,"{} is already married :/".format(cpl[0])
            assert cpl[1] not in married_people,"{} is already married :/".format(cpl[1])
            
            #add couples to married set
            married_people.add(cpl[0])
            married_people.add(cpl[1])
            
            marriages_table[cpl] = min(links_to_assign.ix[cpl[0]]["Parch"], links_to_assign.ix[cpl[1]]["Parch"] )
            
            
            #print
#    break

In [19]:
marriages_table

{(26, 1066): 5, (94, 924): 2, (152, 337): 0, (609, 686): 2, (737, 1059): 2}

In [None]:
train.ix[list((26,1066))]

In [26]:
train.ix[ (train.SibSp > 0) | (train.Parch > 0) ].shape

(519, 17)

In [22]:
train

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,dataset,surname,title,f_name,maiden_name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171,train,Braund,Mr,Owen Harris,
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599,train,Cumings,Mrs,John Bradley,(Florence Briggs Thayer)
3,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282,train,Heikkinen,Miss,Laina,
4,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803,train,Futrelle,Mrs,Jacques Heath,(Lily May Peel)
5,35.0,,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,0,373450,train,Allen,Mr,William Henry,
6,,,Q,8.4583,"Moran, Mr. James",0,6,3,male,0,0,330877,train,Moran,Mr,James,
7,54.0,E46,S,51.8625,"McCarthy, Mr. Timothy J",0,7,1,male,0,0,17463,train,McCarthy,Mr,Timothy J,
8,2.0,,S,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,male,3,0,349909,train,Palsson,Master,Gosta Leonard,
9,27.0,,S,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,female,0,1,347742,train,Johnson,Mrs,Oscar W,(Elisabeth Vilhelmina Berg)
10,14.0,,C,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,female,1,1,237736,train,Nasser,Mrs,Nicholas,(Adele Achem)
