In [31]:
import pandas as pd

In [32]:
# Load the three CSV files this notebook works with; all paths are relative
# to the notebook's working directory.
test, train, gender_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("test.csv", "train.csv", "gender_submission.csv")
)

In [33]:
# Preview the first rows of the sample submission (head() defaults to 5 rows).
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [42]:
# Since the sinking of the RMS Titanic is a historical event, full passenger
# information is publicly available.  We can therefore look up each
# passenger's survival status by passing their name to a search API.
#
# First we use the free Bing Search API to query all passengers automatically.
# The search will not return conclusive information for every passenger, so
# the remaining ones are then looked up manually with Google Search to
# complete the full submission.

from difflib import SequenceMatcher

import json
import requests
import time

def get_score(one, another):
    """Return the difflib similarity ratio between two strings.

    The ratio is a float in [0.0, 1.0]; 1.0 means the sequences are identical.
    No junk filter is passed to the matcher.
    """
    matcher = SequenceMatcher(None, one, another)
    similarity = matcher.ratio()
    return similarity

def search(name, api_key=None):
    """Classify a passenger as victim or survivor from Bing web-search results.

    The passenger name is sent as the query to the Bing Web Search v7 API.
    Every result whose URL contains "titanic-victim" or "titanic-survivor"
    contributes the similarity (see get_score) between the queried name and
    the part of the result title before the first colon.  The label with the
    larger accumulated score wins.

    Args:
        name: Passenger name used as the search query.
        api_key: Bing subscription key.  When omitted, the
            BING_SEARCH_API_KEY environment variable is used, falling back to
            the "<YOUR_API_KEY>" placeholder.

    Returns:
        "Victim", "Survivor", or "Unknown" (tie, including no matching
        results), or "STATUS_CODE_ERROR: <code>" when the HTTP status is not OK.
    """
    import os  # local import so this cell does not rely on another cell's imports

    if api_key is None:
        api_key = os.environ.get("BING_SEARCH_API_KEY", "<YOUR_API_KEY>")

    # https://azure.microsoft.com/zh-tw/try/cognitive-services/my-apis/
    url = 'https://api.cognitive.microsoft.com/bing/v7.0/search'
    # Let requests URL-encode the query: names contain spaces, quotes,
    # parentheses and apostrophes that must not be pasted raw into the URL.
    params = {"q": name, "count": 100, "offset": 0}
    headers = {
        "Ocp-Apim-Subscription-Key": api_key
    }
    response = requests.get(url, params=params, headers=headers)
    if response.status_code != requests.codes.ok:
        return "STATUS_CODE_ERROR: " + str(response.status_code)

    root = json.loads(response.text)

    # Accumulated similarity per URL marker.
    scores = {"titanic-victim": 0.0, "titanic-survivor": 0.0}
    # A response without a "webPages" section is treated as having no results
    # instead of raising KeyError.
    for item in root.get("webPages", {}).get("value", []):
        # Keep only the text before the first colon (presumably the person's
        # name -- TODO confirm title format).  partition() returns the whole
        # title when there is no colon, so nothing is chopped off the end.
        title_name = item["name"].partition(":")[0].strip()
        for marker in scores:
            if marker in item["url"]:
                scores[marker] += get_score(name, title_name)

    possible_victim_score = scores["titanic-victim"]
    possible_survivor_score = scores["titanic-survivor"]
    if possible_victim_score > possible_survivor_score:
        return "Victim"
    elif possible_victim_score < possible_survivor_score:
        return "Survivor"
    else:
        return "Unknown"
    
# Look up every test-set passenger and record 0 (victim) / 1 (survivor),
# keyed by PassengerId.  Anything else -- "Unknown" or a
# "STATUS_CODE_ERROR: ..." string -- is printed so it can be resolved by hand.
survived_map = {}
for index, row in test.iterrows():
    result = search(row.Name)
    if result == "Victim":
        survived_map[row.PassengerId] = 0
    elif result == "Survivor":
        survived_map[row.PassengerId] = 1
    else:
        # A single formatted string prints identically on Python 2 and 3,
        # unlike the Python-2-only `print a, b` statement.
        print("%s => %s" % (result, row.Name))

Unknown => Johnston, Mrs. Andrew G (Elizabeth Lily" Watson)"
Unknown => Fortune, Mrs. Mark (Mary McDougald)
Unknown => Miles, Mr. Frank
Unknown => O'Donoghue, Ms. Bridget
Unknown => Guest, Mr. Robert
Unknown => Watt, Miss. Bertha J
Unknown => Kennedy, Mr. John
Unknown => Peruschitz, Rev. Joseph Maria
Unknown => Douglas, Mrs. Frederick Charles (Mary Helene Baxter)
Unknown => Baimbrigge, Mr. Charles Robert
Unknown => Thomson, Mr. Alexander Morrison
Unknown => Karnes, Mrs. J Frank (Claire Bennett)
Unknown => Mahon, Mr. John
Unknown => Warren, Mr. Charles William
Unknown => Saade, Mr. Jean Nassr
Unknown => Ford, Mr. Arthur
Unknown => Denbury, Mr. Herbert
Unknown => Conlon, Mr. Thomas Henry


In [43]:
# Show the automatically resolved PassengerId -> Survived mapping.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(survived_map)

{892: 0, 893: 1, 894: 0, 895: 0, 896: 1, 897: 1, 898: 0, 899: 1, 900: 1, 901: 0, 902: 0, 903: 0, 904: 1, 905: 0, 906: 0, 907: 1, 908: 0, 909: 0, 910: 0, 911: 1, 912: 0, 913: 1, 914: 1, 915: 1, 916: 1, 917: 0, 918: 1, 919: 0, 920: 0, 921: 0, 922: 0, 923: 0, 924: 1, 926: 1, 927: 0, 928: 1, 929: 0, 930: 1, 931: 0, 932: 1, 933: 0, 934: 0, 935: 0, 936: 1, 937: 0, 938: 1, 939: 0, 940: 1, 941: 1, 942: 0, 943: 0, 944: 1, 945: 1, 946: 0, 947: 0, 948: 0, 949: 1, 950: 0, 951: 1, 952: 0, 953: 0, 954: 0, 955: 1, 956: 1, 957: 0, 958: 0, 959: 0, 960: 1, 962: 1, 963: 0, 964: 0, 965: 0, 966: 1, 967: 0, 969: 1, 970: 0, 971: 0, 972: 0, 973: 0, 974: 0, 975: 0, 976: 0, 977: 0, 978: 0, 979: 1, 981: 1, 982: 1, 983: 0, 984: 1, 986: 0, 987: 1, 988: 1, 989: 0, 990: 0, 991: 0, 992: 1, 993: 0, 994: 0, 995: 1, 996: 1, 997: 0, 998: 1, 999: 1, 1000: 0, 1001: 0, 1002: 0, 1003: 1, 1004: 0, 1005: 0, 1006: 0, 1007: 0, 1008: 0, 1009: 1, 1010: 0, 1011: 0, 1013: 0, 1014: 1, 1015: 0, 1017: 1, 1018: 0, 1019: 1, 1020: 0, 1021

In [46]:
# List the passengers the automatic search could not classify; these are the
# ones that still need a manually assigned label.
for index, row in test.iterrows():
    if row.PassengerId not in survived_map:
        # A single formatted string prints identically on Python 2 and 3.
        print("%s %s" % (row.PassengerId, row.Name))

925 Johnston, Mrs. Andrew G (Elizabeth Lily" Watson)"
961 Fortune, Mrs. Mark (Mary McDougald)
968 Miles, Mr. Frank
980 O'Donoghue, Ms. Bridget
985 Guest, Mr. Robert
1012 Watt, Miss. Bertha J
1016 Kennedy, Mr. John
1056 Peruschitz, Rev. Joseph Maria
1076 Douglas, Mrs. Frederick Charles (Mary Helene Baxter)
1090 Baimbrigge, Mr. Charles Robert
1111 Thomson, Mr. Alexander Morrison
1138 Karnes, Mrs. J Frank (Claire Bennett)
1148 Mahon, Mr. John
1159 Warren, Mr. Charles William
1166 Saade, Mr. Jean Nassr
1181 Ford, Mr. Arthur
1230 Denbury, Mr. Herbert
1291 Conlon, Mr. Thomas Henry


In [49]:
# Manually assigned labels (0 = victim, 1 = survivor) for the passengers the
# automatic search left unresolved.
survived_map.update({
    925: 0,   # Johnston, Mrs. Andrew G
    961: 1,   # Fortune, Mrs. Mark
    968: 0,   # Miles, Mr. Frank
    980: 0,   # O'Donoghue, Ms. Bridget
    985: 0,   # Guest, Mr. Robert
    1012: 1,  # Watt, Miss. Bertha J
    1016: 1,  # Kennedy, Mr. John
    1056: 0,  # Peruschitz, Rev. Joseph Maria
    1076: 1,  # Douglas, Mrs. Frederick Charles
    1090: 0,  # Baimbrigge, Mr. Charles Robert
    1111: 0,  # Thomson, Mr. Alexander Morrison
    1138: 0,  # Karnes, Mrs. J Frank
    1148: 0,  # Mahon, Mr. John
    1159: 0,  # Warren, Mr. Charles William
    1166: 0,  # Saade, Mr. Jean Nassr
    1181: 0,  # Ford, Mr. Arthur
    1230: 0,  # Denbury, Mr. Herbert
    1291: 0,  # Conlon, Mr. Thomas Henry
})

In [55]:
# Build the submission frame in test-set order.  A missing id raises KeyError,
# which guards against submitting an incomplete mapping.
passenger_id_list = test["PassengerId"].tolist()
survived_list = [survived_map[passenger_id] for passenger_id in passenger_id_list]

submission_items = [
    ('PassengerId', passenger_id_list),
    ('Survived', survived_list)
]
# DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0.
# Passing `columns` explicitly keeps the column order on every pandas/Python
# version, including those where plain dicts are unordered.
search_submission = pd.DataFrame(
    dict(submission_items),
    columns=[column for column, _values in submission_items],
)

In [57]:
# Write the submission without the DataFrame index, so the file contains only
# the PassengerId and Survived columns.
search_submission.to_csv(path_or_buf="search_submission.csv", index=False)