In [1]:
#显示程序理解
#数据结构：乘客的各种条件及其所对应的值
#处理目的：将有关的列提取出来形成新的列表，无关列表删除，并进行显示

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
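## Row-filtering helper
# Purpose: drop the rows (passengers) that do not satisfy a condition and
# return the remaining rows as a new, re-indexed table. Columns are not changed.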
def filter_data(data, condition):
    """
    Keep only the rows of `data` that satisfy one condition.

    The condition is a single string of the form
      '<field> <op> <value>'
    with the three parts separated by single spaces, where <op> is one of:
    >, <, >=, <=, ==, !=

    <value> is compared as a float when it parses as one; otherwise it is
    treated as a string with surrounding quote characters stripped. Because
    only the first two spaces are used as separators, a string value may
    itself contain spaces.

    Examples: "Sex == 'male'", 'Age < 18', "Name == 'John Smith'"

    Parameters:
      data      -- table indexed by column name (used as `data[field]` and
                   `data[mask]`, e.g. a pandas DataFrame)
      condition -- the condition string described above

    Returns the matching rows with the index reset to 0..n-1.

    Raises ValueError if the condition does not have three parts or uses an
    unsupported operator. A field that is not in `data` surfaces as whatever
    `data[field]` raises.
    """
    # Imported locally so the helper does not depend on the file's import cell.
    import operator

    comparisons = {
        ">": operator.gt,
        "<": operator.lt,
        ">=": operator.ge,
        "<=": operator.le,
        "==": operator.eq,
        "!=": operator.ne,
    }

    # Split on the first two spaces only, so the value keeps any inner spaces.
    parts = condition.split(" ", 2)
    if len(parts) != 3:
        raise ValueError(
            "Condition must have the form '<field> <op> <value>': {!r}".format(condition))
    field, op, value = parts

    # Convert the value to a number, or strip excess quotes if it is a string.
    # Only ValueError is caught: that is what float() raises for non-numeric text.
    try:
        value = float(value)
    except ValueError:
        value = value.strip("\'\"")

    # Catch invalid operation codes before touching the data.
    if op not in comparisons:
        raise ValueError("Invalid comparison operator. Only >, <, >=, <=, ==, != allowed.")

    # Boolean mask with one entry per row.
    matches = comparisons[op](data[field], value)

    # drop=True discards the old index instead of keeping it as a new column.
    return data[matches].reset_index(drop=True)

In [6]:
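# Plot survival counts for one feature of interest.
# Outline:
# 1. Check that the requested feature exists in the data; if not, say so and stop.
# 2. Refuse 'Cabin', 'PassengerId' and 'Ticket': they have too many distinct
#    values to plot, so report that and stop.
# 3. Otherwise join the features with the outcomes, apply the row filters
#    (using the filtering helper above) and keep only the feature and 'Survived'.
# 4. Numerical features ('Age', 'Fare'): drop passengers with no value, split the
#    value range into bins, and overlay histograms of survivors and non-survivors.
# 5. Categorical features: list the categories, count survivors and non-survivors
#    in each, and draw the two counts as side-by-side bars.
# 6. Report how many passengers have a missing value for the feature.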
def survival_stats(data, outcomes, key, filters=None):
    """
    Plot how many passengers survived and did not survive, broken down by one feature.

    'Age' and 'Fare' are drawn as two overlaid histograms; every other
    feature is drawn as side-by-side bars, one pair per category.
    After the plot is shown, a line is printed reporting how many of the
    (filtered) passengers have no value for the feature.

    Parameters:
      data     -- DataFrame of passenger features
      outcomes -- survival outcomes aligned with `data`; after concatenation
                  the result must have a 'Survived' column with 1 = survived
                  and 0 = did not survive (presumably a Series named
                  'Survived' -- confirm against the caller)
      key      -- name of the feature (column of `data`) to plot
      filters  -- optional list of condition strings, each passed to
                  filter_data, applied in order before plotting

    Returns False (after printing an explanation) when `key` is not a column
    of `data` or is one of 'Cabin', 'PassengerId', 'Ticket'; otherwise None.
    """
    # A None default avoids sharing one mutable list object between calls.
    if filters is None:
        filters = []

    # 1. Check that the key exists
    if key not in data.columns.values:
        print("'{}' is not a feature of the Titanic data. Did you spell something wrong?".format(key))
        return False

    # 2. Return before visualizing if 'Cabin', 'PassengerId' or 'Ticket'
    # is selected: too many unique categories to display
    if key in ('Cabin', 'PassengerId', 'Ticket'):
        print("'{}' has too many unique categories to display! Try a different feature.".format(key))
        return False

    # 3. Merge data and outcomes column-wise into a single dataframe
    all_data = pd.concat([data, outcomes], axis=1)

    # Apply filters to data
    for condition in filters:
        all_data = filter_data(all_data, condition)

    # Only the plotted feature and the outcome are needed from here on
    all_data = all_data[[key, 'Survived']]

    # Record the passengers with a missing value NOW: the numerical branch
    # below drops those rows, and the report at the end must still see them.
    missing = all_data[key].isnull()
    nan_outcomes = all_data.loc[missing, 'Survived']

    # Create plotting figure
    plt.figure(figsize=(8, 6))

    # 4. 'Numerical' features
    if key in ('Age', 'Fare'):

        # Remove rows with no value; they cannot be binned
        all_data = all_data[~missing]

        # 'Fare' has a larger range of values than 'Age', so it gets wider bins
        bin_width = 20 if key == 'Fare' else 10
        bins = np.arange(0, all_data[key].max() + bin_width, bin_width)

        # Overlay the two histograms; alpha keeps both visible where they overlap
        nonsurv_vals = all_data.loc[all_data['Survived'] == 0, key].reset_index(drop=True)
        surv_vals = all_data.loc[all_data['Survived'] == 1, key].reset_index(drop=True)
        plt.hist(nonsurv_vals, bins=bins, alpha=0.6,
                 color='red', label='Did not survive')
        plt.hist(surv_vals, bins=bins, alpha=0.6,
                 color='green', label='Survived')

        # Add legend to plot
        plt.xlim(0, bins.max())
        plt.legend(framealpha=0.8)

    # 5. 'Categorical' features
    else:

        # Set the various categories
        if key == 'Pclass':
            values = np.arange(1, 4)
        elif key in ('Parch', 'SibSp'):
            # Range taken from the unfiltered data, so the categories do not
            # depend on which filters were applied
            values = np.arange(0, np.max(data[key]) + 1)
        elif key == 'Embarked':
            values = ['C', 'Q', 'S']
        elif key == 'Sex':
            values = ['male', 'female']
        else:
            # Any other feature: use the distinct values actually present
            values = list(pd.unique(all_data[key].dropna()))

        # Count survivors and non-survivors in each category
        survived = all_data['Survived'] == 1
        not_survived = all_data['Survived'] == 0
        surv_counts = [int((survived & (all_data[key] == value)).sum())
                       for value in values]
        nonsurv_counts = [int((not_survived & (all_data[key] == value)).sum())
                          for value in values]

        # Set the width of each bar
        bar_width = 0.4
        positions = np.arange(len(values))

        # Display each category's counts: non-survivors one bar-width to the
        # left of the survivors
        nonsurv_bar = plt.bar(positions - bar_width, nonsurv_counts,
                              width=bar_width, color='r')
        surv_bar = plt.bar(positions, surv_counts,
                           width=bar_width, color='g')

        plt.xticks(positions, values)
        if len(positions):
            plt.legend((nonsurv_bar[0], surv_bar[0]),
                       ('Did not survive', 'Survived'), framealpha=0.8)

    # Common attributes for plot formatting
    plt.xlabel(key)
    plt.ylabel('Number of Passengers')
    plt.title('Passenger Survival Statistics With \'%s\' Feature'%(key))
    plt.show()

    # 6. Report number of passengers with missing values
    if len(nan_outcomes):
        print("Passengers with missing '{}' values: {} ({} survived, {} did not survive)".format(
              key, len(nan_outcomes),
              int((nan_outcomes == 1).sum()), int((nan_outcomes == 0).sum())))