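# This example abstracts the V0.1 analysis into functions so that more parameter combinations can be analysed simply by passing input parameters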

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
def getEstTripsCount(df,n=5):
    """Return the trailing ``n``-row sum of ``TRIPS_COUNT`` for every row.

    The window for row ``i`` covers rows ``i-n+1 .. i`` (the row itself is
    included). Rows that do not yet have ``n`` rows available get 0.

    Args:
        df: Object whose ``"TRIPS_COUNT"`` item is a sliceable sequence
            (e.g. a DataFrame column).
        n: Window length, in rows.

    Returns:
        A list with one entry per element of ``df["TRIPS_COUNT"]``.
    """
    counts = df["TRIPS_COUNT"]
    # ``end`` is the exclusive upper bound of the window (row index + 1).
    return [
        sum(counts[end - n:end]) if end >= n else 0
        for end in range(1, len(counts) + 1)
    ]

def getRealTripsCount(df,n=5):
    """Return the sum of ``TRIPS_COUNT`` over the ``n`` rows after each row.

    The window for row ``i`` covers rows ``i+1 .. i+n`` (the row itself is
    excluded). Rows with fewer than ``n`` rows after them get 0.

    Args:
        df: Object whose ``"TRIPS_COUNT"`` item is a sliceable sequence
            (e.g. a DataFrame column).
        n: Window length, in rows.

    Returns:
        A list with one entry per element of ``df["TRIPS_COUNT"]``.
    """
    counts = df["TRIPS_COUNT"]
    total = len(counts)
    # ``start`` is the first row of the look-ahead window (row index + 1);
    # ``total - start`` is how many rows remain after the current one.
    return [
        0 if total - start < n else sum(counts[start:start + n])
        for start in range(1, total + 1)
    ]

In [9]:
# Load the statistics CSV and parse FROM_TIME into datetime values.
data = pd.read_csv("from_06_01_5_minute_stat.csv")
data["FROM_TIME"] = pd.to_datetime(data["FROM_TIME"])
# Show the first parsed timestamps as a sanity check. The cell previously
# ended in an incomplete attribute access, which is a SyntaxError.
data.FROM_TIME.head()

In [10]:
def getEstAndRealTime(dataFrame, carStorageTime, disPersion):
    """
    Return the times at which the estimated and the real trip counts exceed
    the dispersion-adjusted vehicle count.

    Args:
        dataFrame: DataFrame with ``FROM_TIME``, ``TRIPS_COUNT`` and
            ``VEHICLE_COUNT`` columns. A ``VEHICLE_COUNT_dispersion`` column
            is added to (or overwritten on) this frame in place.
        carStorageTime: Window length; divided by 5 and truncated to get the
            number of rows per window (presumably minutes over 5-minute
            rows -- confirm against the data source).
        disPersion: Relative margin; the threshold for each row is
            ``VEHICLE_COUNT * (1 + disPersion)``.

    Returns:
        A tuple ``(est_times, real_times)`` of ``FROM_TIME`` Series: the rows
        where the trailing-window trip count exceeds the threshold, and the
        rows where the look-ahead-window trip count exceeds the threshold.
    """
    carStorageCount = int(carStorageTime/5)
    # Threshold column. Computed from the frame that was passed in, not from
    # a module-level global, so the function works on any input frame.
    dataFrame["VEHICLE_COUNT_dispersion"] = dataFrame["VEHICLE_COUNT"]*(1+disPersion)
    # Estimated trip count: trailing window ending at each row.
    estTripsCount = getEstTripsCount(dataFrame, n=carStorageCount)
    # Real trip count: look-ahead window starting after each row.
    realTripsCount = getRealTripsCount(dataFrame, n=carStorageCount)

    trips_count_dict = {
        "FROM_TIME":dataFrame["FROM_TIME"],
        "estTripsCount":estTripsCount,
        "realTripsCount":realTripsCount
    }
    # Attach both count columns to the input rows, matching on FROM_TIME.
    # NOTE(review): this assumes FROM_TIME values are unique; duplicates would
    # multiply rows in the merge -- confirm against the data.
    df = dataFrame.merge(pd.DataFrame(trips_count_dict), on="FROM_TIME")
    # Keep the rows where (1) the estimate exceeds the threshold and
    # (2) the real count exceeds the threshold.
    df_est_lg = df[df["estTripsCount"]>df["VEHICLE_COUNT_dispersion"]]
    df_real_lg = df[df["realTripsCount"]>df["VEHICLE_COUNT_dispersion"]]
    # Only the timestamps of the selected rows are returned.
    df_est_lg_fromtime = df_est_lg["FROM_TIME"]
    df_real_lg_fromtime = df_real_lg["FROM_TIME"]
    return df_est_lg_fromtime, df_real_lg_fromtime


In [11]:
def calcRate(df, carStorageTime, disPersion):
    """Compare the estimated exceedance times with the real ones.

    Args:
        df: Frame passed straight through to ``getEstAndRealTime``.
        carStorageTime: Window length passed to ``getEstAndRealTime``.
        disPersion: Relative margin passed to ``getEstAndRealTime``.

    Returns:
        A tuple ``(accuracy_rate, false_positive_rate)``:

        * ``accuracy_rate`` -- share of estimated times that are also real
          times.
        * ``false_positive_rate`` -- share of estimated times that are not
          real times (so the two values sum to 1).

        Both are ``nan`` when there are no estimated times at all, because
        the ratios are undefined in that case.
    """
    est_time, real_time = getEstAndRealTime(df, carStorageTime, disPersion)
    est_time = set(est_time)
    real_time = set(real_time)

    # With no estimated times the denominator is zero; report "undefined"
    # instead of raising ZeroDivisionError and aborting a parameter sweep.
    if not est_time:
        undefined = float("nan")
        return undefined, undefined

    accuracy_rate = len(est_time & real_time)/len(est_time)
    false_positive_rate = len(est_time-real_time)/len(est_time)
    return accuracy_rate, false_positive_rate

In [12]:
# Single run: carStorageTime=20 with a 0.05 dispersion margin.
accuracy_rate, false_positive_rate = calcRate(data, 20, 0.05)

In [13]:
# Print the two rates currently bound to these names.
print(accuracy_rate, false_positive_rate)

0.4276315789473684 0.5723684210526315


In [14]:
# Parameter grid: storage times 20..60 in steps of 5, and four dispersion margins.
carStorageTimes = list(range(20, 65, 5))
disPersion = [0.05, 0.1, 0.15, 0.2]

# One list per output column; entries are appended in sweep order.
result = {
    name: []
    for name in (
        "carStorageTimes",
        "disPersion",
        "accuracy_rate",
        "false_positive_rate",
    )
}

# Evaluate every (storage time, dispersion) combination.
for cst in carStorageTimes:
    for dp in disPersion:
        accuracy_rate, false_positive_rate = calcRate(data, cst, dp)
        row = {
            "carStorageTimes": cst,
            "disPersion": dp,
            "accuracy_rate": accuracy_rate,
            "false_positive_rate": false_positive_rate,
        }
        for column, value in row.items():
            result[column].append(value)

# Tabulate the sweep; the bare expression displays the frame in the notebook.
df_result = pd.DataFrame(result)
df_result

Unnamed: 0,carStorageTimes,disPersion,accuracy_rate,false_positive_rate
0,20,0.05,0.427632,0.572368
1,20,0.1,0.416667,0.583333
2,20,0.15,0.347826,0.652174
3,20,0.2,0.316832,0.683168
4,25,0.05,0.472464,0.527536
5,25,0.1,0.455172,0.544828
6,25,0.15,0.434599,0.565401
7,25,0.2,0.434343,0.565657
8,30,0.05,0.577849,0.422151
9,30,0.1,0.557407,0.442593
