In [1]:
import pandas as pd
import numpy as np
import json

from datetime import datetime
import math
from sklearn.metrics import mean_squared_error

In [2]:
def pass_data(csv):
    """Load the readings CSV and add the derived date/time columns.

    The file must contain a ``datetime`` text column of the form
    ``"<date> <time>"`` and a ``device_id`` column.

    Columns added / rewritten:
      * ``date``     - text before the first space of ``datetime``
      * ``time``     - text after the first space of ``datetime``
      * ``time_fix`` - ``datetime.time`` object for the time of day
      * ``datetime`` - replaced in place by the parsed timestamp

    Parameters
    ----------
    csv : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by ``device_id``, then ``date``, then ``time``.
    """
    df = pd.read_csv(csv)

    # Keep the raw text halves: they drive the sort below and callers read
    # df['time'] as strings.
    date_and_time = df['datetime'].str.split(" ")
    df['date'] = date_and_time.apply(lambda parts: parts[0])
    df['time'] = date_and_time.apply(lambda parts: parts[1])

    # Parse the timestamp once and derive the time of day from it.  The
    # previous code parsed `time` with '%H:%M' but `datetime` with
    # '%Y-%m-%d %H:%M:%S'; no input can satisfy both strictly, so it relied
    # on lenient parsing in older pandas and raises on current releases.
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['time_fix'] = df['datetime'].dt.time

    return df.sort_values(['device_id', 'date', 'time'], ascending=True)

In [3]:
def get_time(df, time_start, time_end):
    """Return the rows whose ``time_fix`` lies in ``[time_start, time_end)``.

    The lower bound is inclusive and the upper bound exclusive.
    """
    not_before_start = df['time_fix'] >= time_start
    before_end = df['time_fix'] < time_end
    return df.loc[not_before_start & before_end]

In [4]:
def x_y_split(input_SiteEngName, df):
    """Split ``df`` by ``device_id`` into (all other devices, the given device).

    Returns a tuple ``(df_x, df_y)`` where ``df_y`` holds the rows whose
    ``device_id`` equals ``input_SiteEngName`` and ``df_x`` holds the rest.
    """
    is_target = df['device_id'] == input_SiteEngName
    return df.loc[~is_target], df.loc[is_target]

In [5]:
def flter_distance(df_y, df_x, r):
    """Keep the rows of ``df_x`` inside a square of half-width ``r`` around ``df_y``.

    The centre is the first ``lon`` / ``lat`` value found in ``df_y``.  A row
    is kept when both its ``lon`` and ``lat`` are strictly within ``r`` of the
    centre, so this is a bounding-box test rather than a circular one.
    """
    lon_center = df_y['lon'].unique()[0]
    lat_center = df_y['lat'].unique()[0]

    lon = df_x['lon']
    lat = df_x['lat']
    below_upper = (lon < lon_center + r) & (lat < lat_center + r)
    above_lower = (lon > lon_center - r) & (lat > lat_center - r)

    return df_x.loc[below_upper & above_lower]

In [6]:
def data_to_column(df):
    """Collect the ``s_d0`` readings of every device into a list of arrays.

    Devices appear in the order their ``device_id`` first occurs in ``df``;
    each entry is the array of that device's ``s_d0`` values in row order.
    """
    device_ids = df['device_id'].unique().tolist()
    return [df.loc[df['device_id'] == device]['s_d0'].values
            for device in device_ids]

In [7]:
def my_model(mat_PM25_x, mat_PM25_y):
    """Estimate the target series as the per-step mean of the neighbour series.

    Only neighbour series whose length equals that of the target series
    (``mat_PM25_y[0]``) take part in the mean.

    Returns ``(y_pred, y_true)``: the list of estimated values and the list
    of target values, one entry per position in the target series.
    """
    target = mat_PM25_y[0]
    expected_len = len(target)

    y_pred, y_true = [], []
    for step, observed in enumerate(target):
        # readings at this step from every neighbour with a full-length series
        readings = [series[step] for series in mat_PM25_x
                    if len(series) == expected_len]
        y_pred.append(np.mean(np.array(readings)))
        y_true.append(observed)

    return y_pred, y_true

In [8]:
def test_r_model(df_x, df_y, r_list):
    """Return the radius from ``r_list`` whose neighbour-mean estimate has the lowest MSE.

    For every radius the neighbours in ``df_x`` are restricted with
    ``flter_distance``, both frames are turned into per-device series with
    ``data_to_column``, ``my_model`` produces the estimate, and the mean
    squared error against the target series is recorded.  On ties the first
    radius with the minimum error wins.
    """
    errors = []
    for radius in r_list:
        nearby_x = flter_distance(df_y, df_x, radius)

        # per-device reading series for the neighbours and for the target
        series_x = data_to_column(nearby_x)
        series_y = data_to_column(df_y)

        predicted, observed = my_model(series_x, series_y)
        errors.append(mean_squared_error(observed, predicted))

    best_position = errors.index(min(errors))
    return r_list[best_position]

In [9]:
def get_distance(data1, data2):
    """Return the Euclidean distance between two coordinate sequences.

    Coordinates are paired positionally; extra trailing coordinates of the
    longer sequence are ignored.
    """
    squared_total = sum(pow(p - q, 2) for p, q in zip(data1, data2))
    return math.sqrt(squared_total)

In [None]:
def main(csv_path='passdata_airbox.csv', device_id='74DA38AF47E6', output_path='data.txt'):
    """Find the best neighbour radius per 20-minute window and dump it as JSON.

    For every 20-minute time-of-day window between the earliest and latest
    ``time`` in the data, the readings are split into the target device and
    all other devices, a range of candidate radii is evaluated with
    ``test_r_model``, and the winning radius (multiplied by 96, which the
    original analysis used to express a lon/lat radius in km) is stored under
    the window's start time.

    Windows with no rows for the target device, or no rows for any other
    device, are skipped instead of aborting the whole run.

    Parameters
    ----------
    csv_path : str
        CSV file handed to ``pass_data``.
    device_id : str
        ``device_id`` of the device whose readings are being estimated.
    output_path : str
        File that receives the ``{"HH:MM:SS": radius_km}`` JSON object.
    """
    km_per_degree = 96        # scale applied to the winning radius before saving
    radius_margin = 0.0001    # start just beyond the nearest neighbour
    max_radius = 0.1          # upper bound (exclusive) of the radius sweep
    radius_step = 0.002

    # load the data
    df = pass_data(csv_path)

    # print(...) with a single argument behaves the same on Python 2 and 3
    print('input_device_id: {}'.format(device_id))

    # Window edges as datetime.time objects.  Named `edges` rather than
    # `range` so the builtin is not shadowed.
    edges = pd.date_range(min(df['time']), max(df['time']), freq='20min').time

    best_radius_by_start = {}
    for start, end in zip(edges[:-1], edges[1:]):
        window_df = get_time(df, start, end)

        # split x (other devices) and y (target device)
        df_x, df_y = x_y_split(device_id, window_df)
        if df_x.empty or df_y.empty:
            # Nothing to compare in this window; the lookups below would
            # fail on an empty frame.
            continue

        # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
        lon_lat_x = df_x[['lon', 'lat']].drop_duplicates().values
        lon_lat_y = df_y[['lon', 'lat']].drop_duplicates().values
        distances = [get_distance(x, lon_lat_y[0]) for x in lon_lat_x]

        r_list = np.arange(min(distances) + radius_margin, max_radius, radius_step)
        best_r = test_r_model(df_x, df_y, r_list)

        # Key by the window start directly so a skipped window can never
        # shift results onto the wrong time slot.
        best_radius_by_start[str(start)] = best_r * km_per_degree

    with open(output_path, 'w') as outfile:
        json.dump(best_radius_by_start, outfile)
    
    
 
# Entry point: run the analysis when this cell/script is executed directly
# (not when the definitions above are imported from another module).
if __name__ == "__main__":
    main()

input_device_id: 74DA38AF47E6
