In [1]:
import time
import os
import pandas as pd
import numpy as np
from numpy import linalg as LA
from datetime import datetime
from datetime import timedelta
import geopy.distance

In [2]:
class TargetRoadFileMaker:
    """make csv file contains GPS only in the given road boundaries.
    
    Parameters:
    ----------
    road_boundaries: the numpy array of 4 points which make the boundaries of oen road segment.
    file_name: the name of csv file which contains all GPS data of Didi drivers in Xi'an city.
    data: a datafame version of imported GPS csv file.
    data_in_road_boundaries: a dataframe with GPS point only in given road boundaries.
    
    """
    def __init__(self, file_name="100_drivers_data.csv"):
        self.road_boundaries = np.array([[108.94765628015638, 34.23940650580562],
                                         [108.94615212765522, 34.23940384395769],
                                         [108.94615156073496, 34.2324012260476],
                                         [108.94765571288106, 34.232403887889106],
                                         ])
        self.file_name = file_name
        self.data = self.import_csv_into_me()
        self.data_in_road_boundaries = None
        self.get_the_dataframe_convering_only_target_road_segment()
        self.create_column_for_is_from_south()

    def change_road_boundaries(self, new_road_boundaries):
        """change the GPS of road boundaries if the speed for other road segments is going to be investigated.
        
        Parameters
        ----------
        new_road_boundaries: the numpy array of 4 points which make the boundaries of oen road segment.
        
        Returns
        ----------
        self.road_boundaries: the changed variable road_boundaries of class TargetRoadFileMaker.
        
        """
        self.road_boundaries = np.array(new_road_boundaries)
        return self.road_boundaries

    def import_csv_into_me(self):
        """import csv file of given day as pandas dataframe.
        
        Returns
        ---------- 
        data: the imported dataframe.
        
        """
        columns_names = ['Driver_ID', 'Order_ID', 'Timestamp', 'Longitude', 'Latitude']
        data = pd.read_csv(self.file_name, header=None, names=columns_names)
        return data

    def get_coordinate_array_from_dataframe_row(self, row):
        """make a numpy array of coordinates from row of dataframe.
        
        Parameters
        ----------
        row: a row of dataframe
        
        Returns
        ----------
        coordinate: the numpy array of coordinate.
        
        """
        coordinate = []
        coordinate.append(row['Longitude'])
        coordinate.append(row['Latitude'])
        coordinate = np.array(coordinate)
        return coordinate

    def is_point_in_the_target_road_segment(self, row):
        """Determine if one coordinate is within the boundary of target raod segment.

        Parameters
        ----------
        row: a row of dataframe contains coordinate information.

        Returns
        ----------
        result: type Boolean, indicating whether given point is in the target road segment.
        """
        coordinate = self.get_coordinate_array_from_dataframe_row(row)
        point_p = coordinate
        point_a = self.road_boundaries[0]
        point_b = self.road_boundaries[1]
        point_c = self.road_boundaries[2]
        vector_ab = np.array([point_a[0] - point_b[0], point_a[1] - point_b[1]])
        vector_ap = np.array([point_a[0] - point_p[0], point_a[1] - point_p[1]])
        vector_bc = np.array([point_b[0] - point_c[0], point_b[1] - point_c[1]])
        vector_bp = np.array([point_b[0] - point_p[0], point_b[1] - point_p[1]])
        dot_mul_ab_ap = np.dot(vector_ab, vector_ap)
        dot_mul_ab_ab = np.dot(vector_ab, vector_ab)
        dot_mul_bc_bp = np.dot(vector_bc, vector_bp)
        dot_mul_bc_bc = np.dot(vector_bc, vector_bc)
        condition_1 = (0 <= dot_mul_ab_ap)
        condition_2 = (dot_mul_ab_ap <= dot_mul_ab_ab)
        condition_3 = (0 <= dot_mul_bc_bp)
        condition_4 = (dot_mul_bc_bp <= dot_mul_bc_bc)
        result = np.all([condition_1, condition_2, condition_3, condition_4])
        return result

    def get_the_dataframe_convering_only_target_road_segment(self):
        """Filter out the GPS point outside the target road segment over the whole network.

        Returns
        ----------
        self.data_in_road_boundaries: returned dataframe with each row's coordiantes inside the target road boundaries.
        
        """
        data = self.data
        data['Is_in_target_road_segment'] = self.data.apply(lambda row: self.is_point_in_the_target_road_segment(row),
                                                            axis=1)
        self.data_in_road_boundaries = data[data['Is_in_target_road_segment'] == True]
        return self.data_in_road_boundaries

    def convert_unix_time_to_daily_time(self, row):
        """Convert Unix time formatting to '%Y-%m-%d %H:%M:%S' format.

        Paramters
        ----------
        row: a row of dataframe contains timestamps in unix format.

        Returns
        ----------
        output: type string, with the format of '%Y-%m-%d %H:%M:%S'.
        """
        ts = int(row['Timestamp'])
        output = datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
        return output

    def create_readable_timestamp_in_df(self):
        """add a column of 'Readable_time' in class varible self.data.
        
        """
        self.data['Readable_time'] = self.data.apply(lambda row: self.convert_unix_time_to_daily_time(row), axis=1)
        return None        
        
    def is_trip_from_south(self, trip_df):
        """Determine if one car in one trip is from south.

        Parameters
        ----------
        trip_df: the dataframe contains the GPS trace of one order (a continuous sequence of GPS trace).
        
        Returns
        ----------
        type Boolean, true if the trip origins from south. False if the trip origins from north.
       
        """
        trip_df = trip_df.sort_values(by=['Timestamp'])
        if (trip_df.tail(1)['Latitude'].values - trip_df.head(1)['Latitude'].values) > 0:
            return True
        else:
            return False

    def create_column_for_is_from_south(self):
        """add a column to existing dafaframe indicating the origin of car.

        Returns
        ----------
        self.data_in_road_boundaries: the changed variable of class, data_in_road_boundaries.
        
        """
        data = self.data_in_road_boundaries
        data['Is_from_south'] = None
        for order in data['Order_ID'].unique():
            trip_data = data[data['Order_ID'] == order]
            trip_index = trip_data.index
            Is_from_south = self.is_trip_from_south(trip_data)
            data.loc[trip_index, 'Is_from_south'] = Is_from_south
        self.data_in_road_boundaries = data
        return self.data_in_road_boundaries
    
    def convert_extracted_data_to_csv(self):
        """convert the dataframe with GPS only in target road segement into csv file and output to root directory. 
        
        """
        output_file_name = self.file_name.split(os.sep)[-1].split('.')[0] + '_target_road.csv'
        self.data_in_road_boundaries.to_csv(output_file_name)
        print('the data has been written to csv!')    

In [3]:
def convert_extracted_data_to_csv(file_maker):
    output_file_name = file_maker.file_name.split(os.sep)[-1].split('.')[0] + '_target_road.csv'
    file_maker.data_in_road_boundaries.to_csv(output_file_name)
    print('the data has been written to csv!')    

In [11]:
root_path = os.path.normpath(os.getcwd() + os.sep + os.pardir)
data_path = os.path.normpath(root_path + os.sep + 'data_all')
files = os.listdir(path=data_path)
#iterate through all 30 days of GPS csv files and output GPS csv files only contains points only within target road segment.
for file in files:
    start_time = time.time()
    file_path = os.path.normpath(data_path + os.sep + file)
    file_maker_one_day = TargetRoadFileMaker(file_name=file_path)
    file_maker_one_day.convert_extracted_data_to_csv()
    print('time cost = ',(time.time()-start_time)/60, 'min.')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


the data has been written to csv!
time cost =  20.27313994963964 min.
the data has been written to csv!
time cost =  20.90843149423599 min.
the data has been written to csv!
time cost =  20.658050366242726 min.
the data has been written to csv!
time cost =  21.743646836280824 min.
the data has been written to csv!
time cost =  20.859553591410318 min.
the data has been written to csv!
time cost =  20.443823571999868 min.
the data has been written to csv!
time cost =  17.831345391273498 min.
the data has been written to csv!
time cost =  16.106909358501433 min.
the data has been written to csv!
time cost =  16.806437849998474 min.
the data has been written to csv!
time cost =  16.75595631599426 min.
the data has been written to csv!
time cost =  16.18267393906911 min.
the data has been written to csv!
time cost =  17.093819177150728 min.
the data has been written to csv!
time cost =  13.265477228164674 min.
the data has been written to csv!
time cost =  15.136206257343293 min.
the data h

In [9]:
convert_extracted_data_to_csv(file_maker_one_day)

the data has been written to csv!
