In [1]:
import pandas as pd
from pathlib import Path
from fiona.crs import from_epsg
from fiona import open as fopen
from shapely.geometry import Point, LineString
import ast

In [2]:
# Location of the raw Porto training CSV, relative to the notebook's working directory.
porto_csv_path = Path('./data/sources/porto/train.csv')
# Load the whole file into a single DataFrame; later cells filter and export it.
df = pd.read_csv(filepath_or_buffer=porto_csv_path)

In [3]:
# Keep only rows whose POLYLINE text contains more than one comma.
# NOTE(review): presumably POLYLINE is serialized as "[[x,y],[x,y],...]", so an
# empty list has 0 commas, a single point has 1, and two points have 3 --
# i.e. "> 1 comma" means "at least two points". Confirm against the CSV.
min_two_points_mask = df["POLYLINE"].str.count(",").gt(1)
point_filter_df = df.loc[min_two_points_mask]

In [4]:
# Fiona schema of the output layer: one LineString per trip plus its attributes.
# The (field, type) pairs are listed in the order the attribute columns are declared.
# The origin call / origin stand columns are not part of the exported schema.
# NOTE(review): the ESRI Shapefile (DBF) format limits field names to 10
# characters, so 'missing_data' will presumably be shortened on write --
# confirm the column name that actually ends up in the output.
schema = {
    'geometry': 'LineString',
    'properties': dict([
        ('trip_id', 'int'),
        ('call_type', 'str:80'),
        ('taxi_id', 'int'),
        ('timestamp', 'int'),
        ('day_type', 'str:80'),
        ('missing_data', 'int'),
    ]),
}

# Absolute form of the output location (resolved against the current working directory).
porto_output_path = Path('./data/generated/porto/shp').resolve()

def append_to_shp(row, shp):
    """Write one trip as a LineString feature to an open collection.

    Parameters
    ----------
    row : mapping-like
        Must support ``row[...]`` lookup for 'POLYLINE', 'TRIP_ID',
        'CALL_TYPE', 'TAXI_ID', 'TIMESTAMP', 'DAY_TYPE' and 'MISSING_DATA'.
        'POLYLINE' is a string holding a Python literal that ``LineString``
        accepts (presumably a list of coordinate pairs -- confirm).
    shp : object with a ``write(record)`` method
        Receives a single dict with 'geometry' and 'properties' keys.

    Returns
    -------
    None
    """
    # Source column -> output attribute name, in the order they are written.
    column_to_field = (
        ('TRIP_ID', 'trip_id'),
        ('CALL_TYPE', 'call_type'),
        ('TAXI_ID', 'taxi_id'),
        ('TIMESTAMP', 'timestamp'),
        ('DAY_TYPE', 'day_type'),
        ('MISSING_DATA', 'missing_data'),
    )

    # The polyline arrives as text; literal_eval turns it into Python objects
    # without executing arbitrary code.
    vertices = ast.literal_eval(row['POLYLINE'])
    geometry = {
        'type': 'LineString',
        'coordinates': LineString(vertices).coords,
    }
    properties = {field: row[column] for column, field in column_to_field}

    shp.write({'geometry': geometry, 'properties': properties})
    

In [5]:
# Create the parent folder of the output location if it is missing, so the
# notebook also runs on a fresh checkout (Restart Kernel -> Run All).
porto_output_path.parent.mkdir(parents=True, exist_ok=True)

# Write every trip that passed the two-point filter as one LineString feature
# (EPSG:4326) using the schema defined above.
with fopen(porto_output_path, 'w', driver='ESRI Shapefile', crs=from_epsg(4326), schema=schema) as shp:
    # Plain loop instead of DataFrame.apply: append_to_shp is called purely for
    # its side effect, and apply() (a) builds a throwaway Series of None and
    # (b) on pandas < 1.1 is documented to call the function twice on the first
    # row, which would write that trip twice.
    for _, trip in point_filter_df.iterrows():
        append_to_shp(trip, shp)

In [6]:
# What percentage of all trips was removed by the two-point filter?
explore_one_point_dfs = df.loc[~min_two_points_mask]
ratio = len(explore_one_point_dfs) / len(df)
f"{round(ratio * 100, 3)}%"

'2.134%'