# NextWave

In [1]:
import csv

def read_csv(path):
    """Read a CSV file into a list of row dicts, ignoring the first column.

    The first line of the file is used as the header. The first column of the
    header and of every row is skipped; the remaining cells are kept as the
    strings produced by ``csv.reader`` (no type conversion happens here).

    Args:
        path: Path of the CSV file to read.

    Returns:
        A list with one ``{column_name: cell_string}`` dict per data row, in
        file order. An empty file yields an empty list.
    """
    # newline='' hands newline handling to the csv module, as its documentation
    # requires; otherwise line breaks inside quoted fields get rewritten.
    with open(path, newline='') as file:
        reader = csv.reader(file)
        keys = next(reader, None)
        if keys is None:
            # Completely empty file: no header, hence no rows.
            return []
        fields = keys[1:]
        return [dict(zip(fields, row[1:])) for row in reader]

In [2]:
import pandas as pd

# Load the train and test tables with pandas, using the first CSV column as the
# index. NOTE(review): `f` is not referenced again in the visible cells and `d`
# is later rebound as a loop variable — confirm these frames are still needed.
d, f = (
    pd.read_csv('../data/data_' + split + '.csv', index_col=0)
    for split in ('train', 'test')
)


In [3]:
from itertools import groupby

def transform(data):
    """Group flat CSV rows into one trajectory record per ``hash``.

    Every input row contributes two points, its ``*_entry`` values followed by
    its ``*_exit`` values. Rows are grouped by their ``'hash'`` value even when
    rows sharing a hash are not adjacent in ``data``; groups appear in order of
    first occurrence and points keep the input row order. (Relies on dicts
    preserving insertion order, i.e. Python 3.7+.)

    Args:
        data: Iterable of dicts with the keys ``'hash'``, ``'trajectory_id'``,
            ``'time_entry'``, ``'x_entry'``, ``'y_entry'``, ``'time_exit'``,
            ``'x_exit'`` and ``'y_exit'``.

    Returns:
        A list of ``{'trajectory_id': ..., 'points': [{'t', 'x', 'y'}, ...]}``
        dicts. ``'trajectory_id'`` is the id of the last row seen for the hash.
    """
    grouped = {}
    for row in data:
        record = grouped.get(row['hash'])
        if record is None:
            record = grouped[row['hash']] = {'trajectory_id': None, 'points': []}
        # Later rows overwrite the id, so the record ends up carrying the id of
        # the final segment of the group.
        record['trajectory_id'] = row['trajectory_id']
        for q in ('entry', 'exit'):
            record['points'].append({
                't': row['time_' + q],
                'x': row['x_' + q],
                'y': row['y_' + q]
            })
    return list(grouped.values())

In [4]:
def get_data(dset):
    """Load ``../data/data_<dset>.csv`` and return it as trajectory records.

    The file is read with ``read_csv`` and the resulting rows are passed
    through ``transform``.
    """
    path = '../data/data_' + dset + '.csv'
    rows = read_csv(path)
    return transform(rows)

In [5]:
# Trajectory records for both splits, keyed by split name.
data = {split: get_data(split) for split in ('train', 'test')}

In [6]:
import json

def print_json(json_data, n_rows=5):
    """Print the first ``n_rows`` items of ``json_data`` as 4-space indented JSON."""
    preview = json_data[:n_rows]
    rendered = json.dumps(preview, indent=4)
    print(rendered)

In [7]:
# Preview the first five training trajectories (print_json's default n_rows);
# at this stage t/x/y are still the raw strings read from the CSV.
print_json(data['train'])

[
    {
        "trajectory_id": "traj_0000a8602cf2def930488dee7cdad104_1_5",
        "points": [
            {
                "t": "07:04:31",
                "x": "3751013.769405791",
                "y": "-19093980.643996242"
            },
            {
                "t": "07:08:32",
                "x": "3750325.814952688",
                "y": "-19136339.920968838"
            },
            {
                "t": "07:20:34",
                "x": "3743937.1893760636",
                "y": "-19322467.547328167"
            },
            {
                "t": "07:25:42",
                "x": "3744974.687030255",
                "y": "-19319663.69860901"
            },
            {
                "t": "07:53:32",
                "x": "3744867.820319093",
                "y": "-19293564.923513815"
            },
            {
                "t": "08:03:25",
                "x": "3744815.500158423",
                "y": "-19292841.1535587"
            },
            {
        

In [8]:
# Same preview for the test split, again showing the raw CSV strings.
print_json(data['test'])

[
    {
        "trajectory_id": "traj_00032f51796fd5437b238e3a9823d13d_31_5",
        "points": [
            {
                "t": "11:43:17",
                "x": "3773413.4773432137",
                "y": "-19098280.892543625"
            },
            {
                "t": "11:50:17",
                "x": "3773110.688328254",
                "y": "-19145078.966683947"
            },
            {
                "t": "12:21:37",
                "x": "3773198.630725983",
                "y": "-19143541.1976365"
            },
            {
                "t": "12:21:37",
                "x": "3773198.630725983",
                "y": "-19143541.1976365"
            },
            {
                "t": "12:34:27",
                "x": "3763759.851101621",
                "y": "-19213415.019378442"
            },
            {
                "t": "13:14:11",
                "x": "3771757.0433202093",
                "y": "-19110919.811305605"
            },
            {
       

In [9]:
# Axis-aligned bounding box of the "center" region, given as (min, max) per
# axis. The bounds are on the same scale as the raw x/y values and get passed
# through normalize['x'] / normalize['y'] in a later cell.
center = {
    axis: {'min': low, 'max': high}
    for axis, (low, high) in (
        ('x', (3750901.5068, 3770901.5068)),
        ('y', (-19268905.6133, -19208905.6133)),
    )
}

In [10]:
def time_to_hour(time):
    """Convert an ``'HH:MM:SS'`` string into a fractional number of hours.

    Only the first three colon-separated fields are read.
    """
    parts = time.split(':')
    hours, minutes, seconds = int(parts[0]), int(parts[1]), int(parts[2])
    return hours + minutes / 60 + seconds / 3600


In [11]:
class Normalization:
    """Affine rescaling ``(x - shift) / scale`` fitted on one point attribute.

    The statistics are computed over every point of every trajectory in
    ``data``:

    * ``method='avg'``: ``shift`` is the mean and ``scale`` the sample standard
      deviation (divisor ``n - 1``).
    * ``method='minmax'``: ``shift`` is the minimum and ``scale`` the range
      ``max - min``.

    Args:
        data: Iterable of trajectory dicts, each with a ``'points'`` list.
        arg: Key of the point attribute to fit on. ``'t'`` values are parsed
            with ``time_to_hour``; any other attribute is parsed with ``float``.
        method: ``'avg'`` (default) or ``'minmax'``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """

    def __init__(self, data, arg, method='avg'):
        # Fail fast: an unsupported method used to produce an object without
        # shift/scale that only blew up later, when it was first called.
        if method not in ('avg', 'minmax'):
            raise ValueError(
                "unknown normalization method %r (expected 'avg' or 'minmax')" % (method,)
            )

        if arg == 't':
            values = [time_to_hour(point[arg]) for traj in data for point in traj['points']]
        else:
            values = [float(point[arg]) for traj in data for point in traj['points']]

        if method == 'avg':
            self.shift = sum(values) / len(values)
            self.scale = (sum((v - self.shift) ** 2 for v in values) / (len(values) - 1)) ** 0.5
        else:  # 'minmax'
            self.shift = min(values)
            self.scale = max(values) - self.shift

    def __call__(self, x):
        """Return ``x`` rescaled with the fitted ``shift`` and ``scale``."""
        return (x - self.shift) / self.scale

In [12]:
normalize = {
    't': Normalization(data['train'], arg='t', method='minmax'),
    'x': Normalization(data['train'], arg='x', method='avg'),
    'y': Normalization(data['train'], arg='y', method='avg')
}

In [13]:
def _normalize_and_split(trajectories):
    """Normalize every point in place and split off the last point as the label.

    For each trajectory the ``'t'`` string is converted with ``time_to_hour``
    and rescaled with ``normalize['t']``; ``'x'`` and ``'y'`` are rescaled with
    their own normalizers. Afterwards the final point is moved out of
    ``'points'`` into ``'label'``.
    """
    for trajectory in trajectories:
        for point in trajectory['points']:
            point['t'] = normalize['t'](time_to_hour(point['t']))
            for axis in ('x', 'y'):
                # Falsy coordinates (e.g. an empty CSV cell) are left untouched.
                if point[axis]:
                    point[axis] = normalize[axis](float(point[axis]))
        trajectory['points'], trajectory['label'] = (
            trajectory['points'][:-1],
            trajectory['points'][-1],
        )


# NOTE: this cell is not idempotent. It mutates `data` in place (a second run
# would hand already-converted floats to time_to_hour) and it rescales `center`
# from its own previous value. Re-run the notebook from the top instead.
for split in ('train', 'test'):
    _normalize_and_split(data[split])

# Express the center bounding box in the same normalized coordinates as the points.
center = {
    axis: {bound: normalize[axis](value) for bound, value in bounds.items()}
    for axis, bounds in center.items()
}

In [14]:
# Inspect the training trajectories again after the normalization cell:
# t, x and y are now floats.
print_json(data['train'])

[
    {
        "trajectory_id": "traj_0000a8602cf2def930488dee7cdad104_1_5",
        "points": [
            {
                "t": 0.44220486111111107,
                "x": -1.0520309207124239,
                "y": 1.6858486420122032
            },
            {
                "t": 0.4463888888888889,
                "x": -1.1289877050257355,
                "y": 1.1265862607439676
            },
            {
                "t": 0.4589236111111111,
                "x": -1.843639784594424,
                "y": -1.3308253613870333
            },
            {
                "t": 0.46427083333333335,
                "x": -1.7275819804200274,
                "y": -1.2938066201935454
            },
            {
                "t": 0.49326388888888884,
                "x": -1.7395364323522329,
                "y": -0.9492288995359056
            },
            {
                "t": 0.5035590277777778,
                "x": -1.7453891327770017,
                "y": -0.9396730870311486

In [None]:
# Same check for the test split after normalization.
print_json(data['test'])

[
    {
        "trajectory_id": "traj_00032f51796fd5437b238e3a9823d13d_31_5",
        "points": [
            {
                "t": 0.7325868055555556,
                "x": 1.453672014482781,
                "y": 1.62907318656214
            },
            {
                "t": 0.7398784722222222,
                "x": 1.419801067341463,
                "y": 1.0112061158041596
            },
            {
                "t": 0.7725173611111111,
                "x": 1.4296385850775575,
                "y": 1.031509020876688
            },
            {
                "t": 0.7725173611111111,
                "x": 1.4296385850775575,
                "y": 1.031509020876688
            },
            {
                "t": 0.7858854166666667,
                "x": 0.37378652327849365,
                "y": 0.10897675100361369
            },
            {
                "t": 0.8272743055555555,
                "x": 1.268378009534379,
                "y": 1.4622036762998767
            },


In [None]:
def in_center(x, y):
    """Return 1 if ``(x, y)`` lies strictly inside the ``center`` box, else 0.

    The bounds are exclusive, and ``y`` is only examined when ``x`` is already
    within its bounds.
    """
    if not (center['x']['min'] < x < center['x']['max']):
        return 0
    if not (center['y']['min'] < y < center['y']['max']):
        return 0
    return 1

# Spacing between consecutive look-back snapshots on the normalized time axis.
# NOTE(review): presumably one hour, i.e. 1/16 of a 16-hour time range — confirm
# against normalize['t'].scale.
HOUR_STEP = 0.0625
# Look-back horizons; stored on each trajectory under the keys '1h' ... '10h'.
LOOKBACK_HOURS = range(1, 11)


def _add_features(trajectories, label_inc=False):
    """Annotate trajectories in place with per-point and summary features.

    Per point:
        * ``'v'``: distance to the previous point divided by the elapsed time
          (both in normalized units); 0 for the first point or when the
          timestamp equals the previous one.
        * ``'td'``: label time minus point time.
        * ``'inc'``: ``in_center`` flag of the point.

    Per trajectory:
        * ``'avg_t'``, ``'avg_x'``, ``'avg_y'``: means over all points.
        * ``'avg_v'``: mean velocity over all points but the first (0 if there
          are none).
        * ``'max_t'``, ``'min_t'``, ``'max_v'``, ``'min_v'``: the whole point
          dict attaining the extreme (``max_v``/``min_v`` skip the first point
          and fall back to ``0`` when no other point exists).
        * ``'1h'`` ... ``'10h'``: the point whose time is closest to
          ``label time - k * HOUR_STEP``.

    Args:
        trajectories: List of trajectory dicts with ``'points'`` and ``'label'``.
        label_inc: When true, also store the ``in_center`` flag of the label
            point under ``label['inc']``.
    """
    for d in trajectories:
        points = d['points']
        label_t = d['label']['t']

        # Average velocity since the previous update.
        q = None
        for p in points:
            if q is not None and q['t'] != p['t']:
                p['v'] = ((p['x'] - q['x']) ** 2 + (p['y'] - q['y']) ** 2) ** 0.5 / (p['t'] - q['t'])
            else:
                p['v'] = 0
            q = p
            p['td'] = label_t - p['t']
            p['inc'] = in_center(p['x'], p['y'])

        # Statistics (key insertion order is kept, as it fixes the column order
        # of the flattened frame built later).
        tail = points[1:]
        d['avg_t'] = sum(t['t'] for t in points) / len(points)
        d['avg_x'] = sum(t['x'] for t in points) / len(points)
        d['avg_y'] = sum(t['y'] for t in points) / len(points)
        d['avg_v'] = sum(t['v'] for t in tail) / len(tail) if tail else 0
        d['max_t'] = max(points, key=lambda x: x['t'])
        d['min_t'] = min(points, key=lambda x: x['t'])
        d['max_v'] = max(tail, key=lambda x: x['v'], default=0)
        d['min_v'] = min(tail, key=lambda x: x['v'], default=0)

        # Snapshot of the point nearest to k steps before the label time.
        for hours in LOOKBACK_HOURS:
            target_t = label_t - hours * HOUR_STEP
            d[str(hours) + 'h'] = min(points, key=lambda x: abs(x['t'] - target_t))

        if label_inc:
            d['label']['inc'] = in_center(d['label']['x'], d['label']['y'])


# Only the training labels get an 'inc' flag; the test split is processed
# without one, exactly as before.
_add_features(data['train'], label_inc=True)
_add_features(data['test'])
    

In [None]:
# Inspect the training trajectories after feature extraction: each point now
# also carries 'v', 'td' and 'inc'.
print_json(data['train'])

[
    {
        "trajectory_id": "traj_0000a8602cf2def930488dee7cdad104_1_5",
        "points": [
            {
                "t": 0.44220486111111107,
                "x": -1.0520309207124239,
                "y": 1.6858486420122032,
                "v": 0,
                "td": 0.5146180555555557,
                "inc": 0
            },
            {
                "t": 0.4463888888888889,
                "x": -1.1289877050257355,
                "y": 1.1265862607439676,
                "v": 134.92557062720348,
                "td": 0.5104340277777779,
                "inc": 0
            },
            {
                "t": 0.4589236111111111,
                "x": -1.843639784594424,
                "y": -1.3308253613870333,
                "v": 204.17034242116523,
                "td": 0.49789930555555567,
                "inc": 0
            },
            {
                "t": 0.46427083333333335,
                "x": -1.7275819804200274,
                "y": -1.293806620193

In [None]:
# Same check for the test split after feature extraction.
print_json(data['test'])

[
    {
        "trajectory_id": "traj_00032f51796fd5437b238e3a9823d13d_31_5",
        "points": [
            {
                "t": 0.7325868055555556,
                "x": 1.453672014482781,
                "y": 1.62907318656214,
                "v": 0,
                "td": 0.2158854166666666,
                "inc": 0
            },
            {
                "t": 0.7398784722222222,
                "x": 1.419801067341463,
                "y": 1.0112061158041596,
                "v": 84.86328172424379,
                "td": 0.2085937499999999,
                "inc": 0
            },
            {
                "t": 0.7725173611111111,
                "x": 1.4296385850775575,
                "y": 1.031509020876688,
                "v": 0.6912211243436129,
                "td": 0.17595486111111103,
                "inc": 0
            },
            {
                "t": 0.7725173611111111,
                "x": 1.4296385850775575,
                "y": 1.031509020876688,
       

In [None]:
import pandas as pd

# Flatten the nested trajectory dicts into one DataFrame per split; nested keys
# become dotted column names such as 'max_v.v' or 'label.t'.
# pd.io.json.json_normalize is a deprecated alias (since pandas 1.0) of the
# top-level pd.json_normalize, so prefer the latter and only fall back to the
# old location on pandas versions that predate it.
_json_normalize = getattr(pd, 'json_normalize', None)
if _json_normalize is None:
    _json_normalize = pd.io.json.json_normalize

df = {split: _json_normalize(data[split]) for split in ('train', 'test')}

In [None]:
def in_center(x, y):
    """Element-wise test of whether ``(x, y)`` lies strictly inside ``center``.

    Combines the four bound checks with ``&`` rather than ``and``, so the
    result is whatever ``&`` yields for the operands. Note that this rebinds
    the name of the scalar ``in_center`` defined in an earlier cell.
    """
    x_bounds, y_bounds = center['x'], center['y']
    within_x = (x_bounds['min'] < x) & (x < x_bounds['max'])
    within_y = (y_bounds['min'] < y) & (y < y_bounds['max'])
    return within_x & within_y

In [None]:
# Build the final frames: training rows are kept only when their 'max_v.v' is
# below the 96th percentile of that column; remaining NaNs become 0 in both splits.
# NOTE(review): rows whose 'max_v.v' is NaN fail the `<` comparison and are
# dropped from the training set as well — confirm that is intended.
train_frame = df['train']
speed_cutoff = train_frame['max_v.v'].quantile(0.96)
below_cutoff = train_frame['max_v.v'] < speed_cutoff

final = {
    'train': train_frame[below_cutoff].fillna(0),
    'test': df['test'].fillna(0)
}

In [None]:
# Remove every 'label...' column whose 7th character is neither 't' nor 'i'
# (so e.g. 'label.t' and 'label.inc' survive), for both splits.
for split in ('train', 'test'):
    unwanted = [
        name for name in final[split].columns
        if name[:5] == 'label' and name[6] != 't' and name[6] != 'i'
    ]
    final[split] = final[split].drop(unwanted, axis=1)

# The raw list of point dicts is dropped from the flat table as well.
for split in ('train', 'test'):
    final[split] = final[split].drop(['points'], axis=1)

In [None]:
# Largest training 'max_v.v'; used as the common divisor for both splits.
maxval = max(final['train']['max_v.v'])

# Rescale every column whose name ends in 'v' by that training maximum.
for split in ('train', 'test'):
    frame = final[split]
    speed_columns = [name for name in frame.columns if name[-1] == 'v']
    for name in speed_columns:
        frame[name] = frame[name] / maxval

In [None]:
# Write the cleaned training table to disk without the DataFrame index.
final['train'].to_csv('../results/train_clean.csv', index=False)

In [None]:
# Write the cleaned test table to disk without the DataFrame index.
final['test'].to_csv('../results/test_clean.csv', index=False)