In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import pyrosm
import json

from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings("ignore")


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
pd.set_option('display.max_columns', None)

In [3]:
train_orders = pd.read_csv('orders.csv')
train_orders

Unnamed: 0,Id,running_time,completed_time,route_distance_km,delta_time
0,-4773019581999572651,2022-01-24 18:30:21,2022-01-24 18:44:43,3.740,862.0
1,-7575630690398473489,2022-01-24 06:53:53,2022-01-24 07:06:26,3.526,753.0
2,-6264582368520213833,2022-01-24 10:00:59,2022-01-24 10:15:58,5.071,899.0
3,5964315354301636538,2022-01-24 14:28:05,2022-01-24 14:35:08,2.867,423.0
4,1372379574816145639,2022-01-24 11:57:29,2022-01-24 12:06:29,3.751,540.0
...,...,...,...,...,...
4995,7096714159023973792,2022-01-24 21:10:38,2022-01-24 21:23:53,7.397,795.0
4996,-3836026425968071806,2022-01-24 15:10:27,2022-01-24 15:20:21,1.948,594.0
4997,2926216435675216636,2022-01-24 13:57:04,2022-01-24 14:03:18,2.547,374.0
4998,-6677307480063489707,2022-01-24 08:46:13,2022-01-24 08:55:34,3.013,561.0


In [4]:
nodes = pd.read_csv('nodes_train.csv')
nodes

Unnamed: 0,Id,node_start,node_finish,distance,speed
0,-2627062893189810184,10980432,2133368107,17.414917,32.0
1,-2627062893189810184,10980433,5212387954,17.186539,26.0
2,-2627062893189810184,10980445,5221700954,28.513481,26.0
3,-2627062893189810184,10980498,10980445,154.266122,25.0
4,-2627062893189810184,10980647,1986137911,8.542824,29.0
...,...,...,...,...,...
480286,-8229597404562288405,8952439761,317189358,4.847930,23.0
480287,-8229597404562288405,8952439762,8952439761,131.325685,26.0
480288,-8229597404562288405,8952439763,8952439762,44.026544,38.0
480289,-8229597404562288405,8952439764,8952439763,0.111226,34.0


In [5]:
train_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Id                 5000 non-null   int64  
 1   running_time       5000 non-null   object 
 2   completed_time     5000 non-null   object 
 3   route_distance_km  5000 non-null   float64
 4   delta_time         5000 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 195.4+ KB


In [6]:
nodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480291 entries, 0 to 480290
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Id           480291 non-null  int64  
 1   node_start   480291 non-null  int64  
 2   node_finish  480291 non-null  int64  
 3   distance     480291 non-null  float64
 4   speed        476307 non-null  float64
dtypes: float64(2), int64(3)
memory usage: 18.3 MB


## Geodata

In [7]:
osm = pyrosm.OSM(filepath='../odessa_oblast-latest.osm.pbf')
drive_net = osm.get_network(network_type="driving", nodes=True)

geodata = drive_net[1]

AttributeError: module 'numpy' has no attribute 'float'.
`np.float` was a deprecated alias for the builtin `float`. To avoid this error in existing code, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [None]:
geodata.info()

In [None]:
drive_net[1]

In [11]:
unique_start_nodes = pd.DataFrame(nodes['node_start'].unique(), columns=['unique_start_nodes'])
unique_finish_nodes = pd.DataFrame(nodes['node_finish'].unique(), columns=['unique_finish_nodes'])

In [13]:
lats_start = unique_start_nodes.merge(geodata,
                         left_on='unique_start_nodes',
                         right_on='u')['geometry'].apply(np.array).apply(lambda x: x[0, 1])

longs_start = unique_start_nodes.merge(geodata,
                         left_on='unique_start_nodes',
                         right_on='u')['geometry'].apply(np.array).apply(lambda x: x[0, 0])

lats_finish = unique_finish_nodes.merge(geodata,
                         left_on='unique_finish_nodes',
                         right_on='v')['geometry'].apply(np.array).apply(lambda x: x[0, 1])

longs_finish = unique_finish_nodes.merge(geodata,
                         left_on='unique_finish_nodes',
                         right_on='v')['geometry'].apply(np.array).apply(lambda x: x[0, 0])

TypeError: 'LineString' object is not subscriptable

In [12]:
plt.scatter(longs_start, lats_start, s=0.005, c='red')

NameError: name 'longs_start' is not defined

In [None]:
plt.scatter(longs_finish, lats_finish, s=0.005, c='blue')

In [None]:
unique_nodes = nodes.groupby(['node_start', 'node_finish']).count().reset_index()[['node_start', 'node_finish']]
unique_nodes_geodata = unique_nodes.merge(geodata, left_on=['node_start', 'node_finish'], right_on=['u', 'v'])

In [None]:
# need to do something with data shit
unique_nodes_geodata = pd.concat((unique_nodes_geodata,
                                  unique_nodes.merge(geodata, left_on=['node_finish', 'node_start'], right_on=['u', 'v'])),
                                 axis=0).reset_index()

In [None]:
unique_nodes_geodata

In [None]:
def parse(keys, dataframe):
    for key in keys:
        dataframe[key] = None

In [None]:
unique_nodes_geodata['tags'] = unique_nodes_geodata['tags'].apply(lambda row: {} if row == None else json.loads(row))

In [None]:
unique_nodes_geodata['tags'].apply(lambda row: parse(row.keys(), unique_nodes_geodata))

In [None]:
unique_nodes_geodata

In [None]:
for i in tqdm(unique_nodes_geodata.index):
    for key in unique_nodes_geodata['tags'].loc[i].keys():
        unique_nodes_geodata[key].loc[i] = unique_nodes_geodata['tags'].loc[i][key]

In [None]:
# we need to drop some columns here...
unique_nodes_geodata

In [None]:
nodes_and_geodata = nodes.merge(unique_nodes_geodata,
                                left_on=['node_start', 'node_finish'],
                                right_on=['node_start', 'node_finish'])

In [None]:
# ...or here
nodes_and_geodata

In [None]:
nodes_and_geodata.shape

In [None]:
nodes_and_geodata.info(verbose=1, null_counts=True)

In [None]:
valid_features_count = pd.DataFrame((nodes_and_geodata.groupby(['node_start', 'node_finish']).count().reset_index() != 0).sum())

In [None]:
plt.hist(valid_features_count[0], bins=10)

In [None]:
threshold = 5000

In [None]:
drop_features = list(valid_features_count[valid_features_count[0]<threshold].index)

In [None]:
# geometry parse

In [None]:
drop_features.extend(['node_start', 'node_finish', 'index',
 'name', 'id', 'timestamp', 'version',
 'tags', 'osm_type', 'u', 'v', 'length',
 'name:en', 'name:ru', 'name:uk'])
drop_features.extend(['geometry'])

In [None]:
nodes_and_geodata.drop(drop_features, axis=1, inplace=True)

In [None]:
def fill_missing(data):
    # Oneway
    data['oneway'] = data['oneway'].fillna('no')
    
    # Lanes
    data['lanes'] = data['lanes'].apply(lambda row: int(row) if row is not None else row)
    
    cross_no = pd.crosstab(data[data['oneway'] == 'no']['lanes'], data[data['oneway'] == 'no']['highway'])
    cross_yes = pd.crosstab(data[data['oneway'] == 'yes']['lanes'], data[data['oneway'] == 'yes']['highway'])

    for col in tqdm(cross_no.columns):
        lanes = cross_no[col].argmax() + 1
        data.loc[data.lanes.isnull() & (data.highway == col) & (data.oneway == 'no'), 'lanes'] = lanes

    for col in tqdm(cross_yes.columns):
        lanes = cross_yes[col].argmax() + 1
        data.loc[data.lanes.isnull() & (data.highway == col) & (data.oneway == 'yes'), 'lanes'] = lanes
        
    data['lanes'] = data['lanes'].fillna(data['lanes'].mode()[0])
    
    data['lanes'] = data['lanes'].apply(int)
    
    # Surface
    data['surface'] = data['surface'].fillna(data['surface'].mode()[0])
    
    # Speed
    data['speed'] = data['speed'].fillna(data['speed'].median())
        
    return data

In [None]:
nodes_and_geodata = fill_missing(nodes_and_geodata)

In [None]:
onehot = ['lanes', 'highway', 'oneway', 'surface']

In [None]:
nodes_and_geodata = pd.get_dummies(nodes_and_geodata, columns=onehot)

In [None]:
train_orders = train_orders.merge(nodes_and_geodata[['Id', 'speed']].groupby('Id').mean(),
                                  left_on="Id",
                                  right_on="Id", how='left')

In [12]:
sum_nodes_and_geodata = nodes_and_geodata.drop('speed', axis=1).groupby('Id').sum()

NameError: name 'nodes_and_geodata' is not defined

In [13]:
train_orders = train_orders.merge(sum_nodes_and_geodata, left_on="Id", right_on="Id", how='left')

NameError: name 'sum_nodes_and_geodata' is not defined

In [14]:
train_orders.fillna(0, inplace=True)

In [15]:
train_orders

Unnamed: 0,Id,running_time,completed_time,route_distance_km,delta_time
0,-4773019581999572651,2022-01-24 18:30:21,2022-01-24 18:44:43,3.740,862.0
1,-7575630690398473489,2022-01-24 06:53:53,2022-01-24 07:06:26,3.526,753.0
2,-6264582368520213833,2022-01-24 10:00:59,2022-01-24 10:15:58,5.071,899.0
3,5964315354301636538,2022-01-24 14:28:05,2022-01-24 14:35:08,2.867,423.0
4,1372379574816145639,2022-01-24 11:57:29,2022-01-24 12:06:29,3.751,540.0
...,...,...,...,...,...
4995,7096714159023973792,2022-01-24 21:10:38,2022-01-24 21:23:53,7.397,795.0
4996,-3836026425968071806,2022-01-24 15:10:27,2022-01-24 15:20:21,1.948,594.0
4997,2926216435675216636,2022-01-24 13:57:04,2022-01-24 14:03:18,2.547,374.0
4998,-6677307480063489707,2022-01-24 08:46:13,2022-01-24 08:55:34,3.013,561.0


In [16]:
X = train_orders.drop(['delta_time', 'Id', 'running_time', 'completed_time'], axis=1)
y = train_orders['delta_time']

In [17]:
split = 0.2
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=split,
                                                    shuffle=True,
                                                    random_state=69)

In [18]:
#transformer = Pipeline(steps=[('scaler', StandardScaler())])
#X_train = transformer.fit_transform(X_train)
#X_test = transformer.transform(X_test)

In [None]:
boost_grid = {'n_estimators': [100, 150, 200, 500, 1000],
        'max_depth': [2, 4, 8, 12],
        'learning_rate': [0.05, 0.1, 0.2, 0.25]}

regressor = XGBRegressor(tree_method='hist', random_state=69)

model = GridSearchCV(estimator=regressor, param_grid=boost_grid, n_jobs=-1, cv=None)

In [None]:
model.fit(X_train, y_train)

In [None]:
predictions = model.predict(X_test)
print(mean_absolute_error(predictions, y_test))
print(np.sqrt(mean_squared_error(predictions, y_test)))

In [None]:
print(model.best_params_)

In [None]:
RF_grid = {'n_estimators': [50, 100, 150, 200, 250, 300],
        'max_depth': [4, 6, 8, 10, 12]}

regressor = RandomForestRegressor()
model = GridSearchCV(estimator=regressor, param_grid=RF_grid, cv=None)

In [None]:
model.fit(X_train, y_train)

In [None]:
regressor = LGBMRegressor(random_state=69)

model = GridSearchCV(estimator=regressor, param_grid=boost_grid, n_jobs=-1, cv=None)

In [None]:
X_train.rename

In [None]:
model.fit(X_train, y_train)

In [None]:
predictions = model.predict(X_test)
print(mean_absolute_error(predictions, y_test))
print(np.sqrt(mean_squared_error(predictions, y_test)))