In [219]:
from deco.imports import *
import json
from pprint import pprint
import sklearn.ensemble as st

In [16]:
# Directory prefix that is concatenated with 'data.json' when the data is loaded.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
PATH = '/home/rotem/Documents/code/optimization-course/'

In [324]:
def _precipitation_amount(volume):
    """Return the first reading of a rain/snow mapping, or 0 when there is none.

    ``volume`` is whatever is stored under 'rain' / 'snow': ``None`` (or a
    missing key, passed in as ``None``), an empty mapping, or a mapping whose
    first value is used as the amount.
    """
    if not volume:
        return 0
    return next(iter(volume.values()))


def extract_weather(weather_obj):
    """Flatten one entry of a weather payload into a flat dict of features.

    The entry used is the first item of ``weather_obj['list']`` whose country
    is 'GB' or whose latitude equals 51.5085.

    Parameters
    ----------
    weather_obj : dict
        Payload with a 'list' of entries. Every entry must carry 'sys' and
        'coord'; the selected one must also carry 'wind', 'main' and 'clouds'.
        'rain' and 'snow' are optional and may be None or an empty mapping.

    Returns
    -------
    dict
        Keys, in order: rain, snow, wind_speed, feels_like, humidity, pressure,
        temp, temp_max, temp_min, followed by one 'clouds_<key>' entry per key
        of the entry's 'clouds' mapping.

    Raises
    ------
    IndexError
        If no entry of the list matches.
    KeyError
        If a required field is missing.
    """
    correct_weather = [item for item in weather_obj['list']
                       if item['sys']['country'] == 'GB' or item['coord']['lat'] == 51.5085][0]
    main = correct_weather['main']
    weather_output = {
        # Missing, None and empty precipitation all mean "nothing measured".
        'rain': _precipitation_amount(correct_weather.get('rain')),
        'snow': _precipitation_amount(correct_weather.get('snow')),
#         'wind_deg': correct_weather['wind'].get('deg', 0),
        'wind_speed': correct_weather['wind'].get('speed', 0),
        # Fall back to the measured temperature when no feels_like is given.
        'feels_like': main.get('feels_like', main['temp']),
        'humidity': main['humidity'],
        'pressure': main['pressure'],
        'temp': main['temp'],
        'temp_max': main['temp_max'],
        'temp_min': main['temp_min'],
    }
    for cloud_key, cloud_value in correct_weather['clouds'].items():
        weather_output['clouds_' + cloud_key] = cloud_value
    return weather_output

In [325]:
def prepare_row(row_json):
    """Decode one JSON record and replace its nested weather with flat fields.

    The 'weather' entry is removed from the decoded dict, passed through
    extract_weather, and the resulting flat keys are merged back into the
    record, which is then returned.
    """
    record = json.loads(row_json)
    flat_weather = extract_weather(record.pop('weather'))
    record.update(flat_weather)
    return record

In [326]:
def load_jsons_to_df(file_name):
    """Load a file with one JSON record per line into a DataFrame.

    Each non-blank line is converted with prepare_row. Loading stops at the
    first line that fails to convert: the error and the offending line are
    printed, and the rows collected up to that point are returned.

    Parameters
    ----------
    file_name : str
        Path of the JSON-lines file.

    Returns
    -------
    pandas.DataFrame
        One row per successfully prepared record.
    """
    data = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_name, encoding='utf-8') as F:
        for row in F:
            if not row.strip():
                # A blank line holds no record; it must not abort the load and
                # drop every record that follows it.
                continue
            try:
                data.append(prepare_row(row))
            except Exception as e:
                pprint(e)
                pprint(row)
                break
    return pd.DataFrame(data)

In [458]:
# Load data.json (located under PATH) into the raw DataFrame.
df = load_jsons_to_df(PATH + 'data.json')

In [533]:
def enrich_prev_temp(df):
    """Add a 'prev_temp' column holding the temperature of the preceding reading.

    The distinct (timestamp[:13], temp) pairs of df are sorted by that
    timestamp prefix, and every pair is associated with the temp of the pair
    just before it in that ordering. Each row of df then looks up its own
    (timestamp[:13], temp); rows with no recorded predecessor get their own
    temp. The 'timestamp' column must hold strings.

    The column is added to df in place, and df is returned.
    """
    readings = df[['timestamp', 'temp']].drop_duplicates()
    readings.timestamp = readings.timestamp.str.slice(0, 13)
    readings = readings.drop_duplicates().sort_values('timestamp').reset_index(drop=True)

    # Row i of `following` is row i + 1 of `readings`, so after the join each
    # row carries a reading (suffix '_prev') next to the one that follows it.
    following = readings[readings.index > 0].reset_index(drop=True)
    paired = readings.join(following, lsuffix='_prev')

    # hour prefix -> {temp -> temp of the reading before it}
    lookup = {}
    for _, earlier_temp, hour_key, temp_value in paired.values:
        lookup.setdefault(hour_key, {})[temp_value] = earlier_temp

    def _previous(record):
        own_temp = record['temp']
        return lookup.get(record['timestamp'][:13], {}).get(own_temp, own_temp)

    df['prev_temp'] = df.apply(_previous, axis=1)
    return df

In [586]:
def enrich_rain_snow_daily_mean(df):
    """Add per-day 'rain_mean' and 'snow_mean' columns to df.

    For every (date, hour) the maximum rain and snow value is taken, and those
    hourly maxima are averaged per date. Date and hour are read from the string
    'timestamp' column as characters 0-10 and 11-13 respectively. Every row
    then receives the means of its own date.

    The columns are added to df in place, and df is returned.
    """
    # Only these three columns are needed; avoid copying the whole frame.
    x = df[['timestamp', 'rain', 'snow']].copy()
    x['hour'] = x.timestamp.str.slice(11, 13)
    x['timestamp'] = x.timestamp.str.slice(0, 10)

    measures = ['rain', 'snow']
    hourly_max = x.groupby(['timestamp', 'hour'], as_index=False)[measures].max()
    # Select the numeric columns explicitly: the string 'hour' column must not
    # reach mean(), which raises on non-numeric data in pandas >= 2.0.
    daily_mean = hourly_max.groupby(['timestamp'], as_index=False)[measures].mean()

    by_day = {
        day: (rain, snow)
        for day, rain, snow in zip(daily_mean['timestamp'], daily_mean['rain'], daily_mean['snow'])
    }
    df['rain_mean'] = df.timestamp.apply(lambda timestamp: by_day[timestamp[:10]][0])
    df['snow_mean'] = df.timestamp.apply(lambda timestamp: by_day[timestamp[:10]][1])
    return df

In [604]:
def prepare_df(df):
    """Return a model-ready copy of df; the input frame is not modified.

    Steps, applied to a copy:
      * add 'prev_temp' via enrich_prev_temp (run while timestamps are strings);
      * parse 'from_time' and 'timestamp' into datetimes;
      * add 'sample_diff', the number of seconds from 'from_time' to 'timestamp';
      * add 'from_time_dayofweek', 'from_time_hour' and 'from_time_minute';
      * drop 'origin', 'destination', 'to_time', 'timestamp' and 'from_time'.
    """
    frame = enrich_prev_temp(df.copy())
    frame.from_time = pd.to_datetime(frame.from_time)
    frame.timestamp = pd.to_datetime(frame.timestamp)
    frame['sample_diff'] = (frame.timestamp - frame.from_time).dt.total_seconds()

    for part in ('dayofweek', 'hour', 'minute'):
        frame[f'from_time_{part}'] = getattr(frame.from_time.dt, part)

    excluded = ('origin', 'destination', 'to_time', 'timestamp', 'from_time')
    kept = [column for column in frame.columns if column not in excluded]
    return frame[kept]

In [605]:
# Build the model-ready frame from the raw rows and display it.
res = prepare_df(df)
res

Unnamed: 0,duration,rain,snow,wind_speed,feels_like,humidity,pressure,temp,temp_max,temp_min,clouds_all,rain_mean,snow_mean,prev_temp,sample_diff,from_time_dayofweek,from_time_hour,from_time_minute
0,23,0.0,0.0,4.6,3.01,93,1020,7.13,8.33,6.0,22,0.0,0.0,7.13,828.0,0,21,23
1,23,0.0,0.0,4.6,3.01,93,1020,7.13,8.33,6.0,22,0.0,0.0,7.13,108.0,0,21,35
2,23,0.0,0.0,4.6,3.01,93,1020,7.13,8.33,6.0,22,0.0,0.0,7.13,-612.0,0,21,47
3,28,0.0,0.0,4.6,3.01,93,1020,7.13,8.33,6.0,22,0.0,0.0,7.13,108.0,0,21,35
4,23,0.0,0.0,4.6,3.01,93,1020,7.13,8.33,6.0,22,0.0,0.0,7.13,770.0,0,21,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15219,28,0.0,0.0,2.1,4.77,93,1032,7.14,8.00,6.0,75,0.0,0.0,6.89,16.0,3,13,3
15220,27,0.0,0.0,2.1,4.77,93,1032,7.14,8.00,6.0,75,0.0,0.0,6.89,557.0,3,12,59
15221,26,0.0,0.0,2.1,4.77,93,1032,7.14,8.00,6.0,75,0.0,0.0,6.89,17.0,3,13,8
15222,26,0.0,0.0,2.1,4.77,93,1032,7.14,8.00,6.0,75,0.0,0.0,6.89,-463.0,3,13,16


In [623]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and with it every score computed from it, is the same on each re-run.
train_df = res.sample(frac=0.8, random_state=42)
test_df = res[~ res.index.isin(train_df.index.values)]
# 'duration' is the target; every other column is a feature.
x_train, y_train = train_df.drop('duration', axis=1), train_df.duration
x_test, y_test = test_df.drop('duration', axis=1), test_df.duration

In [564]:
def l1_diff(x, y):
    """Return the mean of the absolute element-wise difference between x and y."""
    deviation = x - y
    return abs(deviation).mean()

In [565]:
def print_score(m):
    """Print the train/test metrics of the fitted model m as one list.

    The list holds, in order: l1_diff on the training set, l1_diff on the test
    set, m.score on the training set, m.score on the test set and, when the
    model has one, its oob_score_. The data comes from the notebook-level
    x_train / y_train / x_test / y_test variables.
    """
    splits = ((x_train, y_train), (x_test, y_test))
    scores = [l1_diff(m.predict(features), target) for features, target in splits]
    scores.extend(m.score(features, target) for features, target in splits)
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

In [624]:
# Fixed random_state so that repeated fits on the same data give the same
# forest and therefore the same scores.
m = st.RandomForestRegressor(n_jobs=-1, n_estimators=40, max_features=0.8, min_samples_leaf=3,
                             random_state=42)
m.fit(x_train, y_train)
print_score(m)

[1.0229741034078716, 1.4653421202876282, 0.907452511029557, 0.8216984039460532]


In [576]:
# Display the raw frame.
# NOTE(review): the rain_mean / snow_mean columns in the output are not produced
# by any call in the visible cells — presumably enrich_rain_snow_daily_mean(df),
# which adds them in place, was run in a cell that is no longer here; verify.
df

Unnamed: 0,origin,destination,from_time,to_time,duration,timestamp,rain,snow,wind_speed,feels_like,humidity,pressure,temp,temp_max,temp_min,clouds_all,rain_mean,snow_mean
0,490G000779,490G000804,2020-01-06T21:23:00,2020-01-06T21:46:00,23,2020-01-06T21:36:48,0.0,0.0,4.6,3.01,93,1020,7.13,8.33,6.0,22,0.0,0.0
1,490G000779,490G000804,2020-01-06T21:35:00,2020-01-06T21:58:00,23,2020-01-06T21:36:48,0.0,0.0,4.6,3.01,93,1020,7.13,8.33,6.0,22,0.0,0.0
2,490G000779,490G000804,2020-01-06T21:47:00,2020-01-06T22:10:00,23,2020-01-06T21:36:48,0.0,0.0,4.6,3.01,93,1020,7.13,8.33,6.0,22,0.0,0.0
3,490G000779,490G000804,2020-01-06T21:35:00,2020-01-06T22:03:00,28,2020-01-06T21:36:48,0.0,0.0,4.6,3.01,93,1020,7.13,8.33,6.0,22,0.0,0.0
4,490G000779,490G000804,2020-01-06T21:29:00,2020-01-06T21:52:00,23,2020-01-06T21:41:50,0.0,0.0,4.6,3.01,93,1020,7.13,8.33,6.0,22,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15219,490G000779,490G000804,2020-01-23T13:03:00,2020-01-23T13:31:00,28,2020-01-23T13:03:16,0.0,0.0,2.1,4.77,93,1032,7.14,8.00,6.0,75,0.0,0.0
15220,490G000779,490G000804,2020-01-23T12:59:00,2020-01-23T13:26:00,27,2020-01-23T13:08:17,0.0,0.0,2.1,4.77,93,1032,7.14,8.00,6.0,75,0.0,0.0
15221,490G000779,490G000804,2020-01-23T13:08:00,2020-01-23T13:34:00,26,2020-01-23T13:08:17,0.0,0.0,2.1,4.77,93,1032,7.14,8.00,6.0,75,0.0,0.0
15222,490G000779,490G000804,2020-01-23T13:16:00,2020-01-23T13:42:00,26,2020-01-23T13:08:17,0.0,0.0,2.1,4.77,93,1032,7.14,8.00,6.0,75,0.0,0.0
