In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter

from haversine import haversine # too slow
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

from catboost import CatBoostRegressor
from catboost import cv
from catboost import Pool

### Загрузка и объединение трейна и валидации

In [None]:
# List the directory two levels up, which holds the data/ folder read below
!ls ../../

In [None]:
# Read the raw training and validation splits
train_path = '../../data/train.csv'
val_path = '../../data/validation.csv'
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)

In [None]:
# Stack the validation rows under the training rows. Row labels are kept as-is
# (no ignore_index), so index values repeat across the two parts.
frames_to_stack = [train_df, val_df]
train_df = pd.concat(frames_to_stack, axis=0)

In [None]:
# Preview the first rows of the combined frame
train_df.head(n=5)

In [None]:
# Locality id -> city name
city_dict = dict([
    (338, "Краснодар"),
    (22394, "Тольятти"),
    (22402, "Уфа"),
    (22406, "Екатеринбург"),
])

# Strict lookup: a locality id that is missing from city_dict raises KeyError
train_df['city'] = train_df['main_id_locality'].apply(city_dict.__getitem__)

In [None]:
# Raw scatter of the latitude/longitude columns; labelled so the figure can be
# read on its own, and terminated with `;` to hide the artist repr.
fig, ax = plt.subplots()
ax.scatter(train_df['latitude'], train_df['longitude'])
ax.set(xlabel='latitude', ylabel='longitude', title='latitude vs longitude (train + validation)');

In [None]:
def train_pca(df):
    """Fit a PCA on the coordinates found in ``df``.

    The (latitude, longitude) pairs and the (del_latitude, del_longitude)
    pairs are stacked into a single two-column array, so both kinds of points
    share one projection.

    Returns the fitted ``PCA`` instance.
    """
    pickup_xy = df[['latitude', 'longitude']].values
    dropoff_xy = df[['del_latitude', 'del_longitude']].values
    all_points = np.concatenate([pickup_xy, dropoff_xy], axis=0)

    return PCA().fit(all_points)

In [None]:
def clusterize(df, n_clusters=100, batch_size=10000, sample_size=500000, random_state=None):
    """Fit a MiniBatchKMeans on a random sample of the coordinates in ``df``.

    The (latitude, longitude) and (del_latitude, del_longitude) pairs are
    stacked into one two-column array, at most ``sample_size`` rows of it are
    drawn without replacement, and the clustering is fitted on that sample.

    Parameters
    ----------
    df : DataFrame with latitude, longitude, del_latitude, del_longitude columns.
    n_clusters : number of clusters to fit.
    batch_size : mini-batch size passed to MiniBatchKMeans.
    sample_size : maximum number of points used for fitting.
    random_state : optional seed. ``None`` (default) keeps the previous
        behaviour of drawing from numpy's global random state; an integer makes
        both the sampling and the clustering reproducible.

    Returns the fitted ``MiniBatchKMeans`` instance.
    """
    coords = np.vstack((df[['latitude', 'longitude']].values,
                        df[['del_latitude', 'del_longitude']].values))

    # Without a seed fall back to the module-level generator, exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sample_ind = rng.permutation(len(coords))[:sample_size]

    kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size,
                             random_state=random_state)
    return kmeans.fit(coords[sample_ind])

In [None]:
def _haversine_km(lat1, lon1, lat2, lon2, earth_radius_km=6371.0088):
    """Vectorised great-circle distance in kilometres between paired points given in degrees."""
    lat1, lon1, lat2, lon2 = (np.radians(np.asarray(a, dtype=float))
                              for a in (lat1, lon1, lat2, lon2))
    half_chord_sq = (np.sin((lat2 - lat1) / 2.0) ** 2
                     + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2)
    return 2.0 * earth_radius_km * np.arcsin(np.sqrt(half_chord_sq))


def _iso_week(timestamps):
    """ISO week number of a datetime Series, as int64 (float64 with NaN if any value is missing)."""
    week = timestamps.dt.isocalendar().week
    return week.astype('int64') if week.notna().all() else week.astype('float64')


def create_features(df, features_to_use, pca, kmeans, train=False):
    """Add time, distance, PCA and cluster features to ``df`` and select the model columns.

    The engineered columns are written into ``df`` itself (the argument is
    modified in place); the returned frame is a column selection of it.

    Parameters
    ----------
    df : DataFrame with OrderedDate, latitude, longitude, del_latitude and
        del_longitude columns (plus RTA when ``train`` is true).
    features_to_use : list of column names to return.
    pca : fitted object exposing ``transform`` on an (n, 2) coordinate array.
    kmeans : fitted object exposing ``predict`` on an (n, 2) coordinate array.
    train : when true, the ``RTA`` target column is appended to the result.

    Returns
    -------
    DataFrame with ``features_to_use`` (and ``RTA`` if ``train``).
    """
    # time features
    ordered = pd.to_datetime(df['OrderedDate'])
    df['OrderedDate_datetime'] = ordered
    df['month'] = ordered.dt.month
    df['hour'] = ordered.dt.hour
    # `.dt.weekofyear` was removed in pandas 2.0; isocalendar() (pandas >= 1.1) is its replacement
    df['week_of_year'] = _iso_week(ordered)
    df['day_of_year'] = ordered.dt.dayofyear
    df['day_of_week'] = ordered.dt.dayofweek

    # Estimators are fed plain arrays: train_pca/clusterize fit on `.values`,
    # so passing DataFrames here would only trigger feature-name warnings.
    pickup_xy = df[['latitude', 'longitude']].values
    dropoff_xy = df[['del_latitude', 'del_longitude']].values

    # geo features: vectorised instead of a per-row Python call.
    # NOTE(review): the default radius is the mean Earth radius believed to be
    # used by recent releases of the `haversine` package -- confirm if exact
    # parity with previously generated features matters.
    df['haversine'] = _haversine_km(pickup_xy[:, 0], pickup_xy[:, 1],
                                    dropoff_xy[:, 0], dropoff_xy[:, 1])

    # maneuvers (disabled experiment, kept for reference)
#     df['n_turns'] = df['step_maneuvers'].apply(lambda s: Counter(s.split('|'))['turn'])
#     df['n_left_directions'] = df['step_direction'].apply(lambda s: Counter(s.split('|'))['left'])
#     df['n_right_directions'] = df['step_direction'].apply(lambda s: Counter(s.split('|'))['right'])

    # PCA features
    pickup_pca_features = pca.transform(pickup_xy)
    df['pickup_pca0'] = pickup_pca_features[:, 0]
    df['pickup_pca1'] = pickup_pca_features[:, 1]

    dropoff_pca_features = pca.transform(dropoff_xy)
    df['dropoff_pca0'] = dropoff_pca_features[:, 0]
    df['dropoff_pca1'] = dropoff_pca_features[:, 1]

    # kmeans features
    df['pickup_cluster'] = kmeans.predict(pickup_xy)
    df['dropoff_cluster'] = kmeans.predict(dropoff_xy)

    if train:
        return df[features_to_use + ['RTA']]

    return df[features_to_use]

In [None]:
# Columns fed to the model, in the order the model receives them
features_to_use = [
    'main_id_locality',
    'ETA',
    'month',
    'hour',
    'week_of_year',
    'day_of_year',
    'day_of_week',
    'haversine',
    'pickup_pca0',
    'pickup_pca1',
    'dropoff_pca0',
    'dropoff_pca1',
    'pickup_cluster',
    'dropoff_cluster',
]

# Subset of the columns above that is treated as categorical
categorical_features = [
    'main_id_locality',
    'month',
    'hour',
    'week_of_year',
    'day_of_week',
    'pickup_cluster',
    'dropoff_cluster',
]

In [None]:
# Fit the coordinate PCA on the combined train + validation frame
pca = train_pca(df=train_df)

In [None]:
# Fit the coordinate clustering with the function's default settings
kmeans = clusterize(df=train_df)

In [None]:
# Replace the raw frame by its feature matrix (target column included).
# The name is reused, so the raw columns are gone after this cell runs.
train_df = create_features(train_df, features_to_use, pca=pca, kmeans=kmeans, train=True)
print(train_df.shape)

In [None]:
# Preview the engineered features
train_df.head(n=5)

### Добавляем фичи маршрута

In [None]:
# Route features for the training split
train_extended_path = "../../data/train_extended.csv"
train_df_extended = pd.read_csv(train_extended_path, index_col=None)

In [None]:
# (rows, columns) of the training route features
train_df_extended.shape

In [None]:
# Route features for the validation split
valid_extended_path = "../../data/valid_extended.csv"
valid_df_extended = pd.read_csv(valid_extended_path, index_col=None)

In [None]:
# Stack train and validation route features in the same order as the main frame
# (train first, then validation); row labels are kept, not renumbered.
extended_parts = (train_df_extended, valid_df_extended)
extended_route_features = pd.concat(extended_parts)

In [None]:
# (rows, columns) of the stacked route features
extended_route_features.shape

In [None]:
# Both frames were built by stacking a train part on top of a validation part
# without renumbering, so their row labels repeat. Join by row position instead
# of by label, and fail loudly if the two sides do not describe the same rows.
assert len(train_df) == len(extended_route_features), (
    f"row count mismatch: {len(train_df)} feature rows vs "
    f"{len(extended_route_features)} route-feature rows"
)
train_df = pd.concat(
    [train_df.reset_index(drop=True), extended_route_features.reset_index(drop=True)],
    axis=1,
)
train_df.shape

### Обучение

In [None]:
# Target is modelled on a log scale; the features are every other column
y = np.log(train_df['RTA'])
X = train_df.drop(columns=['RTA'])

In [None]:
# Positions of the categorical columns inside features_to_use.
# NOTE(review): these match column positions in X only while the
# features_to_use columns come first in X -- confirm if the column order changes.
categorical_features_indicies = list(map(features_to_use.index, categorical_features))

In [None]:
# early_stopping_rounds only takes effect when there is a validation set to
# monitor, so hold out a slice of the data and pass it as eval_set.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, test_size=0.1, random_state=42)

model = CatBoostRegressor(loss_function='MAPE')
model.fit(
    X_fit, y_fit,
    cat_features=categorical_features_indicies,
    eval_set=(X_holdout, y_holdout),
    early_stopping_rounds=10,
    verbose=False,
    plot=True
);

In [None]:
# Feature-importance table of the fitted model (prettified output), printed as text
importances = model.get_feature_importance(prettified=True)
print(importances)

In [None]:
# Raw test split
test_path = '../../data/test.csv'
test_df = pd.read_csv(test_path)

In [None]:
# Route features for the test split
test_extended_path = '../../data/test_extended.csv'
test_df_extended = pd.read_csv(test_extended_path)

In [None]:
# Same feature pipeline as for training, reusing the fitted PCA and clustering; no target column
test_df = create_features(test_df, features_to_use, pca=pca, kmeans=kmeans, train=False)

In [None]:
# A row-count mismatch would not fail here: the column-wise join would pad the
# shorter side with NaN. Check explicitly before joining.
assert len(test_df) == len(test_df_extended), (
    f"row count mismatch: {len(test_df)} test rows vs "
    f"{len(test_df_extended)} route-feature rows"
)
test_df = pd.concat([test_df, test_df_extended], axis=1)

In [None]:
# Display the assembled test feature frame
test_df

In [None]:
# Locality id of the row labelled 0 (read from the row Series)
first_row = test_df.loc[0]
first_row["main_id_locality"]

In [None]:
# Locality id -> model. Every locality currently shares the single fitted model.
models = {locality: model for locality in test_df["main_id_locality"].unique()}

In [None]:
from tqdm import tqdm_notebook

In [None]:
# Predict with the model registered for each locality, one batched call per
# locality instead of one call per row. `predicts` is a 1-D array aligned with
# the rows of test_df; values are on the model's output scale (the target was
# log-transformed before fitting). Rows whose locality is missing stay NaN.
predicts = np.full(len(test_df), np.nan)
for locality, row_positions in test_df.groupby("main_id_locality").indices.items():
    predicts[row_positions] = models[locality].predict(test_df.iloc[row_positions])

In [None]:
# Feed the model exactly the columns it was trained on, in training order.
# This also keeps the cell re-runnable: the 'predict' column added here is
# never passed back in as a feature. np.exp undoes the log-transform of the target.
test_df['predict'] = np.exp(model.predict(test_df[model.feature_names_]))

In [None]:
# Move the row labels into a regular 'index' column
test_df = test_df.reset_index(drop=False)

In [None]:
# Column names used in the submission file
submission_names = {'index': 'Id', 'predict': 'Prediction'}
test_df = test_df.rename(columns=submission_names)

In [None]:
# List the contents of the submission folder
!ls ../submission/

In [None]:
# Write the two submission columns as a comma-separated file with a header row and no index
submission = test_df[['Id', 'Prediction']]
submission.to_csv(
    '../submission/submission_2.csv',
    sep=',',
    index=False,
    header=True,
)

In [None]:
# Peek at the first lines of the written submission file
!head ../submission/submission_2.csv

In [None]:
# Print the current working directory
!pwd 

In [None]:
# Row labelled 0 as a Series
test_df.loc[0]

In [None]:
# Same row as a plain numpy array (copied)
first_row_values = test_df.loc[0].values
np.array(first_row_values)

In [None]:
# Spot-check a few predictions. By this point test_df also holds the Id and
# Prediction columns, and iterrows() does not preserve per-column dtypes, so
# rows must not be fed to the model as-is. Select the model's own feature
# columns and predict a small batch instead of printing every row.
# Values are on the model's output scale (log of the target).
sample_rows = test_df[model.feature_names_].head()
print(model.predict(sample_rows))