In [177]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold


import warnings
warnings.filterwarnings('ignore')

In [178]:
# Load the pre-processed train and test tables (train first, test second).
df_train_1, df_test_1 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('data/train_proc.csv', 'data/test_proc.csv')
)

In [179]:
# Cross-validation splits grouped by customer: GroupKFold keeps every row of a
# given customer_id inside a single fold. Each element of `folds` is a
# (train_positions, validation_positions) pair of positional index arrays.
folds = list(GroupKFold().split(df_train_1, groups=df_train_1.customer_id))

In [180]:
# Columns that are never fed to the model. Among them are the customer
# identifier, the address coordinates used as ground truth by evaluate(), and
# the `near_home` / `near_work` labels themselves.
useless_columns = [
    'city', 'country', 'currency', 'customer_id',
    'home_add_lat', 'home_add_lon',
    'work_add_lat', 'work_add_lon',
    'mcc', 'mcc_common',
    'near_home', 'near_work', 'terminal_id',
    'trans_pos', 'transaction_date',
    'tile_n9', 'tile_n15', 'tile_n16', 'tile_n11', 'tile_n14',
    'tile_n13', 'tile_n17', 'tile_n12', 'tile_n10',
]

# Keep only the train columns that also exist in the test table.
# The result is sorted right here: iterating a set of strings yields an
# order that can change between interpreter runs, and the column order
# defines the feature layout handed to the model.
predictors = sorted(
    (set(df_train_1.columns) - set(useless_columns)) & set(df_test_1.columns)
)

In [181]:
# Put the feature names in alphabetical order (in place) so the column layout
# used to build the DMatrix objects is deterministic.
predictors.sort()

In [182]:
predictors

['all_trans_cnt',
 'amount',
 'client_2_tran_lat_meadian',
 'client_2_tran_lat_mean',
 'client_2_tran_lat_std',
 'client_2_tran_lon_mean',
 'client_2_tran_lon_median',
 'client_2_tran_lon_std',
 'day_of_week',
 'distance2mean',
 'equal_terminals_in_pos',
 'is_weekend',
 'mcc_common_0_freq',
 'mcc_common_10_freq',
 'mcc_common_16_freq',
 'mcc_common_17_freq',
 'mcc_common_18_freq',
 'mcc_common_19_freq',
 'mcc_common_22_freq',
 'mcc_common_26_freq',
 'mcc_common_3_freq',
 'mcc_common_5_freq',
 'mcc_common_6_freq',
 'mcc_common_7_freq',
 'mcc_group_0',
 'mcc_group_10',
 'mcc_group_16',
 'mcc_group_17',
 'mcc_group_18',
 'mcc_group_19',
 'mcc_group_22',
 'mcc_group_26',
 'mcc_group_3',
 'mcc_group_5',
 'mcc_group_6',
 'mcc_group_7',
 'mcc_group_9',
 'near_tile_12_terminal_cnt',
 'near_tile_12_trans_cnt',
 'near_tile_13_terminal_cnt',
 'near_tile_13_trans_cnt',
 'near_tile_14_terminal_cnt',
 'near_tile_14_trans_cnt',
 'near_tile_15_terminal_cnt',
 'near_tile_15_trans_cnt',
 'near_tile_16_t

In [183]:
len(predictors)

55

In [184]:
# XGBoost training configuration passed to xgb.train() by fit();
# the same settings are used for the home and the work model.
params = dict(
    eta=0.1,
    objective='binary:logistic',
    eval_metric=['auc'],
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    seed=1234,
)

In [185]:
# Model for determining proximity to home: full training matrix with the
# `near_home` label.
# NOTE(review): `dtrain` looks like it is only needed by the commented-out
# xgb.cv call; fit() builds its own DMatrix. Confirm before removing.
dtrain = xgb.DMatrix(df_train_1[predictors], df_train_1['near_home'])

In [186]:
# scores = xgb.cv(params, dtrain, num_boost_round=131, folds=folds, verbose_eval=10)

In [187]:
def fit(params, X_train, y_train, predictors, num_boost_round=131, pref="home"):
    """Train an XGBoost model on the rows whose target address is known.

    Parameters
    ----------
    params : dict
        Passed unchanged to ``xgb.train``.
    X_train : pandas.DataFrame
        Must contain the ``predictors`` columns plus ``home_add_lat`` or
        ``work_add_lat`` (whichever ``pref`` selects).
    y_train : pandas.Series
        Labels, addressed by the same index labels as ``X_train``.
    predictors : list of str
        Feature columns handed to the model.
    num_boost_round : int
        Number of boosting rounds.
    pref : str
        ``"home"`` filters on ``home_add_lat``; any other value filters on
        ``work_add_lat``.

    Returns
    -------
    The object returned by ``xgb.train``.
    """
    address_column = 'home_add_lat' if pref == "home" else 'work_add_lat'

    # Rows with a missing address latitude are excluded from training.
    known_rows = X_train[address_column].dropna().index
    features = X_train.loc[known_rows, predictors]
    labels = y_train.loc[known_rows]

    return xgb.train(
        params,
        xgb.DMatrix(features, labels),
        num_boost_round=num_boost_round,
        verbose_eval=10,
    )

In [188]:
def predict(model, X_test, predictors, pref):
    """Estimate one (lat, lon) point per customer from the best-scored transactions.

    Every transaction is scored with ``model``. For each customer only the
    transactions whose score equals that customer's maximum score are kept,
    and the prediction is the median ``tran_lat`` / ``tran_lon`` of those rows.

    Unlike the earlier version, the caller's frame is left untouched: the
    helper columns (``proba``, ``max_proba<pref>``) used to be written into
    ``X_test`` itself, so they leaked into the train/test tables and would be
    picked up as features if the predictor-selection cell were re-run.

    Parameters
    ----------
    model : object with ``predict(xgb.DMatrix)``
        Model returned by ``fit``.
    X_test : pandas.DataFrame
        Must contain ``customer_id``, ``tran_lat``, ``tran_lon`` and all
        ``predictors`` columns.
    predictors : list of str
        Feature columns the model was trained on.
    pref : str
        Target name (e.g. ``'home'`` / ``'work'``); upper-cased into the
        output column names.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_id`` with columns ``_<PREF>_LAT_`` and
        ``_<PREF>_LON_``.
    """
    # Private working copy with only the columns needed below.
    scored = X_test[['customer_id', 'tran_lat', 'tran_lon']].copy()
    scored['proba'] = model.predict(xgb.DMatrix(X_test[predictors]))

    # Keep only the transactions the model considers closest to the target,
    # i.e. those reaching the customer's maximum score (ties are all kept).
    max_proba = scored['customer_id'].map(scored.groupby('customer_id')['proba'].max())
    top_rows = scored[scored['proba'] == max_proba]

    # The target estimate is the per-customer median of the remaining points.
    predictions = top_rows.groupby('customer_id')[['tran_lat', 'tran_lon']].median()
    tag = pref.upper()
    predictions.columns = ['_' + tag + '_LAT_', '_' + tag + '_LON_']
    return predictions

In [189]:
def evaluate(df, predictions, pref):
    """Share of predicted customers whose prediction is within 0.02 of the true address.

    The distance is the plain Euclidean distance between the predicted pair
    and the ``<pref>_add_lat`` / ``<pref>_add_lon`` pair, in the units of
    those columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id``, ``<pref>_add_lat`` and ``<pref>_add_lon``.
    predictions : pandas.DataFrame
        Indexed by customer id, with exactly two columns in (lat, lon) order,
        as returned by ``predict``.
    pref : str
        Address prefix, e.g. ``'home'`` or ``'work'``.

    Returns
    -------
    float
        Hits divided by the number of rows in ``predictions``. Customers
        without a single unambiguous, non-missing address count as misses.
    """
    lat_col, lon_col = pref + "_add_lat", pref + "_add_lon"

    # One row per distinct (customer, address) pair among predicted customers.
    is_predicted = df['customer_id'].isin(predictions.index)
    truth = df.loc[is_predicted, ['customer_id', lat_col, lon_col]].drop_duplicates()

    # Customers listed with more than one distinct address are dropped.
    address_counts = truth['customer_id'].value_counts()
    unambiguous = address_counts[address_counts == 1].index
    truth = truth[truth['customer_id'].isin(unambiguous)].set_index('customer_id')

    # reindex, not .loc: a customer dropped above is absent from `truth`, and
    # .loc raises KeyError for missing labels on current pandas. reindex gives
    # a NaN row instead, whose NaN distance fails the `< 0.02` test.
    res = np.array(truth.reindex(predictions.index)) - np.array(predictions)
    distance = np.sqrt(res[:, 0]**2 + res[:, 1]**2)
    return np.sum(distance < 0.02) / res.shape[0]

In [190]:
# Hold-out check for the home model on the first CV fold:
# fit on the fold's training rows, predict and score its validation rows.
train_rows, valid_rows = folds[0]
model_home = fit(
    params,
    df_train_1.iloc[train_rows],
    df_train_1['near_home'].iloc[train_rows],
    predictors,
    pref='home',
)

predictions = predict(model_home, df_train_1.iloc[valid_rows], predictors, pref='home')
print('home evaluation metrics ', evaluate(df_train_1, predictions, pref='home'))

home evaluation metrics  0.397720455909


In [191]:
# Hold-out check for the work model on the first CV fold:
# fit on the fold's training rows, predict and score its validation rows.
train_rows, valid_rows = folds[0]
model_work = fit(
    params,
    df_train_1.iloc[train_rows],
    df_train_1['near_work'].iloc[train_rows],
    predictors,
    pref='work',
)

predictions = predict(model_work, df_train_1.iloc[valid_rows], predictors, pref='work')
print('work evaluation_metrics ', evaluate(df_train_1, predictions, pref='work'))

work evaluation_metrics  0.149670065987


### Предсказание на тестовых данных

In [192]:
# Final home model: fit on the whole training table, then predict the home
# point of every customer in the test table.
model_home_all = fit(
    params=params,
    X_train=df_train_1,
    y_train=df_train_1['near_home'],
    predictors=predictors,
    pref='home',
)
preds_home = predict(model=model_home_all, X_test=df_test_1, predictors=predictors, pref='home')

In [193]:
# In-sample figure: the model is scored on the same table it was fitted on,
# so this is not a hold-out estimate.
predictions_h = predict(model=model_home_all, X_test=df_train_1, predictors=predictors, pref='home')
home_score = evaluate(df=df_train_1, predictions=predictions_h, pref='home')
print('home evaluation metrics ', home_score)

home evaluation metrics  0.4179


In [194]:
# pd.Series(model_home_all.get_fscore()).sort_values(ascending=False).head(50)

In [195]:
# Final work model: fit on the whole training table, then predict the work
# point of every customer in the test table.
model_work_all = fit(
    params=params,
    X_train=df_train_1,
    y_train=df_train_1['near_work'],
    predictors=predictors,
    pref='work',
)
preds_work = predict(model=model_work_all, X_test=df_test_1, predictors=predictors, pref='work')

In [196]:
# In-sample figure: the model is scored on the same table it was fitted on,
# so this is not a hold-out estimate.
predictions_w = predict(model=model_work_all, X_test=df_train_1, predictors=predictors, pref='work')
work_score = evaluate(df=df_train_1, predictions=predictions_w, pref='work')
print('work evaluation metrics ', work_score)

work evaluation metrics  0.1671


In [197]:
def make_submition(preds_home, preds_work, sample_path="data/sample.csv"):
    """Arrange home/work predictions in the row order of the sample submission.

    Parameters
    ----------
    preds_home : pandas.DataFrame
        Home predictions indexed by customer id (two columns: lat, lon).
    preds_work : pandas.DataFrame
        Work predictions indexed by customer id (two columns: lat, lon).
    sample_path : str
        CSV file with a ``customer_id`` column that defines which customers
        appear in the result and in which order.

    Returns
    -------
    pandas.DataFrame
        Columns ``_ID_, _WORK_LAT_, _WORK_LON_, _HOME_LAT_, _HOME_LON_``, one
        row per sample customer; coordinates of customers without a
        prediction are filled with 0.
    """
    # Work columns come first because preds_work is the left operand.
    merged = pd.merge(preds_work, preds_home, left_index=True, right_index=True)
    sample = pd.read_csv(sample_path)

    # reindex, not .loc: sample customers without predictions must become NaN
    # rows (zero-filled below), whereas .loc raises KeyError for labels that
    # are missing from the index on current pandas.
    submission = merged.reindex(sample['customer_id']).reset_index()
    submission.columns = ['_ID_', '_WORK_LAT_','_WORK_LON_','_HOME_LAT_','_HOME_LON_']
    return submission.fillna(0)

In [198]:
# Put the in-sample home and work predictions side by side (column-wise
# concat, aligned on the shared index) for the per-customer inspection below.
predictions_train = pd.concat([predictions_h, predictions_w], axis = 1)

In [199]:
# Build the submission table and add `diff`: the Euclidean distance between
# the predicted work and home points, in the units of the coordinate columns.
df_pr = make_submition(preds_home, preds_work)
lat_gap = df_pr['_WORK_LAT_'] - df_pr['_HOME_LAT_']
lon_gap = df_pr['_WORK_LON_'] - df_pr['_HOME_LON_']
df_pr['diff'] = np.sqrt(lat_gap**2 + lon_gap**2)

In [200]:
df_pr.head()

Unnamed: 0,_ID_,_WORK_LAT_,_WORK_LON_,_HOME_LAT_,_HOME_LON_,diff
0,00021683ccb416637fe9a4cd35e4606e,55.028,82.913,55.037037,82.978493,0.066114
1,0002d0f8a642272b41c292c12ab6e602,55.739838,37.394714,53.199818,50.173374,13.028655
2,0004d182d9fede3ba2534b2d5e5ad27e,43.585,39.723,43.588963,39.727375,0.005903
3,0008c2445518c9392cb356c5c3db3392,51.529,46.019,51.529017,46.029402,0.010402
4,000b373cc4969c0be8e0933c08da67e1,56.248314,43.464493,56.233054,43.457591,0.016749


In [201]:
# Index the submission table by customer id (copied from `_ID_`, which stays
# as a regular column) so get_prediction() can look customers up by index.
df_pr = df_pr.assign(customer_id=df_pr['_ID_']).set_index('customer_id')

In [203]:
# Visual sanity check for one test customer.
# NOTE(review): this cell relies on `get_prediction` and `pm` (plot_maps),
# which are defined/imported in cells further down the file; on a fresh
# kernel those cells must run first.
customer = '0002d0f8a642272b41c292c12ab6e602'
pr_h, pr_w = get_prediction(customer, df_pr)
print(pr_h, pr_w, sep='\n')
pm.plot_one_person(customer, df_test_1, predicted_home=pr_h, predicted_work=pr_w)

(53.200317700000006, 50.173874000000005)
(55.739337800000001, 37.394213700000002)
        home_add_lat  home_add_lon
261057           0.0           0.0
        work_add_lat  work_add_lon
261057           0.0           0.0


In [172]:
# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# supported way to re-import a module that was edited on disk.
import importlib

import plot_maps as pm

# Pick up edits made to plot_maps.py since it was first imported.
importlib.reload(pm)

<module 'plot_maps' from 'C:\\kaggle\\raif\\plot_maps.py'>

In [162]:
predictions_train.tail(10)

Unnamed: 0_level_0,_HOME_LAT_,_HOME_LON_,_WORK_LAT_,_WORK_LON_
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ff70c3c5f2dca00f255705305d75111d,59.738523,30.404776,59.931,30.305
ff71bdbcba59047f1fad88dcb7052151,55.646289,37.732466,55.734999,37.752592
ff7a887d347a8d598dc8e559d3aaec2f,56.360561,43.823411,56.317916,43.925425
ff7e1c6c07469b28a07847540385e767,55.638199,37.667647,55.74105,37.609963
ff869ee855dc3f9b382c943eb43cc4ec,55.088369,36.662918,55.124,36.622
ff92d5420f5fb92a37e1280d1fc9e5f4,56.509819,56.081608,56.509819,56.081608
ffaeae55d4dbf29058f04e7a6a764f02,55.678683,37.853583,55.725538,37.664475
ffc5289194413ec68c3f7adc8121d69b,55.889313,37.472749,55.731569,37.698358
ffd097949a4a238296a7deadfb376cc0,55.747,37.707,55.785,37.666
ffdd5ec2a90e355cf40525eac1a6fd34,55.934117,37.520084,55.800559,37.596448


In [159]:
def get_prediction(customer_id, predictions):
    """Return the predicted home and work points of one customer.

    Parameters
    ----------
    customer_id : str
        Index label to look up in ``predictions``.
    predictions : pandas.DataFrame
        Indexed by customer id, with columns ``_HOME_LAT_``, ``_HOME_LON_``,
        ``_WORK_LAT_`` and ``_WORK_LON_``.

    Returns
    -------
    tuple
        ``((home_lat, home_lon), (work_lat, work_lon))`` taken from the first
        matching row. The home pair is shifted by +0.0005 and the work pair
        by -0.0005 on both coordinates.
        NOTE(review): presumably so that coinciding markers stay
        distinguishable on the map -- confirm.

    Raises
    ------
    IndexError
        If no row is indexed by ``customer_id``.
    """
    shift = 0.0005
    first_match = predictions[predictions.index == customer_id].iloc[0]

    pred_home = (first_match['_HOME_LAT_'] + shift, first_match['_HOME_LON_'] + shift)
    pred_work = (first_match['_WORK_LAT_'] - shift, first_match['_WORK_LON_'] - shift)
    return pred_home, pred_work

In [214]:
# Terminal, coordinates and MCC group of every training transaction made by
# one customer, for manual inspection.
is_selected_customer = df_train_1['customer_id'] == 'ffc5289194413ec68c3f7adc8121d69b'
tr = df_train_1.loc[is_selected_customer, ['terminal_id', 'tran_lat', 'tran_lon', 'mcc_common']]

In [215]:
#tr.groupby(['terminal_id'])['terminal_id'].count().reset_index(name = 'cnt')

In [217]:
# Visual sanity check for one training customer: print the predicted points
# and plot them with the customer's data.
customer = 'ffd097949a4a238296a7deadfb376cc0'
pr_h, pr_w = get_prediction(customer, predictions_train)
print(pr_h, pr_w, sep='\n')

pm.plot_one_person(customer, df_train_1, predicted_home=pr_h, predicted_work=pr_w)

(55.747500000000002, 37.707500000000003)
(55.784499999999994, 37.665499999999994)
      home_add_lat  home_add_lon
9875        55.796        37.740
9876        55.796        37.745
      work_add_lat  work_add_lon
9875        55.754        37.526


In [None]:
# Write the submission file without the index.
# NOTE(review): at this point df_pr also carries the helper column `diff`
# added earlier, so it ends up in the file -- confirm the submission format
# tolerates an extra column.
# NOTE(review): `index = None` is merely falsy; the documented way to omit
# the index is `index=False`.
df_pr.to_csv('predictions/pred_26-02-18-3.csv', index = None)

In [None]:
df_test_1[df_test_1.customer_id == '6d8045c0917ec26242fd039d2f0c1dec'][['tran_lat', 'tran_lon']]

In [None]:
# Read the file written above back in and show its first 20 rows.
df = pd.read_csv('predictions/pred_26-02-18-3.csv')
df.head(20)

In [None]:
# Load another predictions file for comparison.
# NOTE(review): this rebinds `df` and displays the whole frame; `.head()`
# would keep the output small.
df = pd.read_csv('predictions/pred_26-02-18-2.csv')
df