### Imports

In [1]:
import re
from math import sin, cos, sqrt, atan2, radians

import lightgbm as lgb
import numpy as np
import pandas as pd
import usaddress
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from tqdm import tqdm_notebook
from target_encoding import TargetEncoder


### Helper functions

In [2]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : ground-truth values, forwarded to ``mean_squared_error``
    y_pred : predicted values, forwarded to ``mean_squared_error``

    Returns
    -------
    float : square root of the mean squared error
    """
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

def distance(x, y):
    """
    Great-circle (haversine) distance between two geographic points.

    Parameters
    ----------
    x : tuple, latitude and longitude of the first point, in degrees
    y : tuple, latitude and longitude of the second point, in degrees

    Returns
    -------
    result : distance between the two points in kilometres
    """
    earth_radius_km = 6373.0  # Earth radius in kilometres
    # Both points are flattened into one sequence and converted to radians.
    lat_1, lon_1, lat_2, lon_2 = [radians(v) for v in (*x, *y)]
    delta_lon = lon_2 - lon_1
    delta_lat = lat_2 - lat_1
    # Haversine term for the two points.
    hav = sin(delta_lat/2)**2 + cos(lat_1) * cos(lat_2) * sin(delta_lon/2)**2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))
    return earth_radius_km * central_angle


def get_address(text):
    """Extract a lower-cased place (city) token from a free-form address.

    The input is stringified, lower-cased, and every run of characters other
    than Latin/Cyrillic letters is collapsed into a single space. ``usaddress``
    is then asked for the ``PlaceName`` component; when tagging fails or no
    place name is found, the last word of the cleaned text is used instead.

    Parameters
    ----------
    text : any value; converted with ``str`` before processing

    Returns
    -------
    str : the tagged place name, else the last word of the cleaned text,
          else ``''`` when the cleaned text contains no letters at all
    """
    text = str(text).lower()
    text = re.sub(r"[^A-Za-zА-Яа-я]+", ' ', text)
    try:
        res = usaddress.tag(text)[0].get('PlaceName')
    except Exception:
        # Tagging is best-effort; catch Exception (not a bare except) so that
        # KeyboardInterrupt/SystemExit still stop a long-running .apply().
        res = None
    if res is None:
        words = text.split()
        # Addresses without any letters (e.g. digits only) leave no words;
        # return an empty token instead of raising IndexError.
        res = words[-1] if words else ''
    return res


In [3]:
class DataReader:
    """CSV-backed feature table with an optional target column.

    The CSV is read with its first column as the row index. For a training
    reader the ``target`` column is moved into ``y``; every other column
    stays in ``x``.

    Attributes
    ----------
    path : the value passed to ``pd.read_csv``
    is_train : whether a ``target`` column was split off
    x : DataFrame of features
    y : Series of targets (only set when ``is_train`` is true)
    columns : current column labels of ``x``
    index : row labels of ``x`` as loaded
    len : number of rows as loaded
    """

    def __init__(self, path, is_train = True):
        self.path = path
        self.is_train = is_train

        frame = pd.read_csv(path, index_col=0)
        if is_train:
            self.y = frame['target']
            frame = frame.drop(columns=['target'])
        self.x = frame

        self.columns = frame.columns
        self.index = frame.index
        self.len = len(frame)

    def get_x(self, i):
        """Return the feature row labelled ``i``; asserts the label exists."""
        assert i in self.index, 'Bad Index'

        return self.x.loc[i]

    def get_x_col(self, i, col):
        """Return the value of column ``col`` in row ``i``; asserts both exist."""
        assert i in self.index, 'Bad Index'
        assert col in self.columns, 'Bad Columns'

        row = self.get_x(i)
        return row[col]

    def get_y(self, i):
        """Return the target for row ``i``, or ``None`` for a non-train reader."""
        assert i in self.index, 'Bad Index'

        if not self.is_train:
            return None
        return self.y.loc[i]

    def add_columns(self, name, data):
        """Append ``data`` to ``x`` as a new column called ``name``.

        Asserts that ``name`` is not already a column and that ``data`` has
        as many entries as the table had rows when it was loaded.
        """
        assert name not in self.columns, 'Columns name exist'
        assert len(data) == self.len, 'Len of data must be equal to dataset'

        self.x[name] = data
        # Refresh the cached labels so later existence checks see the new column.
        self.columns = self.x.columns


In [4]:
# Scorer wrapping rmse for sklearn model selection. It is built with
# greater_is_better=False, so the scores reported by CV / grid search are
# negated RMSE values (closer to zero is better).
my_rmse = make_scorer(rmse, greater_is_better=False)

### Read data

In [5]:
# Load both splits. The train reader splits the `target` column off into
# train.y; the test reader keeps every column as a feature.
train = DataReader('../input/train.csv', True)
test = DataReader('../input/test.csv', False)


### Feature engineering

In [6]:
# Derive a coarse city token from the free-text address for both splits.
for data in (train, test):
    city_tokens = data.x['address'].apply(get_address)
    data.add_columns('city_eng', city_tokens)


### Distance to centres

In [7]:
# Reference points used for the distance features below.
# Each value is [latitude, longitude] in degrees (the order `distance` expects);
# the keys are embedded verbatim in the generated column names.
cities = {
    'Москва': [55.753215, 37.622504],
    'Питер': [59.939095, 30.315868],
    'Новосибирск': [55.030199, 82.920430],
    'Екатеринбург':[56.838011, 60.597465],
    'Краснодар':[45.035470, 38.975313],
    'Владивосток':[43.115536, 131.885485],
    'Еманжелинск':[54.752147, 61.317223],
    'Якутск':[62.028103, 129.732663],
}


In [8]:
# Columns that will later be target-encoded.
use_columns = ['atm_group', 'city_eng']
n_dist_bands = 5  # each city's distance range is cut into bands of max/5

for city in tqdm_notebook(cities.keys()):
    dist_column = 'dist_to_city_{}'.format(city)
    label_column = 'labels_dist_to_city_{}'.format(city)

    band_width = None
    for data in [train, test]:
        # Row-wise haversine distance (km) from each row's lat/long to the city.
        dists = data.x.apply(lambda x: distance(cities[city], x[['lat', 'long']]), axis=1)
        data.add_columns(dist_column, dists)

        # The band width is derived once, from the first split (train), and
        # reused for test. Computing it per split made the same distance fall
        # into different bands in train and test, so the label (and its
        # target encoding) meant different things at fit and predict time.
        if band_width is None:
            band_width = dists.max() / n_dist_bands
        data.add_columns(label_column, data.x[dist_column] // band_width)

    use_columns.append(label_column)


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




### Target encoding

In [9]:
# Bucket coordinates at several resolutions: each coordinate is multiplied by
# a factor and truncated to an int (missing coordinates are treated as 0).
# Every resulting column is queued for target encoding.
rounding_specs = [
    ('lat', 'lat_round_{}', range(6, 23, 4)),
    ('long', 'long_round_{}', [1.6, 1.8, 2, 2.2, 2.4, 2.6]),
]

for source_column, name_template, factors in rounding_specs:
    for factor in factors:
        bucket_column = name_template.format(factor)
        for data in [train, test]:
            buckets = data.x[source_column].fillna(0).apply(lambda v: int(factor*v))
            data.add_columns(bucket_column, buckets)
        use_columns.append(bucket_column)


In [None]:
# Fit one encoder per queued column on the training split and append its
# output to both splits as `<col>_enc`.
# NOTE(review): train rows are encoded by an encoder fitted on those same rows
# and their targets — possible target leakage unless TargetEncoder handles
# this internally; confirm against the library.
for col in use_columns:
    encoder = TargetEncoder(col)
    encoder.fit(train.x[[col]], y=train.y)

    encoded_column = '{}_enc'.format(col)
    for data in (train, test):
        encoded_values = encoder.predict(data.x[[col]])
        data.add_columns(encoded_column, encoded_values)


### Model search

In [None]:
# Training matrix: every feature except the raw text / identifier columns.
X_train = train.x.drop(columns=['address', 'address_rus', 'id', 'city_eng'])
Y_train = train.y


In [None]:
# Exhaustive grid over tree size, sampling and regularisation settings.
# NOTE(review): in the logged output the CV scores are identical for
# subsample=0.3/0.4/0.5 with everything else fixed, which is consistent with
# LightGBM ignoring `subsample` unless `subsample_freq` > 0 — confirm. If so,
# this axis multiplies the number of fits by 5 without changing any model.
gridParams = dict(
    num_leaves=np.arange(4,30, 5),
    colsample_bytree=np.arange(0.3, 0.8, 0.1),
    subsample=np.arange(0.3, 0.8, 0.1),
    reg_alpha=np.arange(0.1, 1.5, 0.2),
    reg_lambda=np.arange(0.1, 1.5, 0.2),
)

# 3-fold shuffled split with a fixed seed, shared by every candidate.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Base estimator; the grid only overrides the parameters listed above.
gbm = lgb.LGBMRegressor(
    boosting_type='gbdt',
    objective='regression',
    n_estimators=500,
    learning_rate=0.01,
    random_state=42,
)

grid = GridSearchCV(
    estimator=gbm,
    param_grid=gridParams,
    scoring=my_rmse,
    cv=cv,
    verbose=3,
)
grid.fit(X_train, Y_train)

# Parameter combination with the best mean CV score.
best = grid.best_params_

Fitting 3 folds for each of 7350 candidates, totalling 22050 fits
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.3 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.3, score=-0.04567233033608959, total=   1.7s
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.3 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.3, score=-0.047212313913326504, total=   1.3s
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.3 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.3, score=-0.04692980592151135, total=   1.5s
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.4 
[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.4, score=-0.04567233033608959, total=   1.7s
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.4 
[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.4, score=-0.047212313913326504, total=   1.4s
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.4 
[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.4, score=-0.04692980592151135, total=   1.3s
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.5 
[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.1, subsample=0.5, score=-0.04567233033608959, total=   1.4s
[

[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.5000000000000001, subsample=0.5, score=-0.0456796416108388, total=   1.5s
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.5000000000000001, subsample=0.5 
[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.5000000000000001, subsample=0.5, score=-0.04721984430731069, total=   2.0s
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.5000000000000001, subsample=0.5 
[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.5000000000000001, subsample=0.5, score=-0.04692931665268946, total=   1.5s
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.5000000000000001, subsample=0.6000000000000001 
[CV]  colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.5000000000000001, subsample=0.6000000000000001, score=-0.0456796416108388, total=   1.2s
[CV] colsample_bytree=0.3, num_leaves=4, reg_alpha=0.1, reg_lambda=0.5000000000000001,

In [None]:
# Show the winning parameter combination from the grid search.
print(best)

In [None]:
# Carry the tuned hyper-parameters over to the final model.
# `max_depth` is not part of the search grid, so `best` has no such key and
# `best['max_depth']` raised KeyError. Fall back to -1, LightGBM's default
# ("no depth limit"), which is what the grid-search estimator used.
params = {
    'max_depth': int(best.get('max_depth', -1)),
    'num_leaves': int(best['num_leaves']),  # numpy integer -> plain int
    'colsample_bytree': best['colsample_bytree'],
    'subsample': best['subsample'],
    'reg_alpha': best['reg_alpha'],
    'reg_lambda': best['reg_lambda'],
}


In [None]:
# Final estimator: fixed training settings plus the tuned hyper-parameters.
gbm = lgb.LGBMRegressor(
    **params,
    objective='regression',
    n_estimators=500,
    learning_rate=0.01,
    random_state=42,
)


In [None]:
# 5-fold shuffled CV estimate for the final configuration.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(gbm, X_train, Y_train, scoring=my_rmse, cv=cv)

# my_rmse is built with greater_is_better=False, so each fold score is a
# negated RMSE and `score` is therefore negative.
score = scores.mean()
print(score)

# Refit on all training rows; this fitted model produces the submission.
gbm.fit(X_train, Y_train)


In [None]:
# Test matrix: drop the same raw text / identifier columns as for training.
X_test = test.x.drop(columns=['address', 'address_rus', 'id', 'city_eng'])


In [None]:
# One-column frame of predictions, keyed by the test row labels.
test_predictions = gbm.predict(X_test)
submit = pd.DataFrame(test_predictions, columns=['target'], index=test.index)


In [None]:
# The file name embeds the CV score scaled by 10e4 (i.e. 100000, not 10**4)
# and truncated to an int.
# NOTE(review): `score` comes from a greater_is_better=False scorer, so the
# embedded number is presumably negative — confirm this naming is intended.
submit.to_csv('../submits/submit_{}.csv'.format(int(score*10e4)))
