## Ваш черед

### Новые признаки

Задание творческое - придумайте по новому признаку (группе признаков)
* На основе mcc (tr_type)
* На основе временного фактора
* На основе текстов из описания mcc

Реалиуйте их в функции, аналогичной `gen_features`.

### Все закомментированные команды включены в функцию в самом конце. 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)
from pandas import Timestamp, DateOffset

#df_gender = pd.read_csv('data/customers_gender_train.csv')
#df_transactions = pd.read_csv('data/transactions.csv')
#df_tr = pd.read_csv('data/tr_types.csv', sep=';')
#df_mcc = pd.read_csv('data/tr_mcc_codes.csv', sep=';')

def preproc_transactions(df_transactions):
    sec_per_day = 86400
    sec_per_hour = 3600
    
    start_date = 1420070400 - 154 * sec_per_day - 3 * sec_per_hour
    
    df_transactions.loc[:, 'day'] = df_transactions.tr_datetime\
                                               .str.split(' ')\
                                               .str.get(0)\
                                               .astype(int)
    df_transactions.loc[:, 'time_raw'] = df_transactions.tr_datetime\
                                                    .str.split(' ')\
                                                    .str.get(1)

    # set temp dt
    df_transactions.loc[:, 'dt_temp'] = pd.to_datetime(df_transactions.loc[:, 'time_raw'], 
                                                    format='%H:%M:%S')\
                                        + DateOffset(years=115)
    
    df_transactions = df_transactions.assign(dt = lambda x: x.dt_temp.astype(np.int64) // 10**9
                                             + (x.day - 153) * sec_per_day)\
                                     .assign(weekday = lambda x: ((x.day + 4) % 7 + 1))
        
    df_transactions.loc[:, 'datetime'] = pd.to_datetime(df_transactions.dt, unit='s')
    df_transactions.loc[:, 'date'] = df_transactions.loc[:, 'datetime'].dt.strftime('%Y-%m-%d')
    df_transactions.loc[:, 'hour'] = df_transactions.loc[:, 'datetime'].dt.strftime('%H')
    
    df_transactions = df_transactions.drop(['dt_temp', 'time_raw', 'tr_datetime'], axis=1)
    
    df_transactions.loc[:, 'amount'] = np.round(df_transactions.loc[:, 'amount']/(np.pi**np.exp(1)))
            
    return df_transactions


#df_transactions = df_transactions.pipe(preproc_transactions)

In [3]:
def gen_features_1 (df_gender,df_transactions):
    # Сначала посчитаем сумму транзакций по покупателям
    df_tr_counts = df_transactions.pivot_table(index = ['customer_id'], columns='tr_type',values='amount', 
                             aggfunc=np.size, fill_value=0)
       
    df_tr_counts = df_tr_counts.rename_axis(lambda x: 'tr_{}_count'.format(x), axis=1)

    df_transactions['month'] = df_transactions.date\
                                               .str.split('-')\
                                               .str.get(1)\
                                               .astype(int)
    # Теперь добавим процент транзакций по сезонам
    df_month_rations =\
                    df_transactions.pivot_table(index=['customer_id'],columns='month',values='amount', 
                    aggfunc=np.size, fill_value=0)
    
    total = df_month_rations.sum(axis=1)
    df_month_rations.loc[:, 'winter'] = (df_month_rations[[1,2,12]].sum(axis=1).T/total).T
    df_month_rations.loc[:, 'spring'] = (df_month_rations[[3,4,5]].sum(axis=1).T/total).T
    df_month_rations.loc[:, 'summer'] = (df_month_rations[[6,7,8]].sum(axis=1).T/total).T
    df_month_rations.loc[:, 'autumn'] = (df_month_rations[[9,10,11]].sum(axis=1).T/total).T
    
    # Добавим, например, процент оплаченных услуг,магазинов и остального
    # По этим признакам можно отличить мужчину от женщины
    # Пока впихнул первые попавшиеся, дальше будет зависить от конкретной задачи :) 
    df_mcc = pd.read_csv('data/tr_mcc_codes.csv', sep=';')
    df_mcc['mcc_clast'] = 0
    i = 0
    while i<=df_mcc.shape[0]-1:
        if df_mcc[['mcc_description']].values[i][0].find('услуги')>=0:
            df_mcc['mcc_clast'][i] = 1
        if df_mcc[['mcc_description']].values[i][0].find('магаз')>=0:
            df_mcc['mcc_clast'][i] = 2
        i+=1   
    df_mcc_cl = df_mcc.pivot_table(index=['mcc_code'],columns='mcc_clast', 
                        aggfunc=np.size, fill_value=0)
    df_mcc_cl = df_transactions.join(df_mcc_cl, on='mcc_code', how='left').iloc[:,[0,12,13,14]]


    # Объединяем:
    df_features = df_gender.join(df_tr_counts, on='customer_id', how='left')\
                        .join(df_month_rations.loc[:, ['winter', 'spring', 'summer', 'autumn']], on='customer_id', how='left')
    ss = df_mcc_cl.groupby(['customer_id']).sum()
    df_features =  df_features.join(ss,on='customer_id',how='left',sort = True)
        
    return (df_features)
#df_features_1 = df_gender.pipe(gen_features_1, df_transactions)
#label = 'gender'
#idx_features1 = df_features1.columns != label
#X1 = df_features_1.loc[:, idx_features1].values
#y1 = df_features_1.loc[:, ~idx_features1].values.flatten()

## Поиск гиперпараметров

Реализуйте функцию для hyperopt по перебору гипер параметров вашего пайплайна

На всякий случай почитайте еще про [`FeatureUnion`](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html) и [пример](http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html#sphx-glr-auto-examples-hetero-feature-union-py)

In [4]:
# А это трансформер, который выбирает подможнество столбцов из матрицы X
# Который нужен для того, чтобы делать какие-то действия только для подмноества столбцов, 
# а потом объединять результаты
# Через FeatureUnion

from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, col_idx):
        self.col_idx = col_idx
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[:, self.col_idx]

In [5]:
import hyperopt
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, rand
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score

def run_trials_template(X, y, params, evals=100):

    def hyperopt_cv(X, y, params):
        
        X_ = X.copy()
        
        # Отделяем параметры лог регрессии в отдельный словарь
        lm_params = {}
        for k, v in params.items():
            if k.startswith('glob'):
                continue                
            elif k.startswith('lm'):
                lm_params[k.split('_', 1)[1]] = v
        
        # Задаем шкалирование
        if params['scaler_type'] == 'standart':
            scaler = StandardScaler(with_mean=params['scaler_centering'])            
        else:
            assert params['scaler_type'] == 'robust'
            scaler = RobustScaler(with_centering=params['scaler_centering'])
        
        # Создаем лог. рег. с нужными параметрами
        clf = LogisticRegression(**lm_params,n_jobs=1)
        
        # Итоговый пайплайн
        # Отделяем не нужный номер покупателя, и разделяем признаки текстовые и все остальные
        
        
        a = list(range(0,78))+ list(range(81,84)) 
        selector1 = ColumnSelector(a)
        selector2 = ColumnSelector(list(range(78,81)))
      
                       
        model = Pipeline([
            ('scaler', FeatureUnion([    
                            ('sel_1',  Pipeline([
                                        ('select_1', selector1),
                                        ('scaler', scaler)
                                                ])),
                
                            ('sel_2', Pipeline([
                                        ('select_2', selector2)
                                        
                                                ]))
                            
                                    ])),
            ('clf', clf)
        ])
                
        
         # Пока применяется scaler на обоих частях датасета
        
        '''model = Pipeline([
                        ('scaler',scaler1),
                        ('clf', clf)
                        ])'''
        
        
        
         # Схема кросс-валидации
        
        n_splits = 5
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, 
                             random_state=RND_SEED)
        scores = cross_val_score(model, X_, y,
                                 scoring='roc_auc', 
                                 cv=cv, 
                                 n_jobs=1)

        # Возвращаем среднее значение метрики и отклонение (на всякий случай)
        return scores.mean(), scores.std()

    def f(params):
        acc, std = hyperopt_cv(X, y, params)
        return {'loss': -acc, 'qscore': -acc, 'qscore_std': std, 'status': STATUS_OK}

    trials = Trials()
    best = fmin(f, 
                params, 
                algo=tpe.suggest, 
                max_evals=evals, 
                trials=trials, 
                verbose=1)
    
    return trials

RND_SEED = 123
space4_lm = {
    'lm_penalty': hp.choice('penalty', ['l1', 'l2']),
    'lm_C': hp.loguniform('C', -5, 3),
    'lm_class_weight': hp.choice('class_weight', [None, 'balanced']),
    'lm_random_state': RND_SEED,
    'scaler_type': hp.choice('scaler_type', ['standart', 'robust']),
    'scaler_centering': hp.choice('scaler_centering', [False, True])
}



def trials_df(trials):
    '''
    Функция форматирует результаты hyperopt в dataframe
    '''
    tr_dict = []
    for t in trials:
        trial = dict()
        for k, v in t['misc']['vals'].items():
            trial[k] = v[0]

        trial['qscore'] = -t['result']['qscore']
        trial['qscore_std'] = -t['result']['qscore_std']
        tr_dict.append(trial)

    df_res = pd.DataFrame.from_dict(tr_dict)
    df_res = df_res.sort_values('qscore', ascending=False)
    
    return df_res
#trials1 = run_trials_template(X1, y1, space4_lm, evals=40)
#df_trials1 = trials_df(trials1)

In [None]:
def Lab6():
    df_gender = pd.read_csv('data/customers_gender_train.csv')
    df_transactions = pd.read_csv('data/transactions.csv')
    df_tr = pd.read_csv('data/tr_types.csv', sep=';')
    df_transactions = df_transactions.pipe(preproc_transactions)
    df_features_1 = df_gender.pipe(gen_features_1, df_transactions)
    label = 'gender'
    idx_features1 = df_features_1.columns != label
    X1 = df_features_1.loc[:, idx_features1].values
    y1 = df_features_1.loc[:, ~idx_features1].values.flatten()
    X1 = X1[:,1:85]
    trials1 = run_trials_template(X1, y1, space4_lm, evals=80)
    df_trials1 = trials_df(trials1)
    return (df_trials1)

In [None]:
df_trials1 = Lab6()
df_trials1[df_trials1['qscore']==df_trials1['qscore'].max()]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
