## Генерация новых признаков.
- Простые признаки
- Признаки на основе информации всех данных одного пользователя
- Статистические признаки
- Признаки-отношения

In [1]:
import pandas as pd
import numpy as np
import os
import datetime
from matplotlib import pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)

In [2]:
# Directory holding the competition CSV files.
PATH_TO_DATA = 'data/'

train_path = os.path.join(PATH_TO_DATA, 'onetwotrip_challenge_train.csv')
test_path = os.path.join(PATH_TO_DATA, 'onetwotrip_challenge_test.csv')
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Stack both sets; join='inner' keeps only the columns present in both frames.
full = pd.concat([train, test], join='inner')

### 1.  Переименуем начальные признаки и добавим новые простые признаки.

Новые простые фичи:
- number_all_buy - количество всех покупок одного пользователя (считается отдельно внутри train и внутри test)
- ticket_price - средняя цена одного билета по одной покупке
- full_price_int - целочисленная цена покупки
- flight_duration_int - целочисленное время полета

In [3]:
# Lookup tables from every distinct scaled value (taken over train + test)
# to an integer obtained by a linear transform followed by rounding.
# NOTE(review): the scale/offset constants look empirically fitted to undo
# the normalisation of the anonymised fields - confirm their origin.

# field14 (renamed later to flight_duration_scaled) -> integer duration.
unique14 = np.unique(full['field14'])
new_unique14 = np.round(unique14 / 0.14006639 - 0.27860731).astype(int)
dict14 = dict(zip(unique14, new_unique14))

# field1 (renamed later to full_price_scaled) -> integer price.
unique1 = np.unique(full['field1'])
new_unique1 = np.round(unique1  / 0.077571 + 0.0765905).astype(int)
dict1 = dict(zip(unique1, new_unique1))

def new_simple_features(df):
    """Rename the anonymised ``field*`` columns and add simple features in place.

    Columns added to ``df``:

    * ``number_all_buy`` - number of rows in ``df`` that share the row's
      ``userid``;
    * ``ticket_price`` - ``full_price_scaled`` divided by ``number_tickets``;
    * ``full_price_int`` - ``full_price_scaled`` looked up in the
      module-level ``dict1`` table;
    * ``flight_duration_int`` - ``flight_duration_scaled`` looked up in the
      module-level ``dict14`` table.

    The frame is modified in place and nothing is returned. A price or
    duration value that is absent from the lookup tables raises ``KeyError``.
    """
    readable_names = {
        'field0': 'delta_buy_buy',
        'field1': 'full_price_scaled',
        'field2': 'buy_month',
        'field3': 'fly_month',
        'field4': 'buy_number',
        'field5': 'reg_ind',
        'field9': 'number_tickets_children_1',
        'field11': 'fly_hour',
        'field14': 'flight_duration_scaled',
        'field15': 'number_tickets',
        'field16': 'delta_fly_buy',
        'field18': 'buy_weekday',
        'field20': 'fly_weekday',
        'field21': 'buy_year',
        'field24': 'number_tickets_adult',
        'field28': 'number_tickets_children_2',
        'field29': 'buy_quarter',
    }
    df.rename(columns=readable_names, inplace=True)

    # How many rows (purchases) each user has within this frame.
    purchases_per_user = df['userid'].value_counts().to_dict()
    df['number_all_buy'] = df['userid'].apply(lambda uid: purchases_per_user[uid])

    df['ticket_price'] = df['full_price_scaled'] / df['number_tickets']

    # Strict dictionary lookups: an unseen value must fail loudly.
    df['full_price_int'] = df['full_price_scaled'].apply(lambda price: dict1[price])
    df['flight_duration_int'] = df['flight_duration_scaled'].apply(
        lambda duration: dict14[duration]
    )
    
# Both sets get the same renaming and simple features; each frame is
# updated in place, train first and then test.
for dataset in (train, test):
    new_simple_features(dataset)

### 2. Теперь для каждого пользователя посчитаем новые признаки  
Временные признаки (user_time_features):
- cumsum_delta_buy_buy - вспомогательный признак. Накопительный признак delta_buy_buy.
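- delta_fly - условная дата вылета: cumsum_delta_buy_buy + delta_fly_buy, то есть количество дней от начала отсчёта покупок пользователя до вылета.
- quantity_buy_one_day - количество вылетов пользователя, приходящихся на день вылета по данной покупке, а также на предыдущий и следующий дни.
- mean_quantity_buy_one_day - среднее значение quantity_buy_one_day по уникальным дням вылета одного пользователя.
- quantity_buy_one_day_averaged - quantity_buy_one_day, делённое на сглаженное среднее (mean_quantity_buy_one_day * k + 5) / (k + 5), где k - количество уникальных дней вылета пользователя.
- delta_fly_fly - количество дней между датой вылета по данной покупке и датой предыдущего вылета пользователя.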

Статистические признаки (user_statistical_features):  
- Для большого количества признаков считаем среднее (mean) и стандартное отклонение (std) по всем покупкам одного пользователя.

Признаки-отношения (new_ratio_features):  
по формуле log(x + 1) - log(y + 1)
- ratio_delta_fb_ff - delta_fly_buy & delta_fly_fly
- ratio_delta_qbod_bf - quantity_buy_one_day & delta_fly_buy
- ratio_price_duration - full_price_scaled & flight_duration_scaled
- ratio_13_17 - field13 & field17
- остальные смотри в new_ratio_features

In [None]:
def user_time_features(user):
    """Add flight-date features to the purchases of one user, in place.

    ``user`` must contain the ``delta_buy_buy`` and ``delta_fly_buy`` columns;
    the cumulative sum assumes the rows are already in purchase order (the
    caller sorts by ``buy_number``). Nothing is returned.

    Columns added:

    * ``cumsum_delta_buy_buy`` - running sum of ``delta_buy_buy``;
    * ``delta_fly`` - ``cumsum_delta_buy_buy + delta_fly_buy``, the flight
      day on the user's own time axis;
    * ``quantity_buy_one_day`` - number of the user's flights on the row's
      flight day, the day before and the day after;
    * ``mean_quantity_buy_one_day`` - mean of that count over the user's
      distinct flight days;
    * ``quantity_buy_one_day_averaged`` - ``quantity_buy_one_day`` divided by
      a smoothed version of that mean;
    * ``delta_fly_fly`` - gap between the row's flight day and the previous
      flight day of the user (0 for the earliest one).
    """
    user['cumsum_delta_buy_buy'] = np.cumsum(user['delta_buy_buy'])
    user['delta_fly'] = user['cumsum_delta_buy_buy'] + user['delta_fly_buy']

    # Flights per day, then widened to a three-day window (day - 1 .. day + 1).
    day_counts = user['delta_fly'].value_counts().to_dict()
    window_counts = {
        day: day_counts.get(day - 1, 0) + count + day_counts.get(day + 1, 0)
        for day, count in day_counts.items()
    }
    user['quantity_buy_one_day'] = user['delta_fly'].apply(lambda day: window_counts[day])

    mean_count = np.asarray(list(window_counts.values())).mean()
    user['mean_quantity_buy_one_day'] = mean_count

    # Smooth the mean towards 1 with a weight of 5 pseudo-days so that users
    # with very few distinct flight days do not get extreme ratios.
    n_days = len(window_counts)
    smoothed_mean = (mean_count * n_days + 1 * 5) / (n_days + 5)
    user['quantity_buy_one_day_averaged'] = user['quantity_buy_one_day'] / smoothed_mean

    # np.sort returns a sorted COPY. Sorting np.asarray(column) in place may
    # sort the DataFrame column itself (the array can be a view of it), which
    # would misalign delta_fly with its rows; under pandas Copy-on-Write the
    # array is read-only and an in-place sort raises instead.
    flight_days = np.sort(np.asarray(user['delta_fly']))
    gaps = np.append(0, np.diff(flight_days))
    # NOTE: for a day that occurs several times the last duplicate wins, and
    # its gap to the preceding (identical) day is 0, so every row of such a
    # day gets delta_fly_fly == 0.
    gap_by_day = dict(zip(flight_days, gaps))
    user['delta_fly_fly'] = user['delta_fly'].apply(lambda day: gap_by_day[day])

    
def user_statistical_features(user):
    """Attach the per-user mean and standard deviation of selected columns.

    For every listed column two constant-valued columns are added to ``user``
    in place: ``mean_<column>_user`` and ``std_<column>_user``. The standard
    deviation is the population one (``ddof=0``). Nothing is returned.

    All listed columns, including the time and ratio features, must already
    exist in ``user``.
    """
    stat_columns = (
        # renamed raw fields
        'delta_buy_buy', 'flight_duration_scaled', 'number_tickets', 'delta_fly_buy',
        # anonymised raw fields
        'field10', 'field12', 'field13', 'field17', 'field19', 'field22', 'field25', 'field26',
        'field27', 'field6', 'field7', 'field8',
        # per-user time features
        'delta_fly', 'quantity_buy_one_day',
        'quantity_buy_one_day_averaged', 'mean_quantity_buy_one_day', 'delta_fly_fly',
        # simple and ratio features
        'ticket_price', 'ratio_delta_fb_ff', 'ratio_delta_qbod_bf', 'ratio_price_duration',
        'ratio_13_17', 'ratio_22_25', 'ratio_26_27', 'ratio_13_22', 'ratio_17_25', 'ratio_6_12',
    )

    for column in stat_columns:
        values = user[column]
        user[f'mean_{column}_user'] = values.mean()
        user[f'std_{column}_user'] = values.std(ddof=0)

def new_ratio_features(df):
    """Add log-ratio features ``log(x + 1) - log(y + 1)`` to ``df`` in place.

    Each entry of the table below is ``(new column, x column, y column)``.
    The columns are created in table order; nothing is returned.
    """
    ratio_specs = (
        ('ratio_delta_fb_ff', 'delta_fly_buy', 'delta_fly_fly'),
        ('ratio_delta_qbod_bf', 'quantity_buy_one_day', 'delta_fly_buy'),
        ('ratio_price_duration', 'full_price_scaled', 'flight_duration_scaled'),
        ('ratio_13_17', 'field13', 'field17'),
        ('ratio_22_25', 'field22', 'field25'),
        ('ratio_26_27', 'field26', 'field27'),
        ('ratio_13_22', 'field13', 'field22'),
        ('ratio_17_25', 'field17', 'field25'),
        ('ratio_6_12', 'field6', 'field12'),
    )

    for ratio_name, numerator, denominator in ratio_specs:
        df[ratio_name] = np.log(df[numerator] + 1) - np.log(df[denominator] + 1)
        
def new_user_features(df, left_border=0, right_border=0):
    """Build the per-user features for a slice of users.

    The users are the sorted unique values of ``df['userid']``; those at
    positions ``left_border:right_border`` of that list are processed.
    ``right_border == 0`` means "up to the last user".

    For every selected user the rows are sorted by ``buy_number`` and then
    enriched, in this order, by ``user_time_features``, the
    ``number_all_user_tickets`` total, ``new_ratio_features`` and
    ``user_statistical_features``. Only afterwards are ``delta_buy_buy``,
    ``delta_fly_buy`` and ``delta_fly_fly`` of the rows with
    ``buy_number == 1`` replaced by the sentinel ``-999``, so the statistics
    are computed on the real values.

    Returns a new DataFrame with the rows of all processed users (original
    index labels kept); ``df`` itself is not modified. An empty selection
    yields an empty DataFrame.
    """
    user_ids = np.unique(df['userid'])
    if right_border == 0:
        right_border = len(user_ids)
    user_ids = user_ids[left_border: right_border]

    # Report the number of users actually selected: right_border may exceed
    # the number of available users (e.g. in the last chunk).
    print(f'Total users: {len(user_ids)}')

    # One pass to select the chunk and one groupby instead of a full boolean
    # scan of df per user; groups come out in sorted userid order.
    selected = df[df['userid'].isin(user_ids)]
    sentinel_columns = ['delta_buy_buy', 'delta_fly_buy', 'delta_fly_fly']
    user_frames = []
    for i, (_, group) in enumerate(selected.groupby('userid', sort=True)):
        # sort_values returns a new frame, so the helpers below mutate a
        # private copy rather than a slice of df.
        user = group.sort_values(by=['buy_number'])
        user_time_features(user)
        user['number_all_user_tickets'] = np.sum(user['number_tickets'])
        new_ratio_features(user)
        user_statistical_features(user)

        user.loc[user['buy_number'] == 1, sentinel_columns] = -999
        print(f'Completed {i}', end='\r')
        user_frames.append(user)

    if not user_frames:
        return pd.DataFrame()
    # Single concat at the end: concatenating inside the loop copies the
    # accumulated frame on every iteration (quadratic in the number of users).
    return pd.concat(user_frames)

Так как все эти операции выполняются на моём компьютере очень долго, я разбивал пользователей на части, считал признаки для каждой части отдельно и сохранял результат в файл.

In [None]:
%%time
I = 0  # zero-based index of the 10000-user chunk to process
chunk_start = I * 10000
chunk_stop = (I + 1) * 10000
test_new_1 = new_user_features(test, chunk_start, chunk_stop)

# Persist the chunk so the expensive computation does not have to be repeated.
test_new_1.to_csv(f'test_{I}.csv', index=False)