In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt 
import datetime
from sklearn.metrics import fbeta_score,roc_auc_score
#from mochi import f_beta_01
import re

In [2]:
from scipy.stats import boxcox
from sklearn.preprocessing import normalize

In [40]:
from mochi import check_nan_inf

In [3]:
class Config:
    pass
config = pd.read_pickle('config.pkl')
data_path = config.data_path
feature_path = config.feature_path


In [4]:
normalized_path = '../../kaggleData/JD_logging/normalized/'

In [5]:
features =   (config.feature_dict['trade_detail_feature']+
              config.feature_dict['recent_login_detail']+
              config.feature_dict['trade_and_recent_login_comparing']+
              config.feature_dict['login_trade_hist_stats']+
              config.feature_dict['llc_user_habbit']+
              config.feature_dict['hcc_user_habbit']+
              config.feature_dict['hcc_properties']+
              config.feature_dict['hcc_target_encoding']+
              config.feature_dict['login_detail_new_features']+
              config.feature_dict['hcc_trade_properties']+
              config.feature_dict['hcc_mult_target_encoding']+
              config.feature_dict['hcc_user_trade_habbit']
             )
feature_sequence_list = []
for feature in features:
    feature_sequence_list.append(pd.read_pickle(feature_path+feature+'.pkl').reshape(-1,1))
    
trade_tt_mat = np.hstack(feature_sequence_list)
#trade_tt_mat[trade_tt_mat==-10]=np.nan

validation_tuple_list = config.single_module_validation_indice_set
train_labels = pd.read_pickle(data_path+'trade_train_label.pkl')

### generating the feature set

In [6]:
#checking the filled nan by -10, computing the emptyness
trade_df = pd.DataFrame(trade_tt_mat,columns = features)
trade_df[trade_df ==-10] = np.nan
train_trade = trade_df.iloc[config.train_2_6_index]


In [7]:
#print(dir(config))
print('Currently has feature-sets')
print(config.feature_dict.keys())

Currently has feature-sets
dict_keys(['trade_and_recent_login_comparing', 'recent_login_detail', 'trade_detail_feature', 'login_trade_hist_stats', 'llc_user_habbit', 'hcc_user_habbit', 'hcc_properties', 'hcc_target_encoding', 'login_detail_new_features', 'hcc_mult_target_encoding', 'hcc_user_trade_habbit', 'hcc_trade_properties'])


In [8]:
processed_features = []
#processed_feature_names = []

### Check data type for features

In [9]:
types = trade_df['device_comparing_login_1'].apply(lambda x : type(x)).unique()

In [10]:
type_list = []
type_set =set([])
for feature in features:
    unique_list = list(trade_df[feature].apply(lambda x : type(x)).unique())
    type_list.append((feature,unique_list))
    type_set.update(unique_list)

### dealing with boolean features

In [11]:
#boolean features
boolean_features = []
for feature,unique_types in type_list:
    if bool in unique_types:
        boolean_features.append((feature,unique_types))
boolean_features

[('is_scan_login_0', [float, bool]),
 ('is_scan_login_1', [float, bool]),
 ('is_scan_login_2', [float, bool]),
 ('has_trade_login_0', [float, bool]),
 ('has_trade_login_1', [float, bool]),
 ('has_trade_login_2', [float, bool]),
 ('device_comparing_login_1', [bool]),
 ('device_comparing_login_2', [bool]),
 ('ip_comparing_login_1', [bool]),
 ('ip_comparing_login_2', [bool]),
 ('city_comparing_login_1', [bool]),
 ('city_comparing_login_2', [bool]),
 ('log_from_comparing_login_1', [bool]),
 ('log_from_comparing_login_2', [bool]),
 ('result_comparing_login_1', [bool]),
 ('result_comparing_login_2', [bool]),
 ('type_comparing_login_1', [bool]),
 ('type_comparing_login_2', [bool]),
 ('multiple_fails_1', [bool]),
 ('multiple_fails_15', [bool]),
 ('multiple_fails_3', [bool]),
 ('multiple_fails_30', [bool]),
 ('multiple_fails_360', [bool]),
 ('multiple_fails_7', [bool])]

In [12]:
#filling the -10 to be false
feature_with_nan = [x[0] for x in boolean_features if float in x[1]]

In [13]:
for feature in feature_with_nan:
    trade_df[feature]= trade_df[feature].fillna(False)

In [14]:
processed_features.extend(x[0] for x in boolean_features)

### dealing with other types, check numerical or categorical first

In [15]:
#check numerical or categorical by getting the unique value size
other_features = [x[0] for x in type_list if bool not in x[1]]

In [16]:
other_features_unique_size = [(x,trade_df[x].unique().shape[0]) for x in other_features]
other_features_unique_size = sorted(other_features_unique_size,key = lambda x:x[1],)

In [17]:
other_features_unique_size

[('type_login_0', 4),
 ('type_login_1', 4),
 ('type_login_2', 4),
 ('recent_login_number', 4),
 ('weekday_cycle', 6),
 ('fail_-1_rate_30', 6),
 ('weekday', 7),
 ('weekday_cycle_login_0', 7),
 ('weekday_cycle_login_1', 7),
 ('weekday_cycle_login_2', 7),
 ('fail_-1_count_30', 7),
 ('result_login_0', 8),
 ('result_login_1', 8),
 ('weekday_login_0', 8),
 ('weekday_login_1', 8),
 ('weekday_login_2', 8),
 ('fail_-1_count_360', 9),
 ('log_from_login_0', 10),
 ('log_from_login_2', 10),
 ('result_login_2', 10),
 ('log_from_login_1', 11),
 ('login_fail_times_1', 11),
 ('fail_-1_rate_360', 12),
 ('login_fail_times_3', 14),
 ('login_fail_times_7', 17),
 ('fail_-2_rate_30', 17),
 ('city_freq_rank_7', 18),
 ('login_fail_times_15', 19),
 ('city_freq_rank_30_t_t', 20),
 ('fail_-2_count_30', 21),
 ('login_fail_times_30', 22),
 ('city_freq_rank_30', 22),
 ('city_used_count_7', 22),
 ('hour', 24),
 ('hour_cycle', 24),
 ('city_used_count_30_t_t', 24),
 ('hour_login_0', 25),
 ('hour_login_1', 25),
 ('hour_

- categorical features including :
- type_login_
- log_from_login
- result_login_

In [18]:
categorical_features = []
numerical_features = [x[0] for x in other_features_unique_size]
to_be_remap = []

for feature,size in other_features_unique_size:
    
    if re.match('^type_login_.*',feature) or re.match('^log_from_login.*',feature) or  re.match('^result_login_.*',feature):
        categorical_features.append(feature)
        numerical_features.remove(feature)
    if re.match('.*cycle.*',feature):
        numerical_features.remove(feature)
    elif re.match('^hour.*',feature) or re.match('^weekday.*',feature) or re.match('^day.*',feature):
        to_be_remap.append(feature)
        numerical_features.remove(feature)
        
    

### Apply box_cox on some of the numerical features

In [19]:
def fill_median_and_apply_boxcox_and_nomalize(feature):
    #deal with nan
    if np.sum(np.isnan(trade_df[feature].astype(float))):
        temp_column = trade_df[feature].fillna(train_trade[feature].dropna().median()).astype(float)
    else:
        temp_column = trade_df[feature].astype(float)
        
    #check the minimun
    if temp_column.min()<=0:
        print('meet value smaller than 0 at %s, minimun is %f' % (feature,temp_column.min()))
        temp_column+= (np.abs(temp_column.min())+1)
       
        
    #apply box_cox for the feature 
    result = normalize(boxcox(temp_column)[0].reshape(-1,1))
    
    return result.reshape(-1,)

In [20]:
for feature in numerical_features:
    trade_df[feature] = fill_median_and_apply_boxcox_and_nomalize(feature)
processed_features.extend(numerical_features)

meet value smaller than 0 at recent_login_number, minimun is 0.000000
meet value smaller than 0 at fail_-1_rate_30, minimun is 0.000000
meet value smaller than 0 at fail_-1_count_30, minimun is 0.000000
meet value smaller than 0 at fail_-1_count_360, minimun is 0.000000
meet value smaller than 0 at login_fail_times_1, minimun is 0.000000
meet value smaller than 0 at fail_-1_rate_360, minimun is 0.000000
meet value smaller than 0 at login_fail_times_3, minimun is 0.000000
meet value smaller than 0 at login_fail_times_7, minimun is 0.000000
meet value smaller than 0 at fail_-2_rate_30, minimun is 0.000000
meet value smaller than 0 at login_fail_times_15, minimun is 0.000000
meet value smaller than 0 at fail_-2_count_30, minimun is 0.000000
meet value smaller than 0 at login_fail_times_30, minimun is 0.000000
meet value smaller than 0 at ip_diff_id_counts_7, minimun is 0.000000
meet value smaller than 0 at fail_-2_count_360, minimun is 0.000000
meet value smaller than 0 at login_fail_time

meet value smaller than 0 at device_max_min_dist_30, minimun is 0.000000
meet value smaller than 0 at trade_login_fail_rate_15, minimun is 0.000000
meet value smaller than 0 at log_from_21_rate_360, minimun is 0.000000
meet value smaller than 0 at type_3_count_360, minimun is 0.000000
meet value smaller than 0 at type_2_count_360, minimun is 0.000000
meet value smaller than 0 at log_from_2_count_360, minimun is 0.000000
meet value smaller than 0 at is_scan_rate_30, minimun is 0.000000
meet value smaller than 0 at log_from_10_rate_30, minimun is 0.000000
meet value smaller than 0 at timelong_login_0_mod_1000, minimun is 0.000000
meet value smaller than 0 at timelong_login_1_mod_1000, minimun is 0.000000
meet value smaller than 0 at timelong_login_2_mod_1000, minimun is 0.000000
meet value smaller than 0 at type_1_count_360, minimun is 0.000000
meet value smaller than 0 at type_2_rate_30, minimun is 0.000000
meet value smaller than 0 at after_fail_min_3, minimun is 0.000000
meet value sm

meet value smaller than 0 at device_most_used_id_rate_360, minimun is 0.000000
meet value smaller than 0 at device_login_count_360, minimun is 0.000000
meet value smaller than 0 at device_same_id_login_rate_360_t, minimun is 0.000000
meet value smaller than 0 at device_same_id_login_rate_360, minimun is 0.000000
meet value smaller than 0 at timelong_min_360, minimun is 0.000000
meet value smaller than 0 at trade_login_rate_360, minimun is 0.000000
meet value smaller than 0 at device_t_encoding, minimun is 0.000000
meet value smaller than 0 at trade_login_success_rate_360, minimun is 0.000000
meet value smaller than 0 at ip_device_t_encoding, minimun is 0.000000
meet value smaller than 0 at timelong_min_30, minimun is 0.000000
meet value smaller than 0 at timelong_min_15, minimun is 0.000000
meet value smaller than 0 at timelong_min_7, minimun is 0.000000
meet value smaller than 0 at timelong_min_3, minimun is 0.000000
meet value smaller than 0 at timelong_min_1, minimun is 0.000000
mee

### remaping some of the features to be new categorical

In [26]:
def map_hour(hour):
    if hour ==0 or hour>17:
        #18-0
        return 0
    elif hour>14:
        #15-17
        return 1
    elif hour>11:
        #12-15
        return 2
    elif hour>7:
        #8-11
        return 3
    else:
        #1-7
        return 4

def map_weekday(weekday):
    if weekday ==6 or weekday ==7:
        return True
    else:
        return False

def map_day(day):
    if day < 7:
        return 0
    elif day<15:
        return 1
    elif day<22:
        return 2
    else:
        return 3

In [28]:
for feature in to_be_remap:
    if re.match('^hour.*',feature):
        trade_df[feature] = trade_df[feature].apply(lambda x : map_hour(x))
    elif re.match('^weekday.*',feature):
        trade_df[feature] = trade_df[feature].apply(lambda x : map_weekday(x))
    elif re.match('^day.*',feature):
        trade_df[feature] = trade_df[feature].apply(lambda x : map_day(x))

categorical_features.extend(to_be_remap)

### mapping the categorical features

In [37]:
for feature in categorical_features:
    possible_values = trade_df[feature].unique()
    for value in possible_values:
        trade_df[feature+'_'+str(value)] = (trade_df[feature]==value)
        processed_features.append(feature+'_'+str(value))

### Check and generate the result feature set

In [47]:
trade_tt_mat = trade_df[processed_features]
for feature in [x[0] for x in boolean_features]:
    trade_tt_mat[feature] = trade_tt_mat[feature].apply(lambda x : 1 if x else 0)
check_nan_inf(trade_tt_mat,processed_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### generate the normalized features

In [48]:
for feature in processed_features:
    pd.to_pickle(trade_tt_mat[feature].values,normalized_path+feature+'.pkl')

In [53]:
config = pd.read_pickle('config.pkl')
config.normalized_path =normalized_path
config.normalized_features = processed_features
config.numerical_features = numerical_features

In [54]:
pd.to_pickle(config,'config.pkl')