In [4]:
# -*- coding: utf-8 -*-

import os
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from multiprocessing import Pool
from sklearn.model_selection import KFold


import warnings
warnings.filterwarnings('ignore')

# 获取数据文件地址
def getfilelist(dir, filelist):
    newdir = dir
    if os.path.isfile(dir):
        filelist.append(dir)
    elif os.path.isdir(dir):
        for s in os.listdir(dir):
            newdir = os.path.join(dir, s)
            getfilelist(newdir, filelist)
    return filelist


#修改工作时长小于0的值（暂定成nan之后删除）
def trans_to_nan (hours):
    if hours <0:
        hours = np.nan
    return hours


#数据处理，去除部件工作时长为负数的值,并且每个时间点只保留k个值
def preprocess (path,k):

    raw_data = pd.read_csv(path,engine='python')

    #将部件工作时长<0的时长值改为nan并删除
    # print(raw_data.columns)
    
    raw_data['部件工作时长'] = raw_data['部件工作时长'].map(lambda r:trans_to_nan(r))
    raw_data = raw_data.dropna()

    #每个工作时长至多保留k个值
    #提取部件工作时长列作为list方便处理
    raw_list = raw_data['部件工作时长'].tolist()
    for i in range(len(raw_list)-k):
        counter = 1
        #找到重复项最后一项的索引
        while counter + i < len(raw_list):
            if raw_list[i] == raw_list[i+counter]:
                counter += 1
            else:
                break
        #判断是否需要删除数据
        if counter <= k:
            continue
        else:
            for m in range(counter-k):
                raw_list[i+k+m] = np.nan
    #修改dataframe对应列
    raw_data['部件工作时长'] = raw_list
    raw_data = raw_data.dropna()

    return raw_data


#处理单个单本的数据,添加单个样本的特征
def feature_project (data,df,name,k):

    #根据样本选择或处理特征

    #开关与告警信号取其在总数据中的占比
    if name == '开关1信号' or name == '开关2信号' or name == '告警信号1':
        df[name + '时间占比'] = data.sum()/len(data)
    
    #温度信号取其均值与标准差为特征
    elif name == '温度信号' or name == '流量信号':
        df[name + '均值'] = data.mean()
        df[name + '标准差'] = data.std()
    
    #累积量参数取最大值，k个周期的差分的均值与标准差作为特征
    elif name == '累积量参数1' or name == '累积量参数2':
        df[name] = data.max()
        data = data.diff(periods = k)
        data = data.dropna()
        df[name + str(k) + '阶差分均值'] = data.mean()
        df[name + str(k) + '阶差分标准差'] = data.std()

    #电流信号主要集中分布在三段区间中，分别列出取均值与标准差，加权后取为特征
    elif name == '电流信号':
        length = len(data)
        low_current = list(num for num in data if 0 <= num < 20)
        mid_current = list(num for num in data if 500 <= num < 750)
        high_current = list(num for num in data if 800 <= num < 1800)
        low_percentage = np.sum(low_current) / length
        mid_percentage = np.sum(mid_current) / length
        high_percentage = np.sum(high_current) / length
        df[name + '低电流段均值'] = np.mean(low_current) * low_percentage
        df[name + '中电流段均值'] = np.mean(mid_current) * mid_percentage
        df[name + '高电流段均值'] = np.mean(high_current) * high_percentage
        df[name + '低电流段标准差'] = np.std(low_current) * low_percentage
        df[name + '中电流段标准差'] = np.std(mid_current) * mid_percentage
        df[name + '高电流段标准差'] = np.std(high_current) * high_percentage

    #流量信号主要集中分布在三段区间中，分别列出取均值与标准差，加权后取为特征
    elif name == '流量信号':
        length = len(data)
        low_current = list(num for num in data if 0 <= num < 9)
        mid_current = list(num for num in data if 10 <= num < 120)
        high_current = list(num for num in data if 125 <= num < 145)
        low_percentage = np.sum(low_current) / length
        mid_percentage = np.sum(mid_current) / length
        high_percentage = np.sum(high_current) / length
        df[name + '低流量段均值'] = np.mean(low_current) * low_percentage
        df[name + '中流量段均值'] = np.mean(mid_current) * mid_percentage
        df[name + '高流量段均值'] = np.mean(high_current) * high_percentage
        df[name + '低流量段标准差'] = np.std(low_current) * low_percentage
        df[name + '中流量段标准差'] = np.std(mid_current) * mid_percentage
        df[name + '高流量段标准差'] = np.std(high_current) * high_percentage
    
    #压力信号1主要分布在两段区间上，同上取均值与标准差加权后取为特征
    elif name == '压力信号1':
        length = len(data)
        low_pressure = list(num for num in data if 65 <= num <=75)
        high_pressure = list(num for num in data if 180 <= num <= 400)
        low_percentage = np.sum(low_pressure) / length
        high_percentage = np.sum(high_pressure) / length
        df[name + '信号1低压力段标准差'] = np.std(low_pressure) * low_percentage
        df[name + '信号1高压力段标准差'] = np.std(high_pressure) * high_percentage
    
    #压力信号2主要分布在一段区间上，剩余值较小，处理同上
    elif name == '压力信号2':
        length = len(data)
        low_pressure = list(num for num in data if 0 <= num <=50)
        high_pressure = list(num for num in data if 200 <= num)
        low_percentage = np.sum(low_pressure) / length
        high_percentage = np.sum(high_pressure) / length
        df[name + '信号2低压力段标准差'] = np.std(low_pressure) * low_percentage
        df[name + '信号2高压力段标准差'] = np.std(high_pressure) * high_percentage

    #同压力信号2
    elif name == '转速信号1':
        length = len(data)
        low_pressure = list(num for num in data if 0 <= num <=100)
        high_pressure = list(num for num in data if 3000 <= num)
        low_percentage = np.sum(low_pressure) / length
        high_percentage = np.sum(high_pressure) / length
        df[name + '信号1低转速段均值'] = np.mean(low_pressure) * low_percentage
        df[name + '信号1高转速段均值'] = np.mean(high_pressure) * high_percentage
        df[name + '信号1低转速段标准差'] = np.std(low_pressure) * low_percentage
        df[name + '信号1高转速段标准差'] = np.std(high_pressure) * high_percentage
    
    #同压力信号2
    elif name == '转速信号2':
        length = len(data)
        low_pressure = list(num for num in data if 0 <= num <=1000)
        high_pressure = list(num for num in data if 10000 <= num)
        low_percentage = np.sum(low_pressure) / length
        high_percentage = np.sum(high_pressure) / length
        df[name + '信号2极低转速段均值'] = np.mean(low_pressure) * low_percentage
        df[name + '信号2高转速段均值'] = np.mean(high_pressure) * high_percentage
        df[name + '信号2极低转速段标准差'] = np.std(low_pressure) * low_percentage
        df[name + '信号2高转速段标准差'] = np.std(high_pressure) * high_percentage

    return df


#耦合特征构造
def coupled_feature (dataframe,df):
    
    #取出列名表
    column_list = dataframe.columns.values.tolist()
    #循环将特征两两相乘组合
    for i in range (3,13):
        for j in range (i+1,13):
#            mutiple = dataframe.iloc[:,[i]]*dataframe.iloc[:,[j]]
            mutiple = dataframe.iloc[:,[i]].values * dataframe.iloc[:,[j]].values 
            df[column_list[i] +'与'+ column_list[j] +'乘积的均值'] = mutiple.mean()
            df[column_list[i] +'与'+ column_list[j] +'乘积的标准差'] = mutiple.std()
    
    return df


#处理单个训练样本
def process_sample_single(path,train_percentage=1,k=6):

    #获取并预处理数据
    data = preprocess(path,k)
    #获取该零件寿命
    work_life = data['部件工作时长'].max()
    #获取在寿命一定百分比时间的数据
    data=data[data['部件工作时长']<=work_life*train_percentage]
    #创建数据集
    dict_data = {'train_file_name': os.path.basename(path) + str(train_percentage),
                 'device': data['设备类型'][0],
                 '开关1_sum':data['开关1信号'].sum(),
                 '开关2_sum':data['开关2信号'].sum(),
                 '告警1_sum':data['告警信号1'].sum(),
                 'current_life':np.log(data['部件工作时长'].max()+1),
                 'rest_life':np.log(work_life-data['部件工作时长'].max()+1)
                }

    #单项特征
    for item in ['部件工作时长',
                    '累积量参数1',
                    '累积量参数2',
                    '转速信号1',
                    '转速信号2',
                    '压力信号1',
                    '压力信号2',
                    '温度信号',
                    '流量信号',
                    '电流信号',
                    '开关1信号',
                    '开关2信号',
                    '告警信号1']:
        dict_data=feature_project(data[item],dict_data,item,k)  

    #耦合特征
    dict_data=coupled_feature(data,dict_data)

    features = pd.DataFrame(dict_data, index=[0])
    return features

# 多进程调用单文件处理函数，并整合到一起
def get_together(cpu, listp,istest,func):

    if istest :
            train_p_list=[1]
            rst = []
            pool = Pool(cpu)
            for e in listp:
                for train_p in train_p_list:
                    rst.append(pool.apply_async(func, args=(e,train_p,)))
            pool.close()
            pool.join()
            
            # print(rst[0])
            
            rst = [i.get() for i in rst]
            
            # print(rst[0]) 
            
            tv_features=rst[0]
            for i in rst[1:]:
                tv_features = pd.concat([tv_features, i], axis=0)
            cols=tv_features.columns.tolist()
            
            try:      
                for col in [idx,ycol]:
                    cols.remove(col)
                cols=[idx]+cols+[ycol]
            except:
                cols=[idx]+cols+[ycol]
                
            tv_features[idx]=tv_features[idx].apply(lambda x:x[:-1])
            tv_features=tv_features.reindex(columns=cols)
    else:   
        train_p_list=np.arange(0.01,1,0.02)    # [0.45,0.55,0.63,0.75,0.85]  #=list(np.arange(0.05,1,0.05))
        rst = []
        pool = Pool(cpu)
        for e in listp:
            for train_p in train_p_list:
                # print train_p
                rst.append(pool.apply_async(func, args=(e,train_p, )))
        pool.close()
        pool.join()
        # print(rst)
        
        f_list = []
        
        for i in tqdm(rst):
            f_list.append(i.get())
        # rst = [i.get() for i in tqdm(rst)]
        rst = f_list
        
        
        tv_features=rst[0]
        for i in rst[1:]:
            tv_features = pd.concat([tv_features, i], axis=0)
        cols=tv_features.columns.tolist()
        
        try:      
            for col in [idx,ycol]:
                cols.remove(col)
            cols=[idx]+cols+[ycol]
        except:
            cols=[idx]+cols+[ycol]
            
        tv_features=tv_features.reindex(columns=cols)

    return tv_features

#评价指标
def compute_loss(target, predict):
    temp = np.log(abs(target + 1)) - np.log(abs(predict + 1))
    res = np.sqrt(np.dot(temp, temp) / len(temp))
    return res

#lgb
def lgb_cv(train, params, fit_params,feature_names, nfold, seed,test):
    train_pred = pd.DataFrame({
        'true': train[ycol],
        'pred': np.zeros(len(train))})
    test_pred = pd.DataFrame({idx: test[idx], ycol: np.zeros(len(test))},columns=[idx,ycol])
    kfolder = KFold(n_splits=nfold, shuffle=True, random_state=seed)
    for fold_id, (trn_idx, val_idx) in enumerate(kfolder.split(train)):
        print('\nFold_{fold_id} Training ================================\n'.format(fold_id = fold_id))
        lgb_trn = lgb.Dataset(
            data=train.iloc[trn_idx][feature_names],
            label=train.iloc[trn_idx][ycol],
            feature_name=feature_names)
        lgb_val = lgb.Dataset(
            data=train.iloc[val_idx][feature_names],
            label=train.iloc[val_idx][ycol],
            feature_name=feature_names)
        lgb_reg = lgb.train(params=params, train_set=lgb_trn,
                            num_boost_round = fit_params['num_boost_round'], verbose_eval = fit_params['verbose_eval'],
                            early_stopping_rounds = fit_params['early_stopping_rounds'], valid_sets=[lgb_trn, lgb_val])
        val_pred = lgb_reg.predict(
            train.iloc[val_idx][feature_names],
            num_iteration=lgb_reg.best_iteration)
        
        train_pred.loc[val_idx, 'pred'] = val_pred
        test_pred[ycol] += (np.exp(lgb_reg.predict(test[feature_names]))-1) 
    test_pred[ycol] = test_pred[ycol] / nfold
    score = compute_loss(pd.Series(np.exp(train_pred['true']) - 1).apply(max, args=(0,))
                         ,pd.Series(np.exp(train_pred['pred']) - 1).apply(max, args=(0,)))
    print('\nCV LOSS:', score)
    return test_pred


# from tqdm import tqdm
# import time
# for i in tqdm(range(1,100,1)):
#     time.sleep(1)
#     print i
idx='train_file_name'
ycol='rest_life'

# ====== lgb ======
params_lgb = {'num_leaves': 250, 
              'max_depth':5, 
              'learning_rate': 0.01,
              'objective': 'regression', 
              'boosting': 'gbdt',
              'verbosity': -1}

fit_params_lgb = {'num_boost_round': 800, 
                  'verbose_eval':200,
                  'early_stopping_rounds': 30}

# 执行主进程
if __name__ == '__main__':
    import time
    start = time.time()
    
    train_list = getfilelist('train', [])
    test_list = getfilelist('test1', [])
    
    n=4
    func=process_sample_single
    train=get_together(n,train_list,False,func)
    test =get_together(n,test_list,True,func)
    print("done.", time.time()-start)


    train.to_csv('train_total_features.csv', index=False)
    test.to_csv('test_total_features.csv', index=False)
    train.head()
    test.head()

In [475]:
for i in train.columns:
    print i

In [26]:
train.sample(frac=0.2, random_state=10).head()

Unnamed: 0,train_file_name,current_life,device,压力信号1与压力信号2乘积的均值,压力信号1与压力信号2乘积的标准差,压力信号1与告警信号1乘积的均值,压力信号1与告警信号1乘积的标准差,压力信号1与开关1信号乘积的均值,压力信号1与开关1信号乘积的标准差,压力信号1与开关2信号乘积的均值,...,转速信号2与流量信号乘积的标准差,转速信号2与温度信号乘积的均值,转速信号2与温度信号乘积的标准差,转速信号2与电流信号乘积的均值,转速信号2与电流信号乘积的标准差,转速信号2信号2极低转速段均值,转速信号2信号2极低转速段标准差,转速信号2信号2高转速段均值,转速信号2信号2高转速段标准差,rest_life
0,5767e300dd3a929fb6c5.csv0.7299999999999999,7.523211,S51d,38039.019182,24750.398619,0.201669,5.448333,63.420658,88.238148,0.0,...,927464.052368,1099919.0,433151.435056,46004760.0,19116850.0,105.444801,181.793188,425951200.0,148142400.0,6.530148
0,941021344dd1961e0974.csv0.35,6.388141,Saa3,56343.237409,34008.513838,0.060518,1.880349,116.511651,113.73129,0.0,...,808736.946494,1057191.0,366816.078649,20037500.0,10851800.0,925.5798,845.516734,369406300.0,108200900.0,7.008279
0,fa5979704821986a1f74.csv0.7099999999999999,7.090702,Saa3,59537.698294,43840.422166,1.052775,14.697357,118.338423,137.932076,0.0,...,717292.956722,923358.2,289639.181064,22136180.0,8427127.0,63.986891,47.706328,317576800.0,96288740.0,6.232939
0,620b71882ffe5d7a3142.csv0.8299999999999998,8.670172,S26a,50001.777266,31648.720417,1.540497,24.446939,108.820002,110.318677,0.0,...,653665.161033,1011937.0,507231.470213,19117930.0,10429810.0,42.942266,163.648809,312404700.0,85739350.0,7.095272
0,f473ef8589e0d5a74d00.csv0.8699999999999999,8.909674,S26a,77151.662561,47615.165142,0.271248,6.153255,201.958188,154.605073,0.0,...,662948.421468,1255727.0,447130.422801,23232170.0,7267486.0,8.703783,17.474922,401086000.0,97946020.0,7.010763


In [532]:
def get_similar_train_data(test, train, frac_start, frac_end, seed):
    
    shuffle_train = shuffle(train[train['train_file_name'].apply(lambda x: (float(x.split('.csv')[1])) >= 0.35 and (float(x.split('.csv')[1]) <= 0.85))], random_state = seed)
    sub_train = shuffle_train.iloc[int(frac_start*len(shuffle_train)):int((frac_end)*len(shuffle_train)), :]
    
    result_list = []
    
    for device in ['S100', 'S26a', 'Saa3', 'S51d', 'S508']:
        used_train_device_list = []
        used_train_device_percentile_list = []
        merged_df_list = []
        test_device = test.query('device=="{device}"'.format(device=device))
        train_device = sub_train.query('device=="{device}"'.format(device=device))
        
        left = test_device[[u'train_file_name',u'current_life']] ### .sort_values(u'current_life')
        left.columns = ['test_file_name','current_life']
        right = train_device[[u'train_file_name',u'current_life']].sort_values(u'current_life')
        right.columns=  ['train_file_name','train_current_life']
        
        index_col = right['train_file_name'].apply(lambda x: x.split('.csv')[0])
        
        for i in range(len(left)):
            tmp_left = left.iloc[i:i+1,:]
            tmp_right = right[~index_col.isin(used_train_device_list)]
            merged_df = pd.merge_asof(left = tmp_left, right = tmp_right, direction='nearest', left_on = u'current_life',right_on = u'train_current_life' )
            
            ## 如果匹配差异大，放宽多样性限制
            if abs(list(merged_df['current_life'])[0] - list(merged_df['train_current_life'])[0]) < 1:
                tmp_right = right[~right['train_file_name'].isin(used_train_device_percentile_list)]
                merged_df = pd.merge_asof(left = tmp_left, right = tmp_right, direction='nearest', left_on = u'current_life',right_on = u'train_current_life' )
            
            fname = list(merged_df['train_file_name'])[0]
            try:
                used_train_device_list.append(list(merged_df['train_file_name'])[0].split('.csv')[0])
                used_train_device_percentile_list.append(list(merged_df['train_file_name'])[0])
                merged_df_list.append(merged_df.copy())
            except:
                # 如果没匹配到，放宽多样性限制
                                
                tmp_right = right[~right['train_file_name'].isin(used_train_device_percentile_list)]
                merged_df = pd.merge_asof(left = tmp_left, right = tmp_right, direction='nearest', left_on = u'current_life',right_on = u'train_current_life' )
                ## print merged_df
                merged_df_list.append(merged_df.copy())

        result_list.append(pd.concat(merged_df_list))
    
    return pd.concat(result_list).drop_duplicates()
        

In [533]:
before_merge_list = []

for i in tqdm(range(50)):
    for frac in (0.0, 0.5):
        train_0807 = get_similar_train_data(test, train, frac, frac+0.5, i)
        train_files_df = train_0807[['train_file_name']]
        train_files_df.columns = ['train_files_subset']
        
        new_train = pd.merge(train_files_df, train, left_on = 'train_files_subset', right_on = 'train_file_name',how='left')
        new_train = new_train.drop(['train_files_subset'], axis=1)

        train_test=pd.concat([new_train,test],join='outer',axis=0).reset_index(drop=True)
        train_test=pd.get_dummies(train_test,columns=['device'])

        nfold = 5
        seed = 4096

        column_names = train_test.columns.values.tolist()
        special_column_names = ['device_S100','device_S26a','device_S508','device_S51d','device_Saa3','开关1_sum','开关2_sum','告警1_sum']
        special_column_names = [idx] + ['current_life'] + special_column_names + [ycol]

        for item in special_column_names:
            column_names.remove(item)

        train_test.fillna(0,inplace=True)
        
        for item in column_names:
            std_temp = train_test[item].std()

            if std_temp <= 1:
                train_test[item] = np.exp(train_test[item])
                std_temp2 = train_test[item].std()

                #check the standard deviation again
                if std_temp2 < 1:
                    del train_test[item]

            elif std_temp > 10:
                train_test[item] = np.log(train_test[item] + 1)  
                
        feature_name=list(filter(lambda x:x not in[idx,ycol],train_test.columns))

        sub = lgb_cv(train_test.iloc[:new_train.shape[0]] ,params_lgb, fit_params_lgb, 
                    feature_name, nfold,seed,train_test.iloc[new_train.shape[0]:])
        

        before_merge_list.append(sub)
    
sub_result_0807v1 = pd.concat(before_merge_list).groupby('train_file_name').mean().reset_index()
## LOSS 0.64
sub_result_0807v1.to_csv('resample_0807_sub_distinct_magic_number3585_100model.csv',index=False)

### ----以下可忽略

In [515]:
pd.concat(before_merge_list).groupby('train_file_name').mean().mean()

rest_life    1275.12786
dtype: float64

In [527]:
pd.concat(before_merge_list).groupby('train_file_name').mean().mean()

rest_life    1400.339959
dtype: float64

In [535]:
pd.concat(before_merge_list).groupby('train_file_name').mean().mean()

rest_life    1587.950391
dtype: float64

In [528]:
len(pd.concat(before_merge_list))

88900

In [534]:
sub_result_0807v3 = pd.concat(before_merge_list).groupby('train_file_name').mean().reset_index()
## LOSS 0.81
sub_result_0807v3.to_csv('resample_0807_sub_distinct_magic_number1585_100model.csv',index=False)

In [529]:
sub_result_0807v2 = pd.concat(before_merge_list).groupby('train_file_name').mean().reset_index()
## LOSS 0.71
sub_result_0807v2.to_csv('resample_0807_sub_distinct_magic_number2585_100model.csv',index=False)

In [520]:
# pd.concat(before_merge_list).groupby('train_file_name').std().describe()

In [521]:
sub_result_0807v1 = pd.concat(before_merge_list).groupby('train_file_name').mean().reset_index()

In [522]:
## LOSS 0.64
sub_result_0807v1.to_csv('resample_0807_sub_distinct_magic_number3585_100model.csv',index=False)

In [506]:
for i in tqdm(range(10)):
    time.sleep(1)

In [496]:
train_0806 = get_similar_train_data(test, train, 0.5, 1, 666) ## ['train_file_name'].describe()

In [498]:
# train_0806['train_file_name']

In [503]:
train_0806[['current_life','train_current_life']].head()

Unnamed: 0,current_life,train_current_life
0,6.116444,6.115892
0,8.15823,8.1578
0,7.102499,7.106196
0,8.345693,8.344921
0,5.894403,5.890262


In [500]:
train_0806['train_file_name'].describe()

count                                             889
unique                                            889
top       fa5979704821986a1f74.csv0.44999999999999996
freq                                                1
Name: train_file_name, dtype: object

In [501]:
train_files_df = train_0806[['train_file_name']]
train_files_df.columns = ['train_files_subset']

In [502]:
train_0806.mean()

current_life          7.264929
train_current_life    7.267529
dtype: float64

In [466]:
# train_files_df = concat_df[['train_file_name']]
# train_files_df.columns = ['train_files_subset']

In [467]:
# concat_df.mean()

In [468]:
new_train = pd.merge(train_files_df, train, left_on = 'train_files_subset', right_on = 'train_file_name',how='left')

In [469]:
new_train = new_train.drop(['train_files_subset'], axis=1)

In [470]:
train_test=pd.concat([new_train,test],join='outer',axis=0).reset_index(drop=True)
train_test=pd.get_dummies(train_test,columns=['device'])
#    feature_name=list(filter(lambda x:x not in[idx,ycol],train_test.columns))

nfold = 5
seed = 4096

column_names = train_test.columns.values.tolist()
special_column_names = ['device_S100','device_S26a','device_S508','device_S51d','device_Saa3','开关1_sum','开关2_sum','告警1_sum']
special_column_names = [idx] + ['current_life'] + special_column_names + [ycol]

for item in special_column_names:
    column_names.remove(item)

train_test.fillna(0,inplace=True)



In [471]:
for item in column_names:
    std_temp = train_test[item].std()

    if std_temp <= 1:
        train_test[item] = np.exp(train_test[item])
        std_temp2 = train_test[item].std()

        #check the standard deviation again
        if std_temp2 < 1:
            del train_test[item]

    elif std_temp > 10:
        train_test[item] = np.log(train_test[item] + 1)

#    column_diff_names = ['累积量参数16阶差分均值','累积量参数16阶差分标准差','累积量参数26阶差分均值','累积量参数26阶差分标准差']
#    
#    for item in column_diff_names:
#        
#        del train_test[item]


In [472]:
import sys
reload(sys)
sys.setdefaultencoding('utf8')


# ====== lgb ======
params_lgb = {'num_leaves': 250, 
              'max_depth':5, 
              'learning_rate': 0.01,
              'objective': 'regression', 
              'boosting': 'gbdt',
              'verbosity': -1}

fit_params_lgb = {'num_boost_round': 800, 
                  'verbose_eval':200,
                  'early_stopping_rounds': 10}

In [473]:
feature_name=list(filter(lambda x:x not in[idx,ycol],train_test.columns))

sub= lgb_cv(train_test.iloc[:new_train.shape[0]] ,params_lgb, fit_params_lgb, 
            feature_name, nfold,seed,train_test.iloc[new_train.shape[0]:])



In [None]:
sub7 = 

In [456]:
sub6 = sub.copy()

In [440]:
sub5 = sub.copy()

In [459]:
sub0806_v3 = sub5.copy()
sub0806_v3['rest_life'] = (sub5['rest_life'] + sub6['rest_life']) / 2
sub0806_v3.to_csv('resample_0806_sub_distinct_magic_number_35_85.csv',index=False)

In [412]:
sub4 = sub.copy()

In [398]:
sub3 = sub.copy()

In [415]:
sub0806_v2 = sub3.copy()
sub0806_v2['rest_life'] = (sub3['rest_life'] + sub4['rest_life']) / 2
sub0806_v2.to_csv('resample_0806_sub_distinct_no_magic_number.csv',index=False)

In [358]:
sub2 = sub.copy()

In [345]:
sub1 = sub.copy()

In [367]:
sub0806_v1 = sub1.copy()
sub0806_v1['rest_life'] = (sub1['rest_life'] + sub2['rest_life']) / 2
sub0806_v1.to_csv('resample_0806_sub_distinct.csv',index=False)

In [365]:
sub0806_v1['rest_life'] = sub0806_v1['rest_life'] / 2

In [366]:
sub0806_v1.to_csv('resample_0806_sub_distinct.csv',index=False)

In [359]:
sub2.head()

Unnamed: 0,train_file_name,rest_life
889,13df215673fe44d8dae5.csv,283.16865
890,8b3086c25931ade73b97.csv,3000.80974
891,fc31ec603291154b51bc.csv,362.127285
892,c00f30bb30fd5348984d.csv,1646.427478
893,8a5426b1c63a6cf107c4.csv,205.29993


In [360]:
sub1.head()

Unnamed: 0,train_file_name,rest_life
889,13df215673fe44d8dae5.csv,368.879807
890,8b3086c25931ade73b97.csv,2586.860969
891,fc31ec603291154b51bc.csv,419.896425
892,c00f30bb30fd5348984d.csv,1434.700117
893,8a5426b1c63a6cf107c4.csv,280.516224


In [None]:
sub.to_csv('resample_0804_sub_round_01_800_trunc_upper_v4.csv',index=False)
print("process(es) done.", time.time()-start)

In [158]:
import sys
reload(sys)
sys.setdefaultencoding('utf8')


# ====== lgb ======
params_lgb = {'num_leaves': 250, 
              'max_depth':5, 
              'learning_rate': 0.01,
              'objective': 'regression', 
              'boosting': 'gbdt',
              'verbosity': -1}

fit_params_lgb = {'num_boost_round': 900, 
                  'verbose_eval':200,
                  'early_stopping_rounds': 30}

In [159]:
feature_name=list(filter(lambda x:x not in[idx,ycol],train_test.columns))

sub= lgb_cv(train_test.iloc[:new_train.shape[0]] ,params_lgb, fit_params_lgb, 
            feature_name, nfold,seed,train_test.iloc[new_train.shape[0]:])

sub.to_csv('resample_0804_sub_round_01_900_v2.csv',index=False)
print("process(es) done.", time.time()-start)

In [252]:
(np.exp(new_train['rest_life']-1)).describe()

count    8890.000000
mean      464.371363
std       505.579956
min        15.542906
25%       113.582777
50%       298.442197
75%       657.308592
max      5166.498872
Name: rest_life, dtype: float64

In [253]:
sub.describe()

Unnamed: 0,rest_life
count,889.0
mean,1075.91242
std,866.226476
min,97.912072
25%,338.623033
50%,869.317808
75%,1662.95233
max,7128.778048


In [138]:
# for i in feature_name:
#     print type(i)

In [None]:
feature_name=list(filter(lambda x:x not in[idx,ycol],train_test.columns))

sub= lgb_cv(train_test.iloc[:train.shape[0]] ,params_lgb, fit_params_lgb, 
            feature_name, nfold,seed,train_test.iloc[train.shape[0]:])

sub.to_csv('baseline_sub8.csv',index=False)
print("process(es) done.", time.time()-start)


import matplotlib.pyplot as plt
import matplotlib

"""
绘制直方图
data:必选参数，绘图数据
bins:直方图的长条形数目，可选项，默认为10
normed:是否将得到的直方图向量归一化，可选项，默认为0，代表不归一化，显示频数。normed=1，表示归一化，显示频率。
facecolor:长条形的颜色
edgecolor:长条形边框的颜色
alpha:透明度
"""

account = 0

for item in feature_name:
    plt.subplot(11,11,1+account)
    plt.title('Feature_name:{0}'.format(item))
    plt.hist(train_test[item], bins=20, normed=0, facecolor="blue", edgecolor="black", alpha=0.7)
    account += 1





In [218]:
new_train['train_file_name'].apply(lambda x: float(x.split('.csv')[1])) >= 0.45

0       False
1        True
2       False
3       False
4       False
5        True
6       False
7       False
8       False
9       False
10       True
11       True
12       True
13       True
14       True
15      False
16       True
17      False
18       True
19       True
20      False
21       True
22      False
23       True
24      False
25       True
26      False
27      False
28      False
29      False
        ...  
8860     True
8861     True
8862     True
8863     True
8864     True
8865     True
8866     True
8867     True
8868     True
8869     True
8870     True
8871     True
8872     True
8873     True
8874     True
8875     True
8876     True
8877     True
8878     True
8879    False
8880    False
8881     True
8882     True
8883     True
8884     True
8885     True
8886     True
8887     True
8888     True
8889     True
Name: train_file_name, Length: 8890, dtype: bool