In [1]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import config
import re
from numba import jit
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import log_loss,roc_curve
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from itertools import chain, combinations

In [2]:
def get_cosine(vec1, vec2):
    vec1=Counter(vec1)
    vec2=Counter(vec2)
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator
            
def timestamp_datetime(value):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(value))

def time_feat(df,featList,featName):
    df[featName] = df.groupby(featList)['context_timestamp'].rank(method='first')   
    return df

def powerset(iterable):
    s = list(iterable)
    return list(chain.from_iterable(combinations(s, r) for r in range(len(s)+1)))

def ks_metric(true,score):
    fpr, tpr, thresholds = roc_curve(true,score)
    ks = max(tpr-fpr)
    return ks 

In [3]:




def process(df):
    df['time'] = df.context_timestamp.apply(timestamp_datetime)
    df['day'] = df.time.apply(lambda x: int(x[8:10]))
    df['hour'] = df.time.apply(lambda x: int(x[11:13]))
    '''for lst in timeFeatList:
        df = time_feat(df,lst,'_'.join(lst))'''
    df['item_property_list'] = df['item_property_list'].apply(lambda x:';'.join(sorted(set(str(x).split(';')))))
    df['predict_category_property'] = df['predict_category_property'].apply(lambda x:';'.join(sorted(set(str(x).split(';')))))
    df['len_item_property_list'] = df['item_property_list'].apply(lambda x: len(str(x).split(';')))
    df['len_predict_category_property'] = df['predict_category_property'].apply(lambda x: len(str(x).split(';')))
    lbl = LabelEncoder()
    for i in range(1,3):
        df['item_category_list_bin%d'%i] = lbl.fit_transform(df['item_category_list'].apply(lambda x: x.split(';')[i] if len(x.split(';'))>i else ''))
    for i in range(10):
        df['predict_category_property%d'%i] = lbl.fit_transform(df['predict_category_property'].apply(lambda x: x.split(';')[i] if len(x.split(';'))>i else ''))
    
    #df["missing_feat"] = np.sum((df == -1).values, axis=1)
    return df

def labelencoder(df):
    lbl = LabelEncoder()
    for var in ['user_id','item_id','shop_id','item_brand_id','item_city_id']:
        df[var] = lbl.fit_transform(df[var])
    return df
        

def text_cosine(df):
    df['tmp_cate'] = df['item_category_list'].apply(lambda x: x.split(';')[2] if len(x.split(';'))>2 else x.split(';')[1])
    df['cate_predict_chk']=list(map(lambda x,y: 1 if x in y else 0 , df['tmp_cate'],df['predict_category_property']))
    del df['tmp_cate']
    
    df['tmp_set_predict_property'] =df['predict_category_property'].apply(lambda x: set(re.split('[:;]',x)[1::2]))
    df['tmp_set_item_property_list'] =df['item_property_list'].apply(lambda x: set(re.split('[;]',x)))
    df['property_join_cnt'] = df[['tmp_set_predict_property','tmp_set_item_property_list']].apply(lambda x: len(x[0]&x[1])*1.0/len(x[0]|x[1]),axis=1)
    df['property_gap1_cnt'] = df[['tmp_set_predict_property','tmp_set_item_property_list']].apply(lambda x: len(x[0]-x[1])*1.0/len(x[0]|x[1]),axis=1)
    df['property_gap2_cnt'] = df[['tmp_set_predict_property','tmp_set_item_property_list']].apply(lambda x: len(x[1]-x[0])*1.0/len(x[0]|x[1]),axis=1)
    del df['tmp_set_predict_property']
    del df['tmp_set_item_property_list']
    
    return df
    
    
    
def same_day_trick(df,key_var=[]):
    if not isinstance(key_var,list):
        key_var = [key_var]
    nameBase = '_'.join(key_var)
    ###当天前后的数据情况
    df[nameBase+'_before_exist'] = ((df.groupby(key_var+['day'])['context_timestamp'].rank(method='min') - 1) > 0).astype(int)
    df[nameBase+'_after_exist'] = ((df.groupby(key_var+['day'])['context_timestamp'].rank(method='min',ascending=False)- 1) > 0).astype(int)
    df[nameBase+'_sametime_exist'] = ((df.groupby(key_var+['day'])['context_timestamp'].rank(method='max') - df.groupby(key_var+['day'])['context_timestamp'].rank(method='min')) > 0).astype(int)
    #df = df.merge(df.groupby(key_var+['day'],as_index=False)['context_timestamp'].agg({nameBase+'_day_cnt':'count'}),'inner',key_var+['day'])
    return df    

def focus_one_record(df,key_var=[],time_diff=False):
    if not isinstance(key_var,list):
        key_var = [key_var]
    nameBase = '_'.join(key_var)
    ###当天前后的数据情况
    df[nameBase+'_before_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='min') - 1
    df[nameBase+'_after_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='min',ascending=False)- 1
    df[nameBase+'_sametime_cnt'] = df.groupby(key_var+['day'])['context_timestamp'].rank(method='max') - df.groupby(key_var+['day'])['context_timestamp'].rank(method='min')+1
    df = df.merge(df.groupby(key_var+['day'],as_index=False)['context_timestamp'].agg({nameBase+'_day_cnt':'count'}),'inner',key_var+['day'])
    for feat in ['_before_cnt','_after_cnt','_sametime_cnt']:
        df[nameBase+feat+'_ratio'] = df[nameBase+feat]*1.0/df[nameBase+'_day_cnt']
                
    ###前一天购买/浏览的数量
    '''dfTmp =  df.groupby(key_var+['day'],as_index=False)['is_trade'].agg({nameBase+'_preday_trade_cnt':'sum',nameBase+'_preday_cnt':'count'})
    dfTmp['day'] = dfTmp['day']+1
    df = df.merge(dfTmp,'left',key_var+['day'])
    df[nameBase+'_preday_trade_ratio'] = df[nameBase+'_preday_trade_cnt']*1.0/df[nameBase+'_preday_cnt']
    for feat in ['_preday_trade_cnt','_preday_cnt','_preday_trade_ratio']:
        df[nameBase+feat].fillna(0,inplace=True)'''
        
    if time_diff:
        ###广告展示上下间隔
        dfTmp = df[[nameBase+'_before_cnt',nameBase+'_after_cnt',nameBase+'_sametime_cnt','time']+key_var+['day']]
        dfTmp.rename(columns={'time':'new_time'},inplace=True)
        dfTmp['next_record'] = dfTmp[nameBase+'_before_cnt'] + dfTmp[nameBase+'_sametime_cnt'] + 1
        dfTmp['last_record'] = dfTmp[nameBase+'_after_cnt'] + dfTmp[nameBase+'_sametime_cnt'] + 1
        df = df.merge(dfTmp[key_var+['day','next_record','new_time']],'left',left_on = key_var+['day',nameBase+'_before_cnt'],right_on = key_var+['day','next_record'])
        df[nameBase + '_next_time_dur'] = (pd.to_datetime(df['time'])-pd.to_datetime(df['new_time'])).dt.seconds
        df[nameBase + '_next_time_dur'].fillna(999999,inplace=True)
        df.loc[df[nameBase+'_sametime_cnt']>1,nameBase + '_next_time_dur'] = 0
        del df['new_time']
        del df['next_record']

        df = df.merge(dfTmp[key_var+['day','last_record','new_time']],'left',left_on = key_var+['day',nameBase+'_after_cnt'],right_on = key_var+['day','last_record'])
        df[nameBase + '_last_time_dur'] = (pd.to_datetime(df['new_time'])-pd.to_datetime(df['time'])).dt.seconds
        df[nameBase + '_last_time_dur'].fillna(999999,inplace=True)
        df.loc[df[nameBase+'_sametime_cnt']>1,nameBase + '_last_time_dur'] = 0
        del df['new_time']
        del df['last_record']
    for feat in ['_before_cnt','_after_cnt','_sametime_cnt']:
        del df[nameBase+feat]
    return df    
    

def _offline_feat(df,key_var='user_id',stat_var=[],part_var=[],mean_var=[],train_feat_col=None):
    if not isinstance(key_var,list):
        key_var = [key_var]
    left_key = key_var.copy()
    base_name = '~'.join(key_var)
    if train_feat_col:
        key_var.append(train_feat_col[1])
        left_key.append(train_feat_col[0])
        
    
    df = df.merge(df.groupby(key_var,as_index=False)['instance_id'].agg({base_name+'_cnt':'count'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)
    df = df.merge(df.groupby(key_var,as_index=False)['is_trade'].agg({base_name+'_trade_cnt':'sum',base_name+'_trade_ratio':'mean'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)
    df[base_name+'_notrade_cnt'] = df[base_name+'_cnt']-df[base_name+'_trade_cnt']
    dfTmp = df.loc[df['is_trade']==1]
    for stat in stat_var:
        df = df.merge(df.groupby(key_var,as_index=False)[stat].agg({base_name+'_'+stat+'_min':'min',base_name+'_'+stat+'_max':'max'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)    
    for part in part_var:
        df = df.merge(df.groupby(key_var,as_index=False)[part].agg({base_name+'_'+part+'_cnt':'nunique'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)
        df = df.merge(dfTmp.groupby(key_var,as_index=False)[part].agg({base_name+'_'+part+'_trade_cnt':'nunique'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)
        df[base_name+'_'+part+'_trade_cnt'].fillna(0,inplace=True)
        df[base_name+'_'+part+'_trade_ratio'] = 1.0*df[base_name+'_'+part+'_trade_cnt']/df[base_name+'_'+part+'_cnt']
    for var in mean_var:
        df = df.merge(df.groupby(key_var+[var],as_index=False)['is_trade'].sum().groupby(key_var,as_index=False)['is_trade'].agg({base_name+'_'+var+'_avg_trade':'mean'}).rename(columns={train_feat_col[1]:train_feat_col[0]}),'left',left_key)
    return df








def map_col(df,drop=False):
    map_dict = {
        'item_price_level':[4,5,6,7,8,9],
        'item_sales_level':[4,6,9,10,11,12,13,14,16],
        'item_pv_level':[6,9,10,11,12,13,14,15,16,17,18,19,20],
        'user_age_level':[1001,1002,1003,1004,1005],
        'context_page_id':[4001,4002,4004,4006,4008,4010,4013,4016,4018],
        'shop_review_num_level':[5,9,14,15,16,17,18,20,21],
        'hour':[6,9,12,17,20],
        'user_occupation_id':{-1:2003},
        'user_star_level':{-1:3000}
    }
    for key,value in map_dict.items():
        if isinstance(value,list):
            df[key+'_mapped'] = 0
            for i in range(len(value)):
                df.loc[df[key]>value[i],key+'_mapped'] = i+1
        else:
            '''df[key+'_mapped'] = df[key]
            for key_sub,value_sub in value.items():
                df.loc[df[key]==key_sub,key+'_mapped'] = value_sub'''
            df[key+'_mapped'] = df[key].apply(lambda x:value.get(x,x))
        if drop:
            df[key] = df[key+'_mapped']
            del df[key+'_mapped']
    return df



def cross_feat_plus(df,base_list,order=2):
    if order<2:
        return df
    subset = powerset(base_list)
    subset = [i for i in subset if len(i)==order]
    for sub in subset:
        sub = list(sub)
        baseName = '~'.join(sub)+'_plus'
        df[baseName] = df[sub].sum(axis=1)
    return df


def interaction_ratio(df,base_list=[],cal_list=[],rank_list = []):
    for base_var in base_list:
        if not isinstance(base_var,list):
            base_var = [base_var]
        if not '_'.join(base_var)+'_cnt' in df.columns:
            df = df.merge(df.groupby(base_var,as_index=False)['instance_id'].agg({'_'.join(base_var)+'_cnt':'count'}),'left',base_var)
        print('ratio part')
        for cal_var in cal_list:
            if not isinstance(cal_var,list):
                cal_var = [cal_var]
            if cal_var==base_var or base_var==['cnt_rec']:
                continue
            nameBase = '_'.join(base_var)+'~'+'_'.join(cal_var)
            print(nameBase)
            df = df.merge(df.groupby(base_var+cal_var,as_index=False)['instance_id'].agg({nameBase+'_cnt':'count'}),'left',base_var+cal_var)
            df[nameBase+'_ratio'] = df[nameBase+'_cnt']*1.0/df['_'.join(base_var)+'_cnt']
            del df[nameBase+'_cnt']
        
        print('rank part')
        for rank_var in rank_list:
            if not isinstance(rank_var,list):
                rank_var = [rank_var]
            if rank_var==base_var:
                continue
            nameBase = '_'.join(base_var)+'~'+'_'.join(rank_var)
            print(nameBase)
            df[nameBase+'_rank'] = dfAll.groupby(base_var)[rank_var].rank(method='min')
            df[nameBase+'_rank_ratio'] = df[nameBase+'_rank']*1.0/df['_'.join(base_var)+'_cnt']
            del df[nameBase+'_rank']
        del df['_'.join(base_var)+'_cnt']
    return df
            


<font color=#0099ff size=5 face="黑体">读取数据</font>

In [4]:
dfTrain = pd.read_table(config.TRAIN_FILE,sep=' ')
dfTrain.drop_duplicates(inplace=True)
dfTrain.reset_index(inplace=True,drop =True)
dfTest = pd.read_table(config.TEST_FILE,sep=' ')

dfTrain = process(dfTrain)
dfTest = process(dfTest)

dfAll = pd.concat([dfTrain,dfTest],axis=0)
dfAll.reset_index(inplace=True,drop=True)
trainNum = dfTrain.shape[0]
dfAll['cnt_rec'] = 1
print(dfAll.shape)

dfAll = labelencoder(dfAll)
dfAll = text_cosine(dfAll)
dfAll.shape

(520999, 45)


(520999, 49)

<font color=#0099ff size=5 face="黑体">特征工程</font>

In [5]:
###单特征map
dfAll = map_col(dfAll,True)
print(dfAll.shape)
featBase = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS]

(520999, 49)


In [6]:
###线下特征集合
dfAll['feat_set'] = dfAll['day'] + 1
keyList = ['user_id','shop_id','item_id','hour']
partList = [
    ['item_id','shop_id'],
    ['user_id','item_id'],
    ['user_id','shop_id'],
    ['user_id','item_id','shop_id']
]
meanList = [
    ['shop_id'],
    ['item_id'],
    [],
    ['user_id','shop_id','item_id']
]
for i in range(len(keyList)):
    keyVar = keyList[i]
    partVar = partList[i]
    meanVar = meanList[i]
    statVar = []
    if isinstance(keyVar,str):
        for key,value in config.STAT_DICT.items():
            if key==keyVar:
                continue
            statVar += value
    dfAll = _offline_feat(dfAll,keyVar,statVar,partVar,meanVar,['day','feat_set'])
del dfAll['feat_set']
print(dfAll.shape)

(520999, 177)


In [7]:
###连续型变量交叉特征
conList = [
    'user_gender_id','user_age_level', 'user_star_level',
    'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'context_page_id',
    'shop_review_num_level','shop_star_level'
]
dfAll = cross_feat_plus(dfAll,conList,order=2)
dfAll = cross_feat_plus(dfAll,conList,order=3)
print(dfAll.shape)

(520999, 342)


In [8]:
###当天信息的trick
keyList = ['user_id']
#,'shop_id','item_id','item_city_id','item_brand_id'
for keyVar in keyList:
    '''timeDiff = False if keyVar=='user_id' else True
    dfAll = focus_one_record(dfAll,keyVar,timeDiff)'''
    dfAll = same_day_trick(dfAll,keyVar)
print(dfAll.shape)

(520999, 345)


In [None]:
###两两类别变量的比例/rank 顺序
'''baseList = [
    'cnt_rec',
    'user_id','user_gender_id', 'user_occupation_id','user_age_level', 'user_star_level',
    'item_id', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'item_category_list_bin1','item_category_list_bin2',
    'shop_id', 'shop_review_num_level','shop_star_level'
    
]

calList = [
    'user_id','user_gender_id', 'user_occupation_id','item_id', 'item_brand_id', 'item_city_id',
    'item_category_list_bin1','item_category_list_bin2','shop_id'
]
rankList = [
    'user_age_level', 'user_star_level','item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level','shop_review_num_level','shop_star_level'
]

dfAll = interaction_ratio(dfAll,baseList,calList,rankList)'''
dfCross = pd.read_csv('../../Data/advertisment/Cache/ratio_rank.csv')
dfAll = pd.concat([dfAll,dfCross],axis=1)

print(dfAll.shape)

In [None]:
###变量的多组合
featCorr = dfAll[featBase].corr('spearman')
for i in range(len(featBase)-1):
    for j in range(i+1,len(featBase)):
        try: tmpBound = featCorr.loc[featBase[i],featBase[j]]
        except: continue
        if abs(tmpBound)>=0.5:
            continue
        baseName = '~'.join([featBase[i],featBase[j]])+'_com'
        dfAll[baseName] = dfAll[[featBase[i],featBase[j]]].apply(lambda x: str(x[0])+'_'+str(x[1]),axis=1)
print(dfAll.shape)

#dfAll.to_csv("../../Data/advertisment/Cache/All_multi_combination.csv",index=False)

In [9]:
features = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS]

<font color=#0099ff size=5 face="黑体">拆分样本</font>

In [10]:
train_idx = dfTrain.loc[(dfTrain['day']<24)&(dfTrain['day']>18)].index
valid_idx = dfTrain.loc[dfTrain['day']==24].index
Xi_train_, y_train_ = dfAll.loc[list(train_idx),features],dfTrain.loc[train_idx,'is_trade']
Xi_valid_, y_valid_ = dfAll.loc[list(valid_idx),features],dfTrain.loc[valid_idx,'is_trade']
Xi_test_ = dfAll.loc[trainNum:,features]

<font color=#0099ff size=5 face="黑体">模型</font>

In [11]:
clf = lgb.LGBMClassifier(
    boosting_type = 'gbdt',
    num_leaves=35, 
    max_depth=8,
    n_estimators=20000,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
clf.fit(Xi_train_[features], y_train_, eval_set=[(Xi_valid_[features], y_valid_)],feature_name = features,
        categorical_feature=[],early_stopping_rounds=100)
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
y_score_ = clf.predict_proba(Xi_valid_[features],)[:, 1]

print(pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index())
print(log_loss(y_valid_, y_score_))
print(ks_metric(y_valid_, y_score_))
bstIter = clf.best_iteration_



[1]	valid_0's binary_logloss: 0.647686
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's binary_logloss: 0.606573
[3]	valid_0's binary_logloss: 0.569202
[4]	valid_0's binary_logloss: 0.535116
[5]	valid_0's binary_logloss: 0.503915
[6]	valid_0's binary_logloss: 0.475197
[7]	valid_0's binary_logloss: 0.448711
[8]	valid_0's binary_logloss: 0.424384
[9]	valid_0's binary_logloss: 0.401906
[10]	valid_0's binary_logloss: 0.381068
[11]	valid_0's binary_logloss: 0.361702
[12]	valid_0's binary_logloss: 0.343764
[13]	valid_0's binary_logloss: 0.327075
[14]	valid_0's binary_logloss: 0.311557
[15]	valid_0's binary_logloss: 0.297073
[16]	valid_0's binary_logloss: 0.283517
[17]	valid_0's binary_logloss: 0.270914
[18]	valid_0's binary_logloss: 0.259107
[19]	valid_0's binary_logloss: 0.248061
[20]	valid_0's binary_logloss: 0.237723
[21]	valid_0's binary_logloss: 0.228051
[22]	valid_0's binary_logloss: 0.218989
[23]	valid_0's binary_logloss: 0.210451
[24]	valid_0's binary_logl

[201]	valid_0's binary_logloss: 0.0806207
[202]	valid_0's binary_logloss: 0.0806132
[203]	valid_0's binary_logloss: 0.0806224
[204]	valid_0's binary_logloss: 0.0806266
[205]	valid_0's binary_logloss: 0.0806265
[206]	valid_0's binary_logloss: 0.0806274
[207]	valid_0's binary_logloss: 0.0806292
[208]	valid_0's binary_logloss: 0.080643
[209]	valid_0's binary_logloss: 0.0806307
[210]	valid_0's binary_logloss: 0.0806275
[211]	valid_0's binary_logloss: 0.0806277
[212]	valid_0's binary_logloss: 0.0806414
[213]	valid_0's binary_logloss: 0.0806425
[214]	valid_0's binary_logloss: 0.0806537
[215]	valid_0's binary_logloss: 0.0806515
[216]	valid_0's binary_logloss: 0.0806682
[217]	valid_0's binary_logloss: 0.0806753
[218]	valid_0's binary_logloss: 0.0806731
[219]	valid_0's binary_logloss: 0.0806682
[220]	valid_0's binary_logloss: 0.0806563
[221]	valid_0's binary_logloss: 0.0806531
[222]	valid_0's binary_logloss: 0.0806466
[223]	valid_0's binary_logloss: 0.0806396
[224]	valid_0's binary_logloss: 0.0

In [None]:
xx = pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index()

In [None]:
xx.loc[xx['index']=='property_gap1_cnt']

In [None]:
features = xx.loc[xx[0]>10,'index'].tolist()

In [None]:
(xx<10).sum()

In [None]:
def score_change(score,base_rate,real_rate):
    base_change = np.log(base_rate/(1-base_rate)) - np.log(real_rate/(1-real_rate))
    score_adj = np.exp(np.log(score/(1-score)) - base_change)/(np.exp(np.log(score/(1-score)) - base_change)+1)
    return score_adj

In [None]:
y_score_adj = score_change(y_score_,y_score_.mean(),y_valid_.mean())
print(log_loss(y_valid_, y_score_adj))

In [12]:
#Xi_finnal_ ,y_finnal_ = np.vstack((Xi_train_,Xi_valid_),np.hstack((y_train_,y_valid_))
Xi_finnal_ ,y_finnal_ = pd.concat([Xi_train_,Xi_valid_]), pd.concat([y_train_,y_valid_])
clf = lgb.LGBMClassifier(
    num_leaves=35, 
    max_depth=8,
    n_estimators=bstIter,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
clf.fit(Xi_finnal_, y_finnal_,
        categorical_feature=[])
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_)[:,1]
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})
#submit.to_csv('../../Submission/advertisement/gbm_trick_0330.txt', sep=" ", index=False, line_terminator='\n')



In [13]:
submit['predicted_score'].mean()

0.018304112638721327

In [None]:
y_finnal_.mean()

In [None]:
submit = pd.read_csv('../../Submission/advertisement/gbm_trick_text_417.txt',sep=" ")

In [14]:
#submit['predicted_score'] = submit['predicted_score'] - submit['predicted_score'].mean() + 0.016983086400719192
#submit.loc[submit['predicted_score']<0,'predicted_score'] =0
submit.to_csv('../../Submission/advertisement/gbm_trick_testb_418.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
submit['predicted_score'] = 0
submit.to_csv('../../Submission/advertisement/gbm_trick_testb_418.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
submit['predicted_score'] = score_change(submit['predicted_score'],y_finnal_.mean(),0.016983086400719192)
submit['predicted_score'].mean()

In [None]:
submit.to_csv('../../Submission/advertisement/gbm_trick_text_adj2_417.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
np.hstack((y_train_,y_valid_))

In [None]:
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

In [None]:
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})

In [None]:
submit['predicted_score'] = 1.0
submit.to_csv('../../Submission/advertisement/test_0330.txt', sep=" ", index=False, line_terminator='\n')