In [None]:
import gc
import math
import os
import pickle
import random
import re
import time
from collections import Counter
from itertools import chain, combinations
from math import log

import lightgbm as lgb
import numpy as np
import pandas as pd
import scipy.special as special
from numba import jit
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import log_loss,roc_curve
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler

import config

In [None]:
def get_cosine(vec1, vec2):
    """Cosine similarity of two token sequences, compared as bags of tokens.

    Each argument is an iterable of hashable tokens. It is converted to a
    ``Counter`` so repeated tokens contribute their multiplicity.

    Returns:
        0.0 when the product of the two norms is zero (e.g. an empty input),
        otherwise dot(vec1, vec2) / (|vec1| * |vec2|) as a float.
    """
    counts1 = Counter(vec1)
    counts2 = Counter(vec2)
    # Only tokens present in both bags contribute to the dot product.
    shared = counts1.keys() & counts2.keys()
    numerator = sum(counts1[tok] * counts2[tok] for tok in shared)
    norm1 = math.sqrt(sum(c ** 2 for c in counts1.values()))
    norm2 = math.sqrt(sum(c ** 2 for c in counts2.values()))
    denominator = norm1 * norm2

    if not denominator:
        return 0.0
    return float(numerator) / denominator
            
def timestamp_datetime(value):
    """Format a Unix timestamp (seconds) as 'YYYY-MM-DD HH:MM:SS' in local time."""
    local_struct = time.localtime(value)
    layout = '%Y-%m-%d %H:%M:%S'
    return time.strftime(layout, local_struct)

def time_feat(df,featList,featName):
    """Add ``featName``: the rank of 'context_timestamp' inside each ``featList`` group.

    Ties are ranked in order of appearance (``method='first'``). ``df`` is
    modified in place and also returned.
    """
    grouped_ts = df.groupby(featList)['context_timestamp']
    df[featName] = grouped_ts.rank(method='first')
    return df

def powerset(iterable):
    """Return every subset of ``iterable`` as a list of tuples, smallest subsets first."""
    items = list(iterable)
    subsets = []
    for size in range(len(items) + 1):
        subsets.extend(combinations(items, size))
    return subsets

def del_na(lst):
    """Rebuild a 'key:value;key:value' string, dropping pairs whose value is '-1'.

    Args:
        lst: flat list of strings alternating key, value, key, value, ...

    Returns:
        The surviving pairs joined as 'key:value' with ';' between them, or ''
        when ``lst`` has fewer than two elements or no pair survives.
        A trailing key without a value is ignored.
    """
    if len(lst) < 2:
        return ''
    # zip stops at the shorter slice, so an unpaired trailing key is skipped
    # instead of raising IndexError on lst[i+1].
    pairs = zip(lst[0::2], lst[1::2])
    return ';'.join(key + ':' + value for key, value in pairs if value != '-1')

def ks_metric(true,score):
    """Kolmogorov-Smirnov statistic: the largest gap between TPR and FPR along the ROC curve."""
    false_pos_rate, true_pos_rate, _ = roc_curve(true, score)
    gap = true_pos_rate - false_pos_rate
    return max(gap)

def score_change(score,base_rate,real_rate):
    """Shift ``score`` in log-odds space by logit(base_rate) - logit(real_rate).

    ``score`` may be a scalar or an array-like of probabilities; the result has
    the same shape. A score equal to ``base_rate`` maps to ``real_rate``.
    """
    def _logit(p):
        return np.log(p/(1-p))

    base_change = _logit(base_rate) - _logit(real_rate)
    shifted_odds = np.exp(_logit(score) - base_change)
    return shifted_odds/(shifted_odds+1)

In [None]:
def woe(df):
    """Information Value of a feature split into two bins: value > 0 and value == 0.

    Args:
        df: DataFrame with a numeric 'value' column and a 0/1 'target' column.

    Returns:
        sum over both bins of (event_share - nonevent_share) * ln(event_share / nonevent_share),
        where event_share is the bin's share of all target==1 rows and
        nonevent_share its share of all target==0 rows.
        A bin with no events or no non-events contributes 0 rather than an
        infinite value; 0.0 is returned when the whole frame has no events or
        no non-events.

    The weight-of-evidence term is the natural log of the share ratio. Without
    the logarithm the result is not an Information Value and can be negative.
    """
    total_events = float(df['target'].sum())
    total_nonevents = float((df['target'] == 0).sum())
    if total_events == 0 or total_nonevents == 0:
        return 0.0

    total_iv = 0.0
    for in_bin in (df['value'] > 0, df['value'] == 0):
        bin_target = df.loc[in_bin, 'target']
        event_share = float(bin_target.sum()) / total_events
        nonevent_share = float((bin_target == 0).sum()) / total_nonevents
        if event_share == 0 or nonevent_share == 0:
            # ln(0) or division by zero: skip the bin instead of returning inf/nan.
            continue
        total_iv += (event_share - nonevent_share) * float(np.log(event_share / nonevent_share))
    return total_iv
    
def avg_property(input):
    """Mean number of comma-separated items per string in ``input``; 0 for an empty sequence."""
    if not len(input):
        return 0
    item_counts = []
    for entry in input:
        item_counts.append(entry.count(',') + 1)
    return sum(item_counts)/len(item_counts)
    

In [None]:
def process(df):
    """Derive 'day' and 'hour' and normalise the two property-text columns of ``df`` in place.

    - 'day' / 'hour' are parsed from the local-time string of 'context_timestamp'.
    - 'item_property_list' has its ';'-separated entries de-duplicated and sorted.
    - 'predict_category_property' gets the same treatment, is then split on ':' and ';'
      and passed through ``del_na``, which drops pairs whose value is '-1'.

    Returns the same DataFrame object.
    """
    def _sorted_unique(raw):
        # De-duplicate the ';'-separated entries and return them in sorted order.
        return ';'.join(sorted(set(str(raw).split(';'))))

    df['time'] = df['context_timestamp'].apply(timestamp_datetime)
    # The formatted string is 'YYYY-MM-DD HH:MM:SS'; slice out the day and hour fields.
    df['day'] = df['time'].apply(lambda stamp: int(stamp[8:10]))
    df['hour'] = df['time'].apply(lambda stamp: int(stamp[11:13]))

    df['item_property_list'] = df['item_property_list'].apply(_sorted_unique)
    predicted = df['predict_category_property'].apply(_sorted_unique)
    predicted = predicted.apply(lambda text: list(re.split('[:;]', text)))
    df['predict_category_property'] = predicted.map(del_na)

    # The helper column is only needed to derive day and hour.
    del df['time']
    return df

def labelencoder(df):
    """Integer-encode the ID columns of ``df`` in place and return it.

    The columns 'user_id', 'item_id', 'shop_id', 'item_brand_id' and
    'item_city_id' are each replaced by their ``LabelEncoder`` codes.
    A column that is absent from ``df`` is skipped with a printed notice.
    Any other failure (for example un-orderable mixed types) now propagates
    instead of being reported as a missing column.
    """
    for var in ['user_id','item_id','shop_id','item_brand_id','item_city_id']:
        if var not in df.columns:
            print('column %s is not exist'%var)
            continue
        # A fresh encoder per column; each column is fitted independently.
        df[var] = LabelEncoder().fit_transform(df[var])
    return df

def _deepest_category(category_list):
    """Return the third ';'-separated category when present, else the deepest available one."""
    cats = category_list.split(';')
    # len >= 3 -> cats[2]; len == 2 -> cats[1]; a single-level list falls back to cats[0]
    # instead of raising IndexError.
    return cats[min(len(cats) - 1, 2)]


def _common_property_count(cate, predict_cates, predict_props, item_props):
    """Count item properties that also appear in the properties predicted for ``cate``."""
    if cate not in predict_cates:
        return 0
    predicted = set(predict_props[predict_cates.index(cate)].split(','))
    return len(item_props & predicted)


def text_feat(df):
    """Add text-matching features between an item and the predicted category/properties.

    Reads 'item_category_list', 'predict_category_property' and
    'item_property_list'; appends, in this order:

    - 'cate_predict_chk': 1 if the item's category string occurs in the prediction text.
    - 'cate_predict_common_property': number of item properties shared with the
      properties predicted for the item's category (0 if that category was not predicted).
    - 'property_predict_avg_cnt': mean number of properties per predicted category.
    - 'property_join_cnt', 'property_gap1_cnt', 'property_gap2_cnt': sizes of the
      intersection, predicted-only and item-only property sets, each divided by
      the size of their union.

    ``df`` is modified in place and returned. Rows are processed with ``zip``
    over the columns, which avoids positional ``row[0]`` / ``row[1]`` access on
    label-indexed rows.
    """
    item_cate = [_deepest_category(cats) for cats in df['item_category_list']]
    predict_raw = df['predict_category_property']
    df['cate_predict_chk'] = [1 if cate in str(pred) else 0
                              for cate, pred in zip(item_cate, predict_raw)]

    # The prediction text alternates category, properties, category, properties, ...
    tokens = [re.split('[:;]', str(pred)) for pred in predict_raw]
    predict_cates = [tok[::2] for tok in tokens]
    predict_props = [tok[1::2] for tok in tokens]
    item_props = [set(re.split('[;]', props)) for props in df['item_property_list']]

    df['cate_predict_common_property'] = [
        _common_property_count(cate, cates, props, owned)
        for cate, cates, props, owned in zip(item_cate, predict_cates, predict_props, item_props)
    ]
    df['property_predict_avg_cnt'] = [avg_property(props) for props in predict_props]

    join_cnt, gap1_cnt, gap2_cnt = [], [], []
    for props, owned in zip(predict_props, item_props):
        predicted = set(','.join(props).split(','))
        # Both sets hold at least one element (str.split never returns an empty
        # list), so the union is never empty.
        union_size = len(predicted | owned)
        join_cnt.append(len(predicted & owned) * 1.0 / union_size)
        gap1_cnt.append(len(predicted - owned) * 1.0 / union_size)
        gap2_cnt.append(len(owned - predicted) * 1.0 / union_size)
    df['property_join_cnt'] = join_cnt
    df['property_gap1_cnt'] = gap1_cnt
    df['property_gap2_cnt'] = gap2_cnt

    return df



<font color=#0099ff size=5 face="黑体">读取数据</font>

In [None]:
# Explicit dtypes applied when the cached feature file is read back.
featureDtypes = {'cnt_rec': 'int8',
 'context_id': 'int64',
 'context_page_id': 'int16',
 'context_timestamp': 'int32',
 'day': 'int8',
 'hour': 'int8',
 'instance_id': 'int64',
 'is_trade': 'float32',
 'item_brand_id': 'int16',
 'item_category_list_bin1': 'int8',
 'item_category_list_bin2': 'int8',
 'item_city_id': 'int16',
 'item_collected_level': 'int8',
 'item_id': 'int32',
 'item_price_level': 'int8',
 'item_pv_level': 'int8',
 'item_sales_level': 'int8',
 'len_item_property_list': 'int8',
 'len_predict_category_property': 'int8',
 'min': 'int8',
 'shop_id': 'int16',
 'shop_review_num_level': 'int8',
 'shop_review_positive_rate': 'float32',
 'shop_score_delivery': 'float32',
 'shop_score_description': 'float32',
 'shop_score_service': 'float32',
 'shop_star_level': 'int16',
 'user_age_level': 'int16',
 'user_gender_id': 'int8',
 'user_id': 'int32',
 'user_occupation_id': 'int16',
 'user_star_level': 'int16'}

if not os.path.exists(config.FEATURE_TEXT_SET):
    # First run: build the tables from the raw files and cache them.
    dfTrain = pd.read_table(config.TRAIN_FILE,sep=' ',usecols=[0,2,3,16,18,26])
    dfTrain.drop_duplicates(inplace=True)
    dfTrain.reset_index(inplace=True,drop =True)
    dfTrain = process(dfTrain)
    dfTest = pd.read_table(config.TEST_FILE,sep=' ',usecols=[0,2,3,16,18])
    dfTest = process(dfTest)
    # Re-label day 31 as day 0.
    dfTrain.loc[dfTrain['day'] == 31,'day'] = 0
    dfAll = pd.concat([dfTrain,dfTest],axis=0)
    dfAll.reset_index(inplace=True,drop=True)
    del dfTrain
    del dfTest
    dfAll['cnt_rec'] = 1
    dfAll = labelencoder(dfAll)
    # Reset the index so dfSet is labelled 0..n-1 exactly as in the cached branch
    # below. Later cells use these labels as row positions
    # (e.g. propertyList[train_idx,:] and dfSet.loc[:1077174,...]).
    dfSet = dfAll.loc[dfAll['day']==7].reset_index(drop=True)
    dfBase = dfAll.loc[dfAll['day']!=7].reset_index(drop=True)
    dfSet.to_csv(config.FEATURE_TEXT_SET,sep=' ',index=False, line_terminator='\n')
    dfBase.to_csv(config.FEATURE_TEXT_BASE,sep=' ',index=False, line_terminator='\n')
    del dfAll
else:
    # Cached run: only the day-7 table is loaded; dfBase stays undefined on this branch.
    dfSet = pd.read_table(config.FEATURE_TEXT_SET,sep=' ',dtype=featureDtypes)
    #dfBase = pd.read_table(config.FEATURE_TEXT_BASE,sep=' ',dtype=featureDtypes)

    
'''for var in dfSet:
    if var not in ['shop_review_positive_rate','shop_score_delivery','shop_score_description','shop_score_service']:
        print(var)
        dfSet[var] = pd.to_numeric(dfSet[var],downcast='signed')
        
for var in dfSet:
    if var in ['shop_review_positive_rate','shop_score_delivery','shop_score_description','shop_score_service']:
        dfSet[var] = pd.to_numeric(dfSet[var],downcast='float')'''

In [None]:
# Directory prefix for the cache files (CSV / pickle) read and written by the cells below.
dataRootDir = '../../Data/advertisment/Cache/'

In [None]:
# Add the text-matching feature columns to dfSet.
dfSet = text_feat(dfSet)
# Keep the columns from position 9 onward for the cache file.
# NOTE(review): 9 presumably equals the number of columns dfSet had before text_feat ran — confirm.
toSave = dfSet.iloc[:,9:]
toSave.to_csv(dataRootDir + 'text_base.csv',index=False)

In [None]:
# Bag-of-words count matrix over 'item_property_list': one row per dfSet row, one column per token.
cnt = CountVectorizer()
propertyList = cnt.fit_transform(dfSet['item_property_list'])

In [None]:
# cnt.vocabulary_ maps each term to its column index in the count matrix.
# Order the terms by that index so that voc[i] names column i of propertyList,
# independent of the dictionary's iteration order.
voc = np.array(sorted(cnt.vocabulary_, key=cnt.vocabulary_.get), dtype=object)

In [None]:
cutoff = 50
# Keep only terms that occur in more than `cutoff` of the first 1077175 rows.
# The mask is computed once, BEFORE propertyList is filtered, so the same
# columns are selected from both propertyList and voc. Recomputing it from the
# filtered matrix gives a mask whose length no longer matches voc.
keepMask = propertyList[:1077175,:].getnnz(axis=0) > cutoff
propertyList = propertyList[:, keepMask]
voc = voc[keepMask]
propertyList.shape

In [None]:
# Score every retained term with woe() on the first nRows rows, checkpointing
# the running result to disk after each term.
nRows = 1077175
info = {'var':[],'iv':[]}
for i, term in enumerate(voc):
    print(i)
    columnValues = propertyList[:nRows,i].toarray().reshape(nRows)
    termFrame = pd.DataFrame({'value':columnValues,'target':dfSet.loc[:nRows-1,'is_trade']})
    iv = woe(termFrame)
    info['var'].append(term)
    info['iv'].append(iv)
    with open(dataRootDir+'iv_test.pkl','wb') as f:
        pickle.dump(info,f)

In [None]:
# One row per term: columns 'var' (the term) and 'iv' (its score from the loop above).
word_imp = pd.DataFrame(info)

In [None]:
# Largest score across all terms.
word_imp['iv'].max()

In [None]:
# Row(s) of word_imp holding the largest score.
isTop = word_imp['iv'] == word_imp['iv'].max()
word_imp.loc[isTop]

In [None]:
# Terms ordered from highest to lowest score. `ascending` is passed by keyword:
# positional axis/ascending arguments are rejected by recent pandas versions.
word_imp.sort_values('iv', ascending=False)

In [None]:
# Sanity check: the first column, restricted to the first 1077175 rows, flattens to a 1-D array of that length.
np.array(propertyList[:1077175,0].todense()).reshape(1077175).shape

<font color=#0099ff size=5 face="黑体">特征工程</font>

In [None]:
# Row labels of dfSet for the training window (0 < hour < 10) and the
# validation window (10 <= hour < 12). Rows with hour == 0 fall in neither.
train_idx = dfSet.loc[(dfSet['hour']>0)&(dfSet['hour']<10)].index
valid_idx = dfSet.loc[(dfSet['hour']>=10)&(dfSet['hour']<12)].index

# Targets for each window.
y_train = dfSet.loc[train_idx,'is_trade']
y_valid = dfSet.loc[valid_idx,'is_trade']

In [None]:
# Rebuild the count matrix and keep the terms occurring in more than `cutoff`
# of the first 1077175 rows. The threshold is defined under the same name the
# filter reads, so the cell no longer depends on a variable left over from an
# earlier cell.
cutoff = 50
cnt = CountVectorizer()
propertyList = cnt.fit_transform(dfSet['item_property_list'])
propertyList = propertyList[:, propertyList[:1077175,:].getnnz(axis=0) > cutoff]

In [None]:
# Fit LightGBM on the bag-of-words matrix with early stopping on the validation window.
clf = lgb.LGBMClassifier(
    boosting_type = 'gbdt',
    num_leaves=40, 
    max_depth=8,
    n_estimators=20000,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
clf.fit(propertyList[train_idx,:], y_train, eval_set=[(propertyList[valid_idx,:],y_valid)],
        categorical_feature=[],early_stopping_rounds=100)
y_score_ = clf.predict_proba(propertyList[valid_idx,:],)[:, 1]

# Importances are indexed by column position of propertyList. The `features`
# list is only defined in a later cell and does not describe these columns.
print(pd.Series(clf.feature_importances_).sort_values(ascending=False).reset_index())
# Score against y_valid, the validation target defined with train_idx/valid_idx
# (y_valid_ does not exist until a later cell).
print(log_loss(y_valid, y_score_))
print(ks_metric(y_valid, y_score_))
bstIter = clf.best_iteration_

In [None]:
# Refit on train + validation rows with the iteration count found by early stopping.
# train_idx / valid_idx are pandas Index objects, which pd.concat does not
# accept; Index.append joins them.
all_idx = train_idx.append(valid_idx)
y_all = pd.concat([y_train,y_valid])

clf = lgb.LGBMClassifier(
    num_leaves=40, 
    max_depth=8,
    n_estimators=bstIter,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)

clf.fit(propertyList[all_idx,:], y_all,
        categorical_feature=[])

# Predicted probability of the positive class for every row of propertyList.
cnt_pred_score = clf.predict_proba(propertyList)[:,1]

In [None]:
#dataRootDir = '/data/5/data/maoli/learn/advertisement/Cache/'
# Directory prefix for the feature cache files used by the cells below.
dataRootDir = '../../Data/advertisment/Cache/'
# dfAll is a second name for the same DataFrame object as dfSet (no copy is made here).
dfAll = dfSet

In [None]:
### Single-feature map
# NOTE(review): map_col is not defined in the visible part of this notebook — confirm where it comes from.
dfAll = map_col(dfAll,True); gc.collect()
print(dfAll.shape)
# Column count at this point; later cells slice dfAll.iloc[:,featBound:] when writing cache files.
featBound = dfAll.shape[1]
#featBase = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS]

In [None]:
### Smoothed CTR (single keys)
#keyList = ['item_id']
keyList = config.CATEGORICAL_COLS

if os.path.exists(dataRootDir + 'smooth.csv'):
    dfSmooth = pd.read_csv(dataRootDir + 'smooth.csv')
    dfAll = pd.concat([dfAll,dfSmooth],axis=1)
    del dfSmooth
else:
    # Record where this stage's columns begin, so the cache file holds exactly
    # the columns this stage adds regardless of what earlier cells appended.
    stageStart = dfAll.shape[1]
    dfAll = smooth_ctr(dfAll,None,keyList); gc.collect()
    dfAll = smooth_ctr(dfAll,dfBase,keyList); gc.collect()
    toSave = dfAll.iloc[:,stageStart:]
    toSave.to_csv(dataRootDir + 'smooth.csv',index=False)

print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]

In [None]:
### Smoothed CTR (pairs of categorical keys, excluding pairs that contain user_id)
#keyList = ['item_id']
keyList = [list(i) for i in powerset(config.CATEGORICAL_COLS) if len(i)==2 and not 'user_id' in i]

if os.path.exists(dataRootDir + 'smooth_2order.csv'):
    dfSmooth = pd.read_csv(dataRootDir + 'smooth_2order.csv')
    dfAll = pd.concat([dfAll,dfSmooth],axis=1)
    del dfSmooth
else:
    # Slice from this stage's own starting column. Slicing from the global
    # featBound would also write the columns added by earlier stages, and
    # reloading the caches would then duplicate them.
    stageStart = dfAll.shape[1]
    dfAll = smooth_ctr(dfAll,None,keyList); gc.collect()
    dfAll = smooth_ctr(dfAll,dfBase,keyList); gc.collect()
    toSave = dfAll.iloc[:,stageStart:]
    toSave.to_csv(dataRootDir + 'smooth_2order.csv',index=False)

print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]

In [None]:
### Offline feature set
# keyList, partList and meanList are parallel: entry i of each belongs together.
keyList = ['user_id','shop_id','item_id','hour','item_category_list_bin1']
partList = [
    ['item_id','shop_id'],
    ['user_id','item_id'],
    ['user_id','shop_id'],
    ['user_id','item_id','shop_id'],
    ['user_id','item_id','shop_id']
]
meanList = [
    ['shop_id'],
    ['item_id'],
    [],
    ['user_id','shop_id','item_id'],
    ['user_id','shop_id','item_id']
]

if os.path.exists(dataRootDir + 'offline.csv'):
    dfOffline = pd.read_csv(dataRootDir + 'offline.csv')
    dfAll = pd.concat([dfAll,dfOffline],axis=1)
    del dfOffline
else:
    # Slice from this stage's own starting column. Slicing from the global
    # featBound would also write the columns added by earlier stages, and
    # reloading the caches would then duplicate them.
    stageStart = dfAll.shape[1]
    for keyVar, partVar, meanVar in zip(keyList, partList, meanList):
        statVar = []
        if isinstance(keyVar,str):
            # Collect the stat columns of every group except the key's own.
            for key,value in config.STAT_DICT.items():
                if key==keyVar:
                    continue
                statVar += value
        # The same-table pass (second argument None) is skipped for the 'hour' key.
        if not 'hour' in keyVar:
            dfAll = _offline_feat(dfAll,None,keyVar,statVar,partVar,meanVar); gc.collect()
        dfAll = _offline_feat(dfAll,dfBase,keyVar,statVar,partVar,meanVar); gc.collect()
    toSave = dfAll.iloc[:,stageStart:]
    toSave.to_csv(dataRootDir + 'offline.csv',index=False)

print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]


In [None]:
### Cross features of the continuous (level-type) variables
conList = [
    'user_gender_id','user_age_level', 'user_star_level',
    'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'context_page_id',
    'shop_review_num_level','shop_star_level'
]

if os.path.exists(dataRootDir + 'cross_plus.csv'):
    dfCrossPlus = pd.read_csv(dataRootDir + 'cross_plus.csv')
    dfAll = pd.concat([dfAll,dfCrossPlus],axis=1)
    del dfCrossPlus
else:
    # Slice from this stage's own starting column. Slicing from the global
    # featBound would also write the columns added by earlier stages, and
    # reloading the caches would then duplicate them.
    stageStart = dfAll.shape[1]
    dfAll = cross_feat_plus(dfAll,conList,order=2); gc.collect()
    dfAll = cross_feat_plus(dfAll,conList,order=3); gc.collect()
    toSave = dfAll.iloc[:,stageStart:]
    toSave.to_csv(dataRootDir + 'cross_plus.csv',index=False)
print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]

In [None]:
### Same-day information trick
# Keys: user_id alone, then user_id paired with every other categorical column.
keyList = ['user_id'] + [['user_id',i] for i in config.CATEGORICAL_COLS if i!='user_id']

if os.path.exists(dataRootDir + 'trick_userid.csv'):
    dfTrick = pd.read_csv(dataRootDir + 'trick_userid.csv')
    dfAll = pd.concat([dfAll,dfTrick],axis=1)
    del dfTrick
else:
    # Slice from this stage's own starting column. Slicing from the global
    # featBound would also write the columns added by earlier stages, and
    # reloading the caches would then duplicate them.
    stageStart = dfAll.shape[1]
    for keyVar in keyList:
        dfAll = same_day_trick(dfAll,keyVar); gc.collect()
    toSave = dfAll.iloc[:,stageStart:]
    toSave.to_csv(dataRootDir + 'trick_userid.csv',index=False)
print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]

In [None]:
### Pairwise ratio / rank order between categorical variables
baseList = [
    'cnt_rec',
    'user_id','user_gender_id', 'user_occupation_id','user_age_level', 'user_star_level',
    'item_id', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level',
    'item_category_list_bin1','item_category_list_bin2',
    'shop_id', 'shop_review_num_level','shop_star_level'
    
]

calList = [
    'user_id','user_gender_id', 'user_occupation_id','item_id', 'item_brand_id', 'item_city_id',
    'item_category_list_bin1','item_category_list_bin2','shop_id'
]
rankList = [
    'user_age_level', 'user_star_level','item_price_level', 'item_sales_level','item_collected_level', 'item_pv_level','shop_review_num_level','shop_star_level'
]


# Stage 1: ratios / ranks computed within dfAll itself (second argument None).
if os.path.exists(dataRootDir + 'ratio_rank.csv'):
    dfRank = pd.read_csv(dataRootDir + 'ratio_rank.csv')
    dfAll = pd.concat([dfAll,dfRank],axis=1)
    del dfRank
else:
    # Slice from this stage's own starting column. Slicing from the global
    # featBound would also write the columns added by earlier stages, and
    # reloading the caches would then duplicate them.
    stageStart = dfAll.shape[1]
    dfAll = interaction_ratio(dfAll,None,baseList,calList,rankList); gc.collect()
    toSave = dfAll.iloc[:,stageStart:]
    toSave.to_csv(dataRootDir + 'ratio_rank.csv',index=False)
print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]


# Stage 2: the same features computed against dfBase.
if os.path.exists(dataRootDir + 'ratio_rank_preday.csv'):
    dfRank = pd.read_csv(dataRootDir + 'ratio_rank_preday.csv')
    dfAll = pd.concat([dfAll,dfRank],axis=1)
    del dfRank
else:
    stageStart = dfAll.shape[1]
    dfAll = interaction_ratio(dfAll,dfBase,baseList,calList,rankList); gc.collect()
    toSave = dfAll.iloc[:,stageStart:]
    toSave.to_csv(dataRootDir + 'ratio_rank_preday.csv',index=False)
print(dfAll.shape)
#dfAll = dfAll.iloc[:,:featBound]

<font color=#0099ff size=5 face="黑体">拆分样本</font>

In [None]:
# Model inputs: every column of dfAll not listed in config.IGNORE_COLS.
features = [i for i in dfAll.columns.tolist() if not i in config.IGNORE_COLS]

# Train: 0 < hour < 10, validation: 10 <= hour < 12, test: hour >= 12.
train_idx = dfAll.loc[(dfAll['hour']<10)&(dfAll['hour']>0)].index
valid_idx = dfAll.loc[(dfAll['hour']<12)&(dfAll['hour']>9)].index
# Labels are read from dfAll, the frame the indices were built from. dfTrain is
# deleted in the data-loading cell (and never created when the cache exists).
Xi_train_, y_train_ = dfAll.loc[list(train_idx),features],dfAll.loc[train_idx,'is_trade']
Xi_valid_, y_valid_ = dfAll.loc[list(valid_idx),features],dfAll.loc[valid_idx,'is_trade']
Xi_test_ = dfAll.loc[(dfAll['hour']>=12),features]

del dfAll

<font color=#0099ff size=5 face="黑体">模型</font>

In [None]:
# Fit LightGBM on the training window; the validation window drives early
# stopping (100 rounds without improvement).
clf = lgb.LGBMClassifier(
    boosting_type = 'gbdt',
    num_leaves=40, 
    max_depth=8,
    n_estimators=20000,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
clf.fit(Xi_train_[features], y_train_, eval_set=[(Xi_valid_[features], y_valid_)],feature_name = features,
        categorical_feature=[],early_stopping_rounds=100)
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
# Positive-class probability on the validation window.
y_score_ = clf.predict_proba(Xi_valid_[features],)[:, 1]

# Feature importances (highest first), then log-loss and KS on the validation window.
print(pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index())
print(log_loss(y_valid_, y_score_))
print(ks_metric(y_valid_, y_score_))
# Iteration count chosen by early stopping; reused for the final refit.
bstIter = clf.best_iteration_

In [None]:
# Importance table: column 'index' holds the feature name, column 0 its importance.
xx = pd.Series(clf.feature_importances_, features).sort_values(ascending=False).reset_index()
# Number of features with zero importance.
(xx[0]==0).sum()

In [None]:
# Keep only the features with positive importance; this overwrites the earlier `features` list.
features = xx.loc[xx[0]>0,'index'].tolist()

In [None]:
def score_change(score,base_rate,real_rate):
    """Shift ``score`` in log-odds space by logit(base_rate) - logit(real_rate).

    Accepts a scalar or array-like of probabilities and returns the adjusted
    probabilities in the same shape.
    """
    # NOTE: a function with this name and the same formula is already defined
    # earlier in this notebook; this cell redefines it.
    prior_shift = np.log(base_rate/(1-base_rate)) - np.log(real_rate/(1-real_rate))
    adjusted_odds = np.exp(np.log(score/(1-score)) - prior_shift)
    score_adj = adjusted_odds/(adjusted_odds+1)
    return score_adj

In [None]:
del Xi_train_
del Xi_valid_

In [None]:
Xi_finnal_.shape

In [None]:
#Xi_finnal_ ,y_finnal_ = np.vstack((Xi_train_,Xi_valid_),np.hstack((y_train_,y_valid_))
Xi_finnal_ ,y_finnal_ = pd.concat([Xi_train_,Xi_valid_]), pd.concat([y_train_,y_valid_])
del Xi_train_
del Xi_valid_

clf = lgb.LGBMClassifier(
    num_leaves=40, 
    max_depth=8,
    n_estimators=bstIter,
    n_jobs=20,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.9,
    max_bin=20
)
clf.fit(Xi_finnal_[features], y_finnal_,feature_name = features,
        categorical_feature=[])
#[i for i in ['item_category_list_bin1','item_category_list_bin2'] if i in features]
y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
y_test_meta[:,0] += clf.predict_proba(Xi_test_[features])[:,1]
submit = pd.DataFrame({'instance_id':dfTest['instance_id'],'predicted_score':y_test_meta[:,0]})
#submit.to_csv('../../Submission/advertisement/gbm_trick_0330.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Assign rather than accumulate: with `+=` the scores were added on top of the
# predictions already stored by the previous cell, doubling them.
y_test_meta[:,0] = clf.predict_proba(Xi_test_[features])[:,1]
# NOTE(review): instance ids are looked up in dfSet by Xi_test_'s row labels
# (dfTest is deleted in the data-loading cell) — confirm that dfAll kept the
# same index as dfSet through the feature stages.
submit = pd.DataFrame({'instance_id':dfSet.loc[Xi_test_.index,'instance_id'].values,'predicted_score':y_test_meta[:,0]})

In [None]:
# Keep only the rows whose instance_id is in idSubmit, then show the mean predicted score.
# NOTE(review): idSubmit is not defined in the visible part of this notebook — confirm its source.
submit = submit.loc[submit['instance_id'].isin(idSubmit)]
submit['predicted_score'].mean()

In [None]:
# Mean of the labels used for the final fit, for comparison with the mean predicted score.
y_finnal_.mean()

In [None]:
# Write the submission as a space-separated file without the index column.
submit.to_csv('../../Submission/advertisement/gbm_trick_testb_419.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Write an all-zero baseline file from a modified copy. `submit` itself keeps
# the model scores, so the cells that follow are not fed zeros.
submit.assign(predicted_score=0).to_csv('../../Submission/advertisement/gbm_trick_testb_418.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Shift the scores with score_change, using their current mean as base_rate and 0.018116956 as real_rate.
# NOTE(review): 0.018116956 is presumably the expected positive rate of the test period — confirm its origin.
submit['predicted_score'] = score_change(submit['predicted_score'],submit['predicted_score'].mean(),0.018116956)
print(submit['predicted_score'].mean())
submit.to_csv('../../Submission/advertisement/gbm_trick_testb_adj_419.txt', sep=" ", index=False, line_terminator='\n')

In [None]:
# Load a previously written submission file; this replaces the in-memory `submit`.
submit = pd.read_csv('../../Submission/advertisement/gbm_trick_text_417.txt',sep=" ")