In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
import nltk
import pickle
from nltk import pos_tag
from scipy.sparse import hstack, vstack
from scipy import spatial
from sklearn.datasets import dump_svmlight_file,load_svmlight_file
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb
import gensim
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import gc
# 显示cell运行时长
%load_ext klab-autotime

In [3]:
# Directory that receives the counting features computed for the training split.
feature_path = "/home/kesci/work/counting_feature/train/"
# Directory holding the first-round input CSV files.
data_path = "/home/kesci/input/bytedance/first-round/"
# Column names applied to the CSVs, which are read with header=None elsewhere in this notebook.
colnames = ["query_id","query","query_title_id","title","label"]

time: 672 µs


In [4]:
#  Set-overlap similarity features (Jaccard coefficient, Dice coefficient)
#  Summary statistics of the match-position index features (min, max, median, avg, std)
"""
以下特征 unigram,bigram,trigram
"""
def try_divide(x, y, val=0.0):
    """Return ``float(x) / y``, or ``val`` when ``y`` equals zero.

    Args:
        x: Numerator.
        y: Denominator.
        val: Fallback returned instead of dividing by zero (default 0.0).
    """
    if y == 0.0:
        return val
    return float(x) / y
    
def JaccardCoef(A, B):
    """Jaccard coefficient of two collections treated as sets.

    Computes len(intersection) / len(union); duplicates are ignored because
    both inputs are converted to sets. When both inputs are empty the union
    is empty and ``try_divide`` yields its default of 0.0.
    """
    set_a = set(A)
    set_b = set(B)
    shared = len(set_a & set_b)
    combined = len(set_a | set_b)
    return try_divide(shared, combined)
    
def JaccardCoef_Join(A, B):
    """Return the number of distinct elements common to ``A`` and ``B``."""
    return len(set(A) & set(B))
    
def DiceDist(A, B):
    """Dice coefficient of two collections treated as sets.

    Computes 2 * len(intersection) / (len(set(A)) + len(set(B))). When both
    inputs are empty the denominator is zero and ``try_divide`` yields its
    default of 0.0.
    """
    set_a = set(A)
    set_b = set(B)
    shared = len(set_a & set_b)
    size_sum = len(set_a) + len(set_b)
    return try_divide(2 * shared, size_sum)

def getUnigram(words):
    """Return the unigrams of a token list, i.e. the list itself.

    Input:  a list of words, e.g. ['I', 'am', 'Denny']
    Output: the same list object (no copy is made)

    Raises AssertionError when ``words`` is not exactly a ``list``.
    """
    assert type(words) is list
    return words

def getBigram(words, join_string="_", skip=0):
    """Build (skip-)bigrams from a token list.

    Input:  a list of words, e.g. ['I', 'am', 'Denny']
    Output: a list of bigrams, e.g. ['I_am', 'am_Denny'] with join_string "_"

    Each token is paired with each of the next ``skip + 1`` tokens that exist.
    Lists with fewer than two tokens fall back to ``getUnigram``.
    Raises AssertionError when ``words`` is not exactly a ``list``.
    """
    assert type(words) is list
    count = len(words)
    if count <= 1:
        # Too short for a bigram: degrade to unigrams.
        return getUnigram(words)
    return [
        join_string.join((words[start], words[start + gap]))
        for start in range(count - 1)
        for gap in range(1, skip + 2)
        if start + gap < count
    ]
    
def getTrigram(words, join_string="_", skip=0):
    """Build (skip-)trigrams from a token list.

    Input:  a list of words, e.g. ['I', 'am', 'Denny']
    Output: a list of trigrams, e.g. ['I_am_Denny'] with join_string "_"

    The gap between consecutive members of a trigram ranges over
    1 .. skip + 1. Lists with fewer than three tokens fall back to
    ``getBigram`` with the same ``join_string`` and ``skip``.
    Raises AssertionError when ``words`` is not exactly a ``list``.
    """
    assert type(words) is list
    count = len(words)
    if count <= 2:
        # Too short for a trigram: degrade to bigrams.
        return getBigram(words, join_string, skip)
    gaps = range(1, skip + 2)
    return [
        join_string.join((words[i], words[i + k1], words[i + k1 + k2]))
        for i in range(count - 2)
        for k1 in gaps
        for k2 in gaps
        if i + k1 < count and i + k1 + k2 < count
    ]

def wc_ratio_intersect(A,B):
    """Count, with multiplicity, how many items of each collection occur in the other.

    Args:
        A: Sized collection of tokens / n-grams.
        B: Sized collection of tokens / n-grams.

    Returns:
        Tuple ``(wc_A, wc_B, ratio_A, ratio_B)`` where ``wc_A`` is the number
        of items of ``A`` found in ``B`` (repeated items count each time),
        ``wc_B`` is the converse, and each ratio is the count divided by the
        length of the corresponding collection. An empty collection yields a
        ratio of 0.0 instead of raising ZeroDivisionError.
    """
    wc_A = sum(1 for a in A if a in B)
    wc_B = sum(1 for b in B if b in A)
    # Guard the denominators: an empty side has no overlap to report.
    ratio_A = float(wc_A) / len(A) if len(A) else 0.0
    ratio_B = float(wc_B) / len(B) if len(B) else 0.0
    return wc_A,wc_B,ratio_A,ratio_B

    
def trigram_intersect_func(query,title):
    """Co-occurrence counts and ratios at unigram, bigram and trigram level.

    For each n-gram level the four values of ``wc_ratio_intersect`` are
    appended in order (count in query, count in title, ratio for query,
    ratio for title), giving a flat list of 12 values.
    """
    ngram_pairs = (
        (query, title),                            # unigram
        (getBigram(query), getBigram(title)),      # bigram
        (getTrigram(query), getTrigram(title)),    # trigram
    )
    features = []
    for query_grams, title_grams in ngram_pairs:
        features.extend(wc_ratio_intersect(query_grams, title_grams))
    return features

def trigram_distance_func(query,title):
    """Jaccard and Dice coefficients at unigram, bigram and trigram level.

    Returns a flat list of 6 values ordered as
    [unigram Jaccard, unigram Dice, bigram Jaccard, bigram Dice,
     trigram Jaccard, trigram Dice].
    """
    ngram_pairs = (
        (query, title),                            # unigram
        (getBigram(query), getBigram(title)),      # bigram
        (getTrigram(query), getTrigram(title)),    # trigram
    )
    features = []
    for query_grams, title_grams in ngram_pairs:
        features.append(JaccardCoef(query_grams, title_grams))
        features.append(DiceDist(query_grams, title_grams))
    return features
    
def trigram_length_func(query,title):
    """Number of distinct shared n-grams at unigram, bigram and trigram level.

    Returns a list of 3 values: the ``JaccardCoef_Join`` result for the
    unigrams, the bigrams and the trigrams of ``query`` and ``title``.
    """
    ngram_pairs = (
        (query, title),                            # unigram
        (getBigram(query), getBigram(title)),      # bigram
        (getTrigram(query), getTrigram(title)),    # trigram
    )
    return [JaccardCoef_Join(query_grams, title_grams)
            for query_grams, title_grams in ngram_pairs]
    
#  相似性度量   最长公共子序列长度 & 最长公共子串的长度
def lcs_subseq(X, Y):
    """Length of the longest common subsequence of two sequences.

    Uses a (len(X)+1) x (len(Y)+1) dynamic-programming table in which
    ``table[i][j]`` is the LCS length of ``X[:i]`` and ``Y[:j]``.
    """
    rows, cols = len(X), len(Y)
    # Row 0 and column 0 stay at 0: an empty prefix shares nothing.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            if X[i - 1] == Y[j - 1]:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])
    return table[rows][cols]

def lcs_substring(X, Y):
    """Length of the longest run of consecutive items common to both sequences.

    ``suffix[i][j]`` holds the length of the longest common suffix of
    ``X[:i]`` and ``Y[:j]``; the answer is the largest entry seen.
    """
    width = len(Y) + 1
    suffix = [[0] * width for _ in range(len(X) + 1)]
    best = 0
    for i, x_item in enumerate(X, start=1):
        for j, y_item in enumerate(Y, start=1):
            if x_item == y_item:
                # Extend the run that ended at the previous pair of items.
                run = suffix[i - 1][j - 1] + 1
                suffix[i][j] = run
                if run > best:
                    best = run
            # Mismatch: the entry keeps its initial 0, which resets the run.
    return best

def get_unigram_LCS(X,Y):
    """Return ``[longest common subsequence length, longest common substring length]``."""
    subsequence_len = lcs_subseq(X, Y)
    substring_len = lcs_substring(X, Y)
    return [subsequence_len, substring_len]

time: 19.8 ms


In [5]:
# CSV file that receives the basic feature matrix for the current split.
basic_feature_path = feature_path+"basic_feature.csv"
train_data = "/home/kesci/input/bytedance/first-round/train.csv"
# Presumably the total number of rows in train.csv -- TODO confirm against the file.
samples = 100000000
# Number of rows handled as one slice of the file.
chunksize = 5000000
# Number of leading chunks to skip so that the last chunk of `samples` rows is the one read.
skip_num = int(samples/chunksize) - 1
print(skip_num)

19
time: 2.42 ms


In [6]:
# # #####################################################
# # #####  距离特征 & 统计的特征 后500w   用于Train
# # #####################################################

# basic_feature_fout = open(basic_feature_path,"w")
# df = pd.read_csv(train_data, names=colnames, header=None,skiprows=chunksize*skip_num,nrows=chunksize,lineterminator="\n")
# basic_feature = ""

# for index, row in df.iterrows():
#     query_unigram = row[1].split(" ")
#     title_unigram = row[3].split(" ") 
#     # 0 长度特征 (4 +3 dim)
#     length_q = len(query_unigram)
#     length_t = len(title_unigram)
#     length_diff = len(title_unigram)-len(query_unigram)
#     length_divide = float(len(query_unigram))/len(title_unigram)
#     length_trigram = trigram_length_func(query_unigram,title_unigram)
#     # 0 LCS & substring
#     LCS_feature = get_unigram_LCS(query_unigram,title_unigram)
#     # 1 距离特征(2*3 dim)
#     trigram_distance = trigram_distance_func(query_unigram,title_unigram)
#     # 2 共现单词特征(4*3 dim)
#     trigram_intersect = trigram_intersect_func(query_unigram,title_unigram)

#     dim_hstack = np.hstack((length_q,length_t,length_diff,length_divide,
#                             LCS_feature,
#                             length_trigram,
#                             trigram_distance,
#                             trigram_intersect,
#                           ))
#     if basic_feature == "":
#         basic_feature = dim_hstack
#     else:
#         basic_feature = np.vstack((basic_feature,dim_hstack))
#     if (index+1) % 100000== 0:
#         print("write_file: ",basic_feature.shape)
#         np.savetxt(basic_feature_fout,basic_feature,delimiter=",",fmt="%.5f")
#         basic_feature = ""
    
# basic_feature_fout.flush()
# del df
# gc.collect()
# # 27 dim



write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file: 

336

time: 3h 9min 55s


In [5]:
# # #####################################################
# # #####  位置特征 后500w   用于Train
# # #####################################################
# feature_path = "/home/kesci/work/counting_feature/train/"
# train_data = "/home/kesci/input/bytedance/first-round/train.csv"
# skip_num = int(samples/chunksize) - 1
# print(skip_num)
# def get_position_list(target, obs):
#     """
#         Get the list of positions of obs in target
#         属于obs & 存在于taget的单词位于obs中的位置
#         df每一行,返回一个list  (map函数)
#     """
#     target = target.split(" ")
#     obs = obs.split(" ")
#     pos_of_obs_in_target = [0]
#     if len(obs) != 0:
#         pos_of_obs_in_target = [j for j,w in enumerate(obs, start=1) if w in target]
#         if len(pos_of_obs_in_target) == 0:
#             pos_of_obs_in_target = [0]
#     return pos_of_obs_in_target

# postation_fout = open(feature_path + "postation.csv","w")
# # 1.query
# df = pd.read_csv(train_data, names=colnames, header=None,skiprows=chunksize*skip_num,nrows=chunksize,lineterminator="\n")
# pos = list(df.apply(lambda x: get_position_list(x["title"], obs=x["query"]), axis=1)) #TODO： 1 or ‘columns’: apply function to each row
# query_hstack = np.hstack((
#                 np.array(list(map(np.min, pos))).reshape(-1,1),
#                 np.array(list(map(np.mean, pos))).reshape(-1,1),
#                 np.array(list(map(np.median, pos))).reshape(-1,1),
#                 np.array(list(map(np.max, pos))).reshape(-1,1),
#                 np.array(list(map(np.std, pos))).reshape(-1,1)         
#     ))
# print("query ",query_hstack.shape)
# # 2.title
# pos = list(df.apply(lambda x: get_position_list(x["query"], obs=x["title"]), axis=1)) #TODO： 1 or ‘columns’: apply function to each row
# title_hstack = np.hstack((
#                 np.array(list(map(np.min, pos))).reshape(-1,1),
#                 np.array(list(map(np.mean, pos))).reshape(-1,1),
#                 np.array(list(map(np.median, pos))).reshape(-1,1),  #重要
#                 np.array(list(map(np.max, pos))).reshape(-1,1),
#                 np.array(list(map(np.std, pos))).reshape(-1,1)         
#     ))
# print("title ",title_hstack.shape)
# # 3.write to file
# hstack_all =  np.hstack((query_hstack,title_hstack))
# np.savetxt(postation_fout,hstack_all,delimiter=",",fmt="%.5f")
# postation_fout.flush()

19
query  (5000000, 5)
title  (5000000, 5)
time: 22min 15s


In [None]:
# #####################################################
# #####  Validation 验证集 
# #####################################################
# #####################################################
# #####  Validation 验证集 
# #####################################################
# #####################################################
# #####  Validation 验证集 
# #####################################################

In [7]:
# # #####################################################
# # #####  距离特征 & 统计的特征 后1000w-500w   用于validation
# # #####################################################
# feature_path = "/home/kesci/work/counting_feature/valid/"
# basic_feature_path = feature_path+"basic_feature.csv"
# train_data = "/home/kesci/input/bytedance/first-round/train.csv"
# samples = 100000000
# chunksize = 5000000
# skip_num = int(samples/chunksize) - 2
# print(skip_num)

# basic_feature_fout = open(basic_feature_path,"w")
# df = pd.read_csv(train_data, names=colnames, header=None,skiprows=chunksize*skip_num,nrows=chunksize,lineterminator="\n")
# basic_feature = ""

# for index, row in df.iterrows():
#     query_unigram = row[1].split(" ")
#     title_unigram = row[3].split(" ") 
#     # 0 长度特征 (4 +3 dim)
#     length_q = len(query_unigram)
#     length_t = len(title_unigram)
#     length_diff = len(title_unigram)-len(query_unigram)
#     length_divide = float(len(query_unigram))/len(title_unigram)
#     length_trigram = trigram_length_func(query_unigram,title_unigram)
#     # 0 LCS & substring
#     LCS_feature = get_unigram_LCS(query_unigram,title_unigram)
#     # 1 距离特征(2*3 dim)
#     trigram_distance = trigram_distance_func(query_unigram,title_unigram)
#     # 2 共现单词特征(4*3 dim)
#     trigram_intersect = trigram_intersect_func(query_unigram,title_unigram)

#     dim_hstack = np.hstack((length_q,length_t,length_diff,length_divide,
#                             LCS_feature,
#                             length_trigram,
#                             trigram_distance,
#                             trigram_intersect,
#                           ))
#     if basic_feature == "":
#         basic_feature = dim_hstack
#     else:
#         basic_feature = np.vstack((basic_feature,dim_hstack))
#     if (index+1) % 100000== 0:
#         print("write_file: ",basic_feature.shape)
#         np.savetxt(basic_feature_fout,basic_feature,delimiter=",",fmt="%.5f")
#         basic_feature = ""
    
# basic_feature_fout.flush()
# del df
# gc.collect()
# # 27 dim

18




write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file: 

103

time: 2h 59min 10s


In [6]:
# # #####################################################
# # #####  位置特征 后1000w-500w   用于validation
# # #####################################################
# feature_path = "/home/kesci/work/counting_feature/valid/"
# train_data = "/home/kesci/input/bytedance/first-round/train.csv"
# samples = 100000000
# chunksize = 5000000
# skip_num = int(samples/chunksize) - 2
# print(skip_num)

# def get_position_list(target, obs):
#     """
#         Get the list of positions of obs in target
#         属于obs & 存在于taget的单词位于obs中的位置
#         df每一行,返回一个list  (map函数)
#     """
#     target = target.split(" ")
#     obs = obs.split(" ")
#     pos_of_obs_in_target = [0]
#     if len(obs) != 0:
#         pos_of_obs_in_target = [j for j,w in enumerate(obs, start=1) if w in target]
#         if len(pos_of_obs_in_target) == 0:
#             pos_of_obs_in_target = [0]
#     return pos_of_obs_in_target

# postation_fout = open(feature_path + "postation.csv","w")
# # 1.query
# df = pd.read_csv(train_data, names=colnames, header=None,skiprows=chunksize*skip_num,nrows=chunksize,lineterminator="\n")
# pos = list(df.apply(lambda x: get_position_list(x["title"], obs=x["query"]), axis=1)) #TODO： 1 or ‘columns’: apply function to each row
# query_hstack = np.hstack((
#                 np.array(list(map(np.min, pos))).reshape(-1,1),
#                 np.array(list(map(np.mean, pos))).reshape(-1,1),
#                 np.array(list(map(np.median, pos))).reshape(-1,1),
#                 np.array(list(map(np.max, pos))).reshape(-1,1),
#                 np.array(list(map(np.std, pos))).reshape(-1,1)         
#     ))
# print("query ",query_hstack.shape)
# # 2.title
# pos = list(df.apply(lambda x: get_position_list(x["query"], obs=x["title"]), axis=1)) #TODO： 1 or ‘columns’: apply function to each row
# title_hstack = np.hstack((
#                 np.array(list(map(np.min, pos))).reshape(-1,1),
#                 np.array(list(map(np.mean, pos))).reshape(-1,1),
#                 np.array(list(map(np.median, pos))).reshape(-1,1),  #重要
#                 np.array(list(map(np.max, pos))).reshape(-1,1),
#                 np.array(list(map(np.std, pos))).reshape(-1,1)         
#     ))
# print("title ",title_hstack.shape)
# # 3.write to file
# hstack_all =  np.hstack((query_hstack,title_hstack))
# np.savetxt(postation_fout,hstack_all,delimiter=",",fmt="%.5f")
# postation_fout.flush()

18
query  (5000000, 5)
title  (5000000, 5)
time: 22min 25s


In [7]:
# #####################################################
# #####  Test测试集 
# #####################################################
# #####################################################
# #####  Test测试集 
# #####################################################
# #####################################################
# #####  Test测试集 
# #####################################################

time: 522 µs


In [8]:
# # #####################################################
# # #####  距离特征 & 统计的特征 Test测试集 
# # #####################################################
# feature_path = "/home/kesci/work/counting_feature/test/"
# basic_feature_path = feature_path+"basic_feature.csv"
# test_data = "/home/kesci/input/bytedance/first-round/test.csv"
# samples = 100000000
# chunksize = 5000000

# basic_feature_fout = open(basic_feature_path,"w")
# df = pd.read_csv(test_data, names=colnames, header=None,lineterminator="\n")
# basic_feature = ""

# for index, row in df.iterrows():
#     query_unigram = row[1].split(" ")
#     title_unigram = row[3].split(" ") 
#     # 0 长度特征 (4 +3 dim)
#     length_q = len(query_unigram)
#     length_t = len(title_unigram)
#     length_diff = len(title_unigram)-len(query_unigram)
#     length_divide = float(len(query_unigram))/len(title_unigram)
#     length_trigram = trigram_length_func(query_unigram,title_unigram)
#     # 0 LCS & substring
#     LCS_feature = get_unigram_LCS(query_unigram,title_unigram)
#     # 1 距离特征(2*3 dim)
#     trigram_distance = trigram_distance_func(query_unigram,title_unigram)
#     # 2 共现单词特征(4*3 dim)
#     trigram_intersect = trigram_intersect_func(query_unigram,title_unigram)

#     dim_hstack = np.hstack((length_q,length_t,length_diff,length_divide,
#                             LCS_feature,
#                             length_trigram,
#                             trigram_distance,
#                             trigram_intersect,
#                           ))
#     if basic_feature == "":
#         basic_feature = dim_hstack
#     else:
#         basic_feature = np.vstack((basic_feature,dim_hstack))
#     if (index+1) % 100000== 0:
#         print("write_file: ",basic_feature.shape)
#         np.savetxt(basic_feature_fout,basic_feature,delimiter=",",fmt="%.5f")
#         basic_feature = ""
    
# basic_feature_fout.flush()
# del df
# gc.collect()
# # 27 dim



write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file:  (100000, 27)
write_file: 

231

time: 3h 3min 17s


In [13]:
# # #####################################################
# # #####  位置特征  Test测试集 
# # #####################################################
# feature_path = "/home/kesci/work/counting_feature/test/"
# test_data = "/home/kesci/input/bytedance/first-round/test.csv"
# def get_position_list(target, obs):
#     """
#         Get the list of positions of obs in target
#         属于obs & 存在于taget的单词位于obs中的位置
#         df每一行,返回一个list  (map函数)
#     """
#     target = target.split(" ")
#     obs = obs.split(" ")
#     pos_of_obs_in_target = [0]
#     if len(obs) != 0:
#         pos_of_obs_in_target = [j for j,w in enumerate(obs, start=1) if w in target]
#         if len(pos_of_obs_in_target) == 0:
#             pos_of_obs_in_target = [0]
#     return pos_of_obs_in_target

# postation_fout = open(feature_path + "postation.csv","w")
# # 1.query
# df = pd.read_csv(test_data, names=colnames, header=None,lineterminator="\n")
# pos = list(df.apply(lambda x: get_position_list(x["title"], obs=x["query"]), axis=1)) #TODO： 1 or ‘columns’: apply function to each row
# query_hstack = np.hstack((
#                 np.array(list(map(np.min, pos))).reshape(-1,1),
#                 np.array(list(map(np.mean, pos))).reshape(-1,1),
#                 np.array(list(map(np.median, pos))).reshape(-1,1),
#                 np.array(list(map(np.max, pos))).reshape(-1,1),
#                 np.array(list(map(np.std, pos))).reshape(-1,1)         
#     ))
# print("query ",query_hstack.shape)
# # 2.title
# pos = list(df.apply(lambda x: get_position_list(x["query"], obs=x["title"]), axis=1)) #TODO： 1 or ‘columns’: apply function to each row
# title_hstack = np.hstack((
#                 np.array(list(map(np.min, pos))).reshape(-1,1),
#                 np.array(list(map(np.mean, pos))).reshape(-1,1),
#                 np.array(list(map(np.median, pos))).reshape(-1,1),  #重要
#                 np.array(list(map(np.max, pos))).reshape(-1,1),
#                 np.array(list(map(np.std, pos))).reshape(-1,1)         
#     ))
# print("title ",title_hstack.shape)
# # 3.write to file
# hstack_all =  np.hstack((query_hstack,title_hstack))
# np.savetxt(postation_fout,hstack_all,delimiter=",",fmt="%.5f")
# postation_fout.flush()

query  (5000000, 5)
title  (5000000, 5)
time: 41min 46s


In [7]:
# !cat /home/kesci/work/counting_feature/valid/basic_feature.csv | wc -l
# !cat /home/kesci/work/counting_feature/valid/postation.csv | wc -l
# !cat /home/kesci/work/counting_feature/train/basic_feature.csv | wc -l
# !cat /home/kesci/work/counting_feature/train/postation.csv | wc -l
# !cat /home/kesci/work/counting_feature/test/basic_feature.csv | wc -l
# !cat /home/kesci/work/counting_feature/test/postation.csv | wc -l

5000000


In [1]:
##########################################废弃代码##############################
##########################################废弃代码##############################
##########################################废弃代码##############################

In [2]:
import pandas as pd
import numpy as np
from numpy import linalg
from sklearn.model_selection import StratifiedKFold
import nltk
import pickle
from nltk import pos_tag
from scipy.sparse import hstack, vstack
from scipy import spatial
from sklearn.datasets import dump_svmlight_file,load_svmlight_file
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb
import gensim
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import multiprocessing
import gc
import logging
# 显示cell运行时长
%load_ext klab-autotime

In [3]:
# Column names applied to the header-less CSV files when they are read.
colnames = ["query_id","query","query_title_id","title","label"]

# One-off export of the vectors from a pickled word2vec model (kept for reference, disabled).
# # save dict
# save_path = "/home/kesci/work/word2vec/wordvectors.kv"
# w2vmodel_test = pickle.load(open(model_path+"word2vec_model_new_withTest.pkl","rb"))
# w2vmodel_test.wv.save(save_path)
save_path = "/home/kesci/work/word2vec/wordvectors.kv"
# Load the saved word vectors memory-mapped in read-only mode.
w2v_dict = KeyedVectors.load(save_path, mmap='r')
# Sanity check: length of one word vector (the recorded cell output is 128).
len(w2v_dict["1427"])

128

time: 3.95 s


In [None]:
############################## avg word2vec 表征 sentence vector ##############
############################## avg word2vec 表征 sentence vector ##############
############################## avg word2vec 表征 sentence vector ##############

In [4]:
def cosine_sim(query, title):
    """Row-wise similarity and distance features between two 2-D arrays.

    Args:
        query: Array of shape (n, d); one vector per row.
        title: Array of the same shape as ``query``.

    Returns:
        Array of shape (n, 6) whose columns are, in order: cosine similarity,
        Euclidean (L2) distance, Manhattan (L1) distance, Chebyshev distance
        (max |x1 - x2|), min |x1 - x2|, and sum of sqrt(|x1 - x2|).

    Note:
        A later cell of this notebook defines another ``cosine_sim(A, B)``
        for a single pair of vectors; whichever cell ran last wins.
    """
    diff = query - title
    cosine = np.sum(query * title, axis=1) / (
        linalg.norm(query, ord=2, axis=1) * linalg.norm(title, ord=2, axis=1)
    )
    columns = (
        cosine,
        linalg.norm(diff, ord=2, axis=1),          # Euclidean distance
        linalg.norm(diff, ord=1, axis=1),          # Manhattan distance
        linalg.norm(diff, ord=np.inf, axis=1),     # Chebyshev distance
        linalg.norm(diff, ord=-np.inf, axis=1),    # smallest absolute difference
        np.sum(np.sqrt(np.absolute(diff)), axis=1),
    )
    return np.hstack([column.reshape(-1, 1) for column in columns])

#### 1w大小的数据  计算一次
def concat_sentence(origin_data, vectors=None, dim=None):
    """Build average-word-vector sentence embeddings for every query and title.

    Column 1 of each row is taken as the query and column 3 as the title;
    both are split on single spaces. A sentence vector is the mean of the
    vectors of its words that are present in ``vectors``; a sentence with no
    known word becomes a vector filled with float32 machine epsilon, so that
    a later norm is never exactly zero.

    Args:
        origin_data: DataFrame whose ``.values`` rows hold the query string
            at index 1 and the title string at index 3.
        vectors: Word -> vector mapping that raises KeyError for unknown
            words. Defaults to the notebook-level ``w2v_dict``.
        dim: Length of the fallback vector. Defaults to the notebook-level
            ``word_dim``.

    Returns:
        ``(q_vstack, t_vstack)``: two arrays of shape (n_rows, dim). The
        result is always 2-D, including for a single row, and an empty
        frame yields two (0, dim) arrays.
    """
    if vectors is None:
        vectors = w2v_dict
    if dim is None:
        dim = word_dim

    def sentence_vector(sentence):
        # Mean of the known word vectors; unknown words are skipped.
        found = []
        for word in sentence.split(" "):
            try:
                found.append(vectors[word])
            except KeyError:
                continue
        if not found:
            return np.full(dim, np.finfo(np.float32).eps)
        stacked = np.array(found)
        return stacked.sum(axis=0) / stacked.shape[0]

    # Collect rows in lists and stack once. This avoids both the quadratic
    # np.vstack-in-a-loop and the `array == ""` sentinel test, which is an
    # elementwise comparison on current NumPy and cannot be used in `if`.
    q_rows = []
    t_rows = []
    for row in origin_data.values:
        q_rows.append(sentence_vector(row[1]))
        t_rows.append(sentence_vector(row[3]))

    if not q_rows:
        return np.empty((0, dim)), np.empty((0, dim))
    return np.vstack(q_rows), np.vstack(t_rows)

time: 6.09 ms


In [7]:
############## 测试集数据 ############
feature_path = "/home/kesci/work/similarity_feature/test/"
test_data = "/home/kesci/input/bytedance/first-round/test.csv"
# Output 1: difference of the sentence vectors (query - title), one row per sample.
sentence_vector_path = feature_path+"sentence_vector.csv"
# Output 2: the 6-dim similarity features returned by cosine_sim.
consine_feature_path = feature_path+"consine_sentence_sim.csv"

chunksize = 10000
# Upper bound on the number of chunks processed (500 * 10000 = 5,000,000 rows).
n_chunks = 500

# Stream the file once in chunks. Re-reading with skiprows=chunksize*i forces
# the parser to scan from the start of the file on every iteration.
reader = pd.read_csv(test_data, names=colnames, header=None,
                     chunksize=chunksize, lineterminator="\n")
try:
    # The context managers flush and close both outputs even if a chunk fails.
    with open(sentence_vector_path,"w") as sentence_vec_fout, \
         open(consine_feature_path,"w") as sim_qt_fout:
        # zip() stops on range() first, so no chunk beyond n_chunks is parsed;
        # it also stops cleanly if the file holds fewer chunks than that.
        for i, df in zip(range(n_chunks), reader):
            # 1. sentence vectors and their similarity features
            q_vstack,t_vstack = concat_sentence(df)
            combine_sim = cosine_sim(q_vstack,t_vstack)
            # 2. append to the output files
            np.savetxt(sentence_vec_fout,q_vstack - t_vstack,delimiter=",",fmt="%.5f")
            np.savetxt(sim_qt_fout,combine_sim,delimiter=",",fmt="%.5f")

            # 3. progress log every 100 chunks
            if (i+1) % 100 == 0:
                print((q_vstack - t_vstack).shape,combine_sim.shape)
finally:
    reader.close()




(10000, 128) (10000, 6)
(10000, 128) (10000, 6)
(10000, 128) (10000, 6)
(10000, 128) (10000, 6)
(10000, 128) (10000, 6)
time: 1h 9min 59s


In [8]:
!cat /home/kesci/work/similarity_feature/test/consine_sentence_sim.csv | wc -l
!cat /home/kesci/work/similarity_feature/test/sentence_vector.csv | wc -l

5000000
5000000
time: 6.85 s


In [11]:
##################################################
##########计算word 和title_sentence sim
##################################################
word_dim = 128
from scipy import spatial
def cosine_sim(A, B):
    """Cosine similarity of two 1-D vectors, computed as 1 minus SciPy's cosine distance.

    Note: an earlier cell of this notebook defines a batch function with the
    same name; running this cell replaces it.
    """
    distance = spatial.distance.cosine(A, B)
    return 1 - distance

def word_sentence_sim(origin_data):
    """Compute 5 word-to-title similarity statistics for every row.

    For each row, the title (column 3) is turned into a sentence vector by
    averaging the `w2v_dict` vectors of its space-separated tokens; every query
    token (column 1) found in `w2v_dict` is then compared with that sentence
    vector via `cosine_sim`.

    Parameters
    ----------
    origin_data : pandas.DataFrame
        Rows are read through `.values`; column 1 holds the query text and
        column 3 the title text.

    Returns
    -------
    numpy.ndarray of shape (n_rows, 5)
        Columns are min, max, average, median and std of the per-word
        similarities. A row whose query has no known word gets all zeros.
        An empty input yields an array of shape (0, 5).
    """
    tokenizer = nltk.SpaceTokenizer()  # stateless, so build it once
    stat_rows = []
    for row in origin_data.values:
        # 1. title sentence vector: mean of the known word vectors
        title_vecs = []
        for w in tokenizer.tokenize(row[3]):
            try:
                title_vecs.append(w2v_dict[w])
            except KeyError:
                # out-of-vocabulary word: skip it
                continue
        if title_vecs:
            title_mat = np.array(title_vecs)
            t_sentence_vec = title_mat.sum(axis=0) / title_mat.shape[0]
        else:
            # no known title word: fall back to a tiny constant (non-zero) vector
            t_sentence_vec = np.full(word_dim,np.finfo(np.float32).eps)

        # 2. similarity of every known query word to the title vector
        sims = []
        for w in tokenizer.tokenize(row[1]):
            try:
                word_vec = w2v_dict[w]
            except KeyError:
                continue
            sims.append(cosine_sim(word_vec,t_sentence_vec))
        # no known query word: statistics of a zero vector, i.e. five zeros
        sims = np.array(sims) if sims else np.zeros(1)

        # 3. min, max, avg, median, std
        stat_rows.append([
            np.min(sims),np.max(sims),
            np.average(sims),np.median(sims),np.std(sims)
        ])

    # Rows are collected in a list and stacked once: this avoids the quadratic
    # np.vstack-in-a-loop and the ndarray == "" sentinel comparison.
    if not stat_rows:
        return np.empty((0, 5))
    return np.array(stat_rows)

time: 3.4 ms


In [14]:
############## Validation data: rows [chunksize*skip_num, chunksize*(skip_num+n_chunks)) of train.csv ############
feature_path = "/home/kesci/work/similarity_feature/valid/"
train_data = "/home/kesci/input/bytedance/first-round/train.csv"
# 5-dim word-level similarity feature
consine_feature_path = feature_path+"consine_word_sim.csv"

samples = 100000000
chunksize = 10000
n_chunks = 500
skip_num = int(samples/chunksize) - 1000
print(skip_num)

# `with` guarantees the output is flushed AND closed, even if a chunk fails.
with open(consine_feature_path,"w") as sim_qt_fout:
    # skip_num was computed but never applied, so this cell used to read the
    # FIRST n_chunks*chunksize rows of train.csv instead of the validation slice.
    # The slice is now skipped once and then streamed chunk by chunk.
    reader = pd.read_csv(train_data, names=colnames, header=None,
                         skiprows=chunksize*skip_num, nrows=chunksize*n_chunks,
                         chunksize=chunksize, lineterminator="\n")
    for i, df in enumerate(reader):
        combine_sim = word_sentence_sim(df)
        np.savetxt(sim_qt_fout,combine_sim,delimiter=",",fmt="%.5f")
        # progress log
        if (i+1) % 100 == 0:
            print(combine_sim.shape)


9000




(10000, 5)
(10000, 5)
(10000, 5)
(10000, 5)
(10000, 5)
time: 1h 8min 9s


In [12]:
############## Test-set data ############
feature_path = "/home/kesci/work/similarity_feature/test/"
test_data = "/home/kesci/input/bytedance/first-round/test.csv"
# 5-dim word-level similarity feature
consine_feature_path = feature_path+"consine_word_sim.csv"

chunksize = 10000
n_chunks = 500  # at most chunksize * n_chunks rows are processed

# `with` guarantees the output is flushed AND closed, even if a chunk fails.
with open(consine_feature_path,"w") as sim_qt_fout:
    # A single streaming pass over the file: the previous skiprows=chunksize*i
    # form re-scanned the CSV from the first line on every iteration.
    reader = pd.read_csv(test_data, names=colnames, header=None,
                         nrows=chunksize*n_chunks, chunksize=chunksize,
                         lineterminator="\n")
    for i, df in enumerate(reader):
        combine_sim = word_sentence_sim(df)
        np.savetxt(sim_qt_fout,combine_sim,delimiter=",",fmt="%.5f")
        # progress log
        if (i+1) % 100 == 0:
            print(combine_sim.shape)




(10000, 5)
(10000, 5)
(10000, 5)
(10000, 5)
(10000, 5)
time: 1h 6min 28s


In [None]:
##############################  点击量特征 2 dim ####################
##############################  点击量特征 2 dim ####################
##############################  点击量特征 2 dim ####################
##############################  点击量特征 2 dim ####################
##############################  点击量特征 2 dim ####################
##############################  点击量特征 2 dim ####################

In [3]:
#### Training set: click-count features keyed by query_id and by title
feature_path = "/home/kesci/work/counting_feature/train/"
train_data = "/home/kesci/input/bytedance/first-round/train.csv"
chunksize = 10000

# Pass 1: number of rows per query_id.
queryID_dict = {}
for df in pd.read_csv(train_data, names=colnames, header=None,chunksize=chunksize,lineterminator="\n"):
    for row in df.values:
        qid = row[0]
        if qid in queryID_dict:
            queryID_dict[qid] += 1
        else:
            queryID_dict[qid] = 1

# Pass 2: for every title, a comma-terminated string listing the row count of
# each query it appears with (one entry per row), e.g. "3,9,1,".
print(len(queryID_dict))
title_dict = {}
for df in pd.read_csv(train_data, names=colnames, header=None,chunksize=chunksize,lineterminator="\n"):
    for row in df.values:
        qid = row[0]
        title = row[3]
        click_token = str(queryID_dict[qid]) + ","
        title_dict[title] = title_dict.get(title,"") + click_token

# The per-query counts are no longer needed; release them.
del queryID_dict
gc.collect()

time: 4min 5s


In [8]:
# Number of distinct keys (titles) held in title_dict.
print(len(title_dict))

28934366
time: 572 µs


In [15]:
# ############################################# 06.17   5dim     title:query_list 
# ############################################# 06.17   5dim     title:query_list 
# ############################################# 06.17   5dim     title:query_list 
# ############################################# 06.17   5dim     title:query_list 
# ############################################# 06.17   5dim     title:query_list 
# def get_5dim_intersect(q_list):
#     query_np = np.array(q_list)
#     return [np.min(query_np),
#             np.max(query_np),
#             np.median(query_np),
#             np.mean(query_np),
#             np.std(query_np),
#             ]
# samples = 100000000
# chunksize = 5000000
# # 训练集 (500w)
# feature_path = "/home/kesci/work/counting_feature/train/"
# train_data = "/home/kesci/input/bytedance/first-round/train.csv"
# skip_num = int(samples/chunksize) - 1
# print(skip_num)
# title_path = feature_path + "click_num_t_groupby_5dim.csv"
# title_feature_fout = open(title_path,"w")
# df = pd.read_csv(train_data, names=colnames, header=None,skiprows=chunksize*skip_num,nrows=chunksize,lineterminator="\n")
# for index, row in df.iterrows():
#     qclick_list_str = title_dict[row[3]].split(",")[:-1]
#     qclick_list_int = [ int(x) for x in qclick_list_str ]
#     query_list_5dim = np.array(get_5dim_intersect(qclick_list_int)).reshape(1,-1)
#     np.savetxt(title_feature_fout,query_list_5dim,delimiter=",",fmt="%.5f")
#     if index % 1000000 == 0:
#         print(query_list_5dim.shape)
# title_feature_fout.flush()

# # 验证集 (1000w - 500w)
# feature_path = "/home/kesci/work/counting_feature/valid/"
# train_data = "/home/kesci/input/bytedance/first-round/train.csv"
# skip_num = int(samples/chunksize) - 2
# print(skip_num)
# title_path = feature_path + "click_num_t_groupby_5dim.csv"
# title_feature_fout = open(title_path,"w")
# df = pd.read_csv(train_data, names=colnames, header=None,skiprows=chunksize*skip_num,nrows=chunksize,lineterminator="\n")
# for index, row in df.iterrows():
#     qclick_list_str = title_dict[row[3]].split(",")[:-1]
#     qclick_list_int = [ int(x) for x in qclick_list_str ]
#     query_list_5dim = np.array(get_5dim_intersect(qclick_list_int)).reshape(1,-1)
#     np.savetxt(title_feature_fout,query_list_5dim,delimiter=",",fmt="%.5f")
#     if index % 1000000 == 0:
#         print(query_list_5dim.shape)
# title_feature_fout.flush()


19
(1, 5)
(1, 5)
(1, 5)
(1, 5)
(1, 5)
time: 36min 50s


In [9]:
###################################################### 2dim to file
###################################################### 2dim to file
###################################################### 2dim to file
# # merge train
# skip_num = int(samples/chunksize) - 1
# print(skip_num)
# merge_q_fout = open(feature_path+"click_num_q.csv","w")
# merge_t_fout = open(feature_path+"click_num_t.csv","w")
# df = pd.read_csv(train_data, names=colnames, header=None,skiprows=chunksize*skip_num,nrows=chunksize,lineterminator="\n")
# # 3:4,1:2.3
# for index, row in df.iterrows():
#     q_count = queryID_dict[row[0]]
#     t_count = float(title_dict[row[3]])
#     merge_q_fout.write(str(q_count)+"\n")
#     merge_t_fout.write(str(t_count)+"\n")
#     if(index % 1000000 == 0):
#         print(index)
# merge_q_fout.flush()    
# merge_t_fout.flush()
# del df
# gc.collect()

19
0
1000000
2000000
3000000
4000000


8

time: 16min 1s


In [10]:
# # merge valid
# feature_path = "/home/kesci/work/counting_feature/valid/"
# skip_num = int(samples/chunksize) - 2
# print(skip_num)
# merge_q_fout = open(feature_path+"click_num_q.csv","w")
# merge_t_fout = open(feature_path+"click_num_t.csv","w")
# df = pd.read_csv(train_data, names=colnames, header=None,skiprows=chunksize*skip_num,nrows=chunksize,lineterminator="\n")
# # 3:4,1:2.3
# for index, row in df.iterrows():
#     q_count = queryID_dict[row[0]]
#     t_count = float(title_dict[row[3]])
#     merge_q_fout.write(str(q_count)+"\n")
#     merge_t_fout.write(str(t_count)+"\n")
#     if(index % 1000000 == 0):
#         print(index)
# merge_q_fout.flush()    
# merge_t_fout.flush()
# del df
# gc.collect()

18
0
1000000
2000000
3000000
4000000


7

time: 16min 15s


In [17]:
# ####################################################点击量特征 Test 集
# ####################################################点击量特征 Test 集
# ####################################################点击量特征 Test 集
# ####################################################点击量特征 Test 集
# ####################################################点击量特征 Test 集
### Test set: click-count features keyed by query_id and by title
test_data = "/home/kesci/input/bytedance/first-round/test.csv"
# Defined here so the cell no longer depends on a value left over from
# whichever cell ran before it (same value as the training-set counterpart).
chunksize = 10000

# Pass 1: number of rows per query_id.
query_dict_test = dict()
for df in pd.read_csv(test_data, names=colnames, header=None,chunksize=chunksize,lineterminator="\n"):
    for row in df.values:
        qid = row[0]
        query_dict_test[qid] = query_dict_test.get(qid,0) + 1

# Pass 2: for every title, a comma-terminated string listing the row count of
# each query it appears with (one entry per row), e.g. "3,9,1,".
title_dict_test = dict()
for df in pd.read_csv(test_data, names=colnames, header=None,chunksize=chunksize,lineterminator="\n"):
    for row in df.values:
        qid = row[0]
        title = row[3]
        title_dict_test[title] = title_dict_test.get(title,"")+ str(query_dict_test[qid]) + ","

# The per-query counts are no longer needed; release them.
del query_dict_test
gc.collect()

print(len(title_dict_test))

18

time: 33.1 s


In [None]:
############################################# 06.17   5dim     title:query_list 
############################################# 06.17   5dim     title:query_list 
# Test set
feature_path = "/home/kesci/work/counting_feature/test/"
test_data = "/home/kesci/input/bytedance/first-round/test.csv"
title_path = feature_path + "click_num_t_groupby_5dim.csv"


def get_5dim_intersect(q_list):
    """Return [min, max, median, mean, std] of the click counts in q_list.

    Defined in this cell because the only other definition in the notebook is
    commented out, which made this cell fail with NameError on a fresh kernel.
    """
    query_np = np.array(q_list)
    return [np.min(query_np),
            np.max(query_np),
            np.median(query_np),
            np.mean(query_np),
            np.std(query_np),
            ]


df = pd.read_csv(test_data, names=colnames, header=None,lineterminator="\n")
# `with` guarantees the output is flushed AND closed, even if a row fails.
with open(title_path,"w") as title_feature_fout:
    # Iterate the title column by label; the default 0..n-1 index of `df`
    # matches enumerate(), so the progress log fires on the same rows.
    for index, title in enumerate(df["title"]):
        # title_dict_test values look like "3,9,1," -> drop the empty tail entry
        qclick_list_str = title_dict_test[title].split(",")[:-1]
        qclick_list_int = [ int(x) for x in qclick_list_str ]
        query_list_5dim = np.array(get_5dim_intersect(qclick_list_int)).reshape(1,-1)
        np.savetxt(title_feature_fout,query_list_5dim,delimiter=",",fmt="%.5f")
        if index % 1000000 == 0:
            print(query_list_5dim.shape)

(1, 5)
(1, 5)
(1, 5)
(1, 5)
(1, 5)
time: 35min 44s


In [None]:
####################################################################### 2dim 测试集
####################################################################### 2dim 测试集
####################################################################### 2dim 测试集
# feature_path = "/home/kesci/work/counting_feature/test/"
# test_data = "/home/kesci/input/bytedance/first-round/test.csv"
# chunksize = 10000
# title_dict_test = dict()
# for df in pd.read_csv(test_data, names=colnames, header=None,chunksize=chunksize,lineterminator="\n"):
#     for row in df.values:
#         title = row[3]
#         title_dict_test[title] = title_dict_test.get(title,0) + 1
# query_dict_test = dict()
# for df in pd.read_csv(test_data, names=colnames, header=None,chunksize=chunksize,lineterminator="\n"):
#     for row in df.values:
#         title = row[0]
#         query_dict_test[title] = query_dict_test.get(title,0) + 1

# # merge test
# merge_q_fout = open(feature_path+"click_num_q.csv","w")
# merge_t_fout = open(feature_path+"click_num_t.csv","w")
# df = pd.read_csv(test_data, names=colnames, header=None,lineterminator="\n")
# # 3:4,1:2.3  (test title 不作处理)
# for index, row in df.iterrows():
#     q_count = query_dict_test[row[0]]
#     t_count = title_dict_test[row[3]]
#     merge_q_fout.write(str(q_count)+"\n")
#     merge_t_fout.write(str(t_count)+"\n")
#     if (index % 1000000 == 0):
#         print(index)
# merge_q_fout.flush()    
# merge_t_fout.flush()
# del df
# gc.collect()

In [None]:
##############################  点击量特征 2+5+5 dim (06.17新版本)####################
##############################  点击量特征 2+5+5 dim (06.17新版本)####################
##############################  点击量特征 2+5+5 dim (06.17新版本)####################
##############################  点击量特征 2+5+5 dim (06.17新版本)####################
##############################  点击量特征 2+5+5 dim (06.17新版本)####################
##############################  点击量特征 2+5+5 dim (06.17新版本)####################
##############################  点击量特征 2+5+5 dim (06.17新版本)####################

In [9]:
#################################### train ##########################
#################################### train ##########################
#################################### train ##########################
feature_path = "/home/kesci/work/counting_feature/train/"
train_data = "/home/kesci/input/bytedance/first-round/train.csv"

# Input files: previously computed click-count features for query and title
click_query_path = feature_path + "click_num_q.csv"
click_title_path = feature_path + "click_num_t.csv"
click_title_5dim_path = feature_path + "click_num_t_groupby_5dim.csv"
# Output file: combined click-count features
click_combine_path = feature_path + "click_num_12dim.csv"

# Slice of train.csv these features belong to: the last `chunksize` rows
samples = 100000000
chunksize = 5000000
skip_num = samples // chunksize - 1
print(skip_num)

df_query = pd.read_csv(click_query_path, header=None, lineterminator="\n")
df_title = pd.read_csv(click_title_path, names=["title_num"], header=None, lineterminator="\n")
df_title_groupby = pd.read_csv(click_title_5dim_path, header=None, lineterminator="\n")

df_qid = pd.read_csv(
    train_data,
    names=colnames,
    header=None,
    skiprows=chunksize*skip_num,
    nrows=chunksize,
    lineterminator="\n",
)
df_qid = df_qid["query_id"]
# query_id next to the title click count, aligned row by row
df = pd.concat((df_qid, df_title), axis=1)

# Shapes of the previously saved inputs
print(df_qid.shape,df_query.shape,df_title.shape,df_title_groupby.shape)
# Shape of the newly concatenated frame
print(df.shape)
df.head()

19
(5000000,) (5000000, 1) (5000000, 1) (5000000, 5)
(5000000, 2)


Unnamed: 0,query_id,title_num
0,14315375,3.0
1,14315375,9.0
2,14315375,1.0
3,14315375,4.0
4,14315376,6.0


time: 38.4 s


In [10]:
# Per-query statistics (min, max, median, mean, std) of the title click counts.
# A list of function names replaces the dict-of-lambdas passed to .agg() on a
# single column, the form this cell's own output flagged as deprecated.
df_tmp = (
    df.groupby("query_id", sort=False)["title_num"]
      .agg(["min", "max", "median", "mean", "std"])
      .rename(columns=str.upper)   # -> MIN, MAX, MEDIAN, MEAN, STD
      .reset_index()
)
# A LEFT merge keeps exactly one output row per row of `df`, in `df`'s order,
# which the column-wise hstack below depends on.
new_df = df.merge(df_tmp,on=['query_id'],how="left")[['MIN', 'MAX', 'MEDIAN', 'MEAN', 'STD']]
assert len(new_df) == len(df), "per-query statistics are not aligned with df"
hstack_all = np.hstack((
                    np.array(df_query).reshape(-1,1), # query click count
                    np.array(df_title).reshape(-1,1), # title click count
                    np.array(df_title_groupby),       # per-title statistics
                    np.array(new_df),                 # per-query statistics
                    ))
print(hstack_all.shape)
# `with` guarantees the output is flushed AND closed.
with open(click_combine_path,"w") as click_combine_fout:
    np.savetxt(click_combine_fout,hstack_all,delimiter=",",fmt="%.5f")

is deprecated and will be removed in a future version
  if __name__ == '__main__':


(5000000, 12)
time: 10min 50s


In [11]:
#################################### valid ##########################
#################################### valid ##########################
#################################### valid ##########################
feature_path = "/home/kesci/work/counting_feature/valid/"
train_data = "/home/kesci/input/bytedance/first-round/train.csv"

# Input files: previously computed click-count features for query and title
click_query_path = feature_path + "click_num_q.csv"
click_title_path = feature_path + "click_num_t.csv"
click_title_5dim_path = feature_path + "click_num_t_groupby_5dim.csv"
# Output file: combined click-count features
click_combine_path = feature_path + "click_num_12dim.csv"

# Slice of train.csv these features belong to: the second-to-last block of `chunksize` rows
samples = 100000000
chunksize = 5000000
skip_num = samples // chunksize - 2
print(skip_num)

df_query = pd.read_csv(click_query_path, header=None, lineterminator="\n")
df_title = pd.read_csv(click_title_path, names=["title_num"], header=None, lineterminator="\n")
df_title_groupby = pd.read_csv(click_title_5dim_path, header=None, lineterminator="\n")

df_qid = pd.read_csv(
    train_data,
    names=colnames,
    header=None,
    skiprows=chunksize*skip_num,
    nrows=chunksize,
    lineterminator="\n",
)
df_qid = df_qid["query_id"]
# query_id next to the title click count, aligned row by row
df = pd.concat((df_qid, df_title), axis=1)

# Shapes of the previously saved inputs
print(df_qid.shape,df_query.shape,df_title.shape,df_title_groupby.shape)
# Shape of the newly concatenated frame
print(df.shape)
df.head()

18
(5000000,) (5000000, 1) (5000000, 1) (5000000, 5)
(5000000, 2)


Unnamed: 0,query_id,title_num
0,13556849,45.0
1,13556849,8.0
2,13556849,2.0
3,13556849,5.0
4,13556849,27.0


time: 37.3 s


In [12]:
# Per-query statistics (min, max, median, mean, std) of the title click counts.
# A list of function names replaces the dict-of-lambdas passed to .agg() on a
# single column, the form this cell's own output flagged as deprecated.
df_tmp = (
    df.groupby("query_id", sort=False)["title_num"]
      .agg(["min", "max", "median", "mean", "std"])
      .rename(columns=str.upper)   # -> MIN, MAX, MEDIAN, MEAN, STD
      .reset_index()
)
# A LEFT merge keeps exactly one output row per row of `df`, in `df`'s order,
# which the column-wise hstack below depends on.
new_df = df.merge(df_tmp,on=['query_id'],how="left")[['MIN', 'MAX', 'MEDIAN', 'MEAN', 'STD']]
assert len(new_df) == len(df), "per-query statistics are not aligned with df"
hstack_all = np.hstack((
                    np.array(df_query).reshape(-1,1), # query click count
                    np.array(df_title).reshape(-1,1), # title click count
                    np.array(df_title_groupby),       # per-title statistics
                    np.array(new_df),                 # per-query statistics
                    ))
print(hstack_all.shape)
# `with` guarantees the output is flushed AND closed.
with open(click_combine_path,"w") as click_combine_fout:
    np.savetxt(click_combine_fout,hstack_all,delimiter=",",fmt="%.5f")

is deprecated and will be removed in a future version
  if __name__ == '__main__':


(5000000, 12)
time: 10min 50s


In [15]:
#################################### test ##########################
#################################### test ##########################
#################################### test ##########################
feature_path = "/home/kesci/work/counting_feature/test/"
test_data = "/home/kesci/input/bytedance/first-round/test.csv"

# Input files: previously computed click-count features for query and title
click_query_path = feature_path + "click_num_q.csv"
click_title_path = feature_path + "click_num_t.csv"
click_title_5dim_path = feature_path + "click_num_t_groupby_5dim.csv"
# Output file: combined click-count features
click_combine_path = feature_path + "click_num_12dim.csv"

df_query = pd.read_csv(click_query_path, header=None, lineterminator="\n")
df_title = pd.read_csv(click_title_path, names=["title_num"], header=None, lineterminator="\n")
df_title_groupby = pd.read_csv(click_title_5dim_path, header=None, lineterminator="\n")

df_qid = pd.read_csv(test_data, names=colnames, header=None, lineterminator="\n")
df_qid = df_qid["query_id"]
# query_id next to the title click count, aligned row by row
df = pd.concat((df_qid, df_title), axis=1)

# Shapes of the previously saved inputs
print(df_qid.shape,df_query.shape,df_title.shape,df_title_groupby.shape)
# Shape of the newly concatenated frame
print(df.shape)
df.head()


(5000000,) (5000000, 1) (5000000, 1) (5000000, 5)
(5000000, 2)


Unnamed: 0,query_id,title_num
0,1,1
1,1,1
2,1,1
3,1,1
4,2,4


time: 13.4 s


In [16]:
# Per-query statistics (min, max, median, mean, std) of the title click counts.
# A list of function names replaces the dict-of-lambdas passed to .agg() on a
# single column, the form this cell's own output flagged as deprecated.
df_tmp = (
    df.groupby("query_id", sort=False)["title_num"]
      .agg(["min", "max", "median", "mean", "std"])
      .rename(columns=str.upper)   # -> MIN, MAX, MEDIAN, MEAN, STD
      .reset_index()
)
# A LEFT merge keeps exactly one output row per row of `df`, in `df`'s order,
# which the column-wise hstack below depends on.
new_df = df.merge(df_tmp,on=['query_id'],how="left")[['MIN', 'MAX', 'MEDIAN', 'MEAN', 'STD']]
assert len(new_df) == len(df), "per-query statistics are not aligned with df"
hstack_all = np.hstack((
                    np.array(df_query).reshape(-1,1), # query click count
                    np.array(df_title).reshape(-1,1), # title click count
                    np.array(df_title_groupby),       # per-title statistics
                    np.array(new_df),                 # per-query statistics
                    ))
print(hstack_all.shape)
# `with` guarantees the output is flushed AND closed.
with open(click_combine_path,"w") as click_combine_fout:
    np.savetxt(click_combine_fout,hstack_all,delimiter=",",fmt="%.5f")

is deprecated and will be removed in a future version
  if __name__ == '__main__':


(5000000, 12)
time: 35min 29s


In [None]:
########################### Rank 余弦相似度特征 ################
########################### Rank 余弦相似度特征 ################
########################### Rank 余弦相似度特征 ################

In [11]:
#################################### train ##########################
feature_path = "/home/kesci/work/similarity_feature/train/"
train_data = "/home/kesci/input/bytedance/first-round/train.csv"
consine_feature_path = feature_path + "consine_sentence_sim.csv"
rank_path = feature_path + "rank_consine.csv"

# Slice of train.csv the similarity file belongs to: the last `chunksize` rows
samples = 100000000
chunksize = 5000000
skip_num = samples // chunksize - 1
print(skip_num)

# Only the first similarity column ("consine") is ranked.
consine_names = ["consine","2","3","4","5","6"]
consine_df = pd.read_csv(
    consine_feature_path, names=consine_names, header=None, lineterminator="\n"
)["consine"]
df_qid = pd.read_csv(
    train_data, names=colnames, header=None,
    skiprows=chunksize*skip_num, nrows=chunksize, lineterminator="\n"
)["query_id"]
df_combine = pd.concat((df_qid, consine_df), axis=1)
print(df_combine.shape)
print(df_combine.head(5))

# Rank of each row's similarity within its query_id group (highest similarity first).
df_combine = df_combine.assign(
    Auction_Rank=df_combine.groupby('query_id')['consine'].rank(ascending=False)
)
df_combine.to_csv(rank_path,columns=["Auction_Rank"],header=None,index=None)

19
(5000000, 2)
   query_id  consine
0  14315375  0.76487
1  14315375  0.83777
2  14315375  0.83581
3  14315375  0.84777
4  14315376  0.99269
time: 52.2 s


In [12]:
#################################### valid ##########################
feature_path = "/home/kesci/work/similarity_feature/valid/"
train_data = "/home/kesci/input/bytedance/first-round/train.csv"
consine_feature_path = feature_path + "consine_sentence_sim.csv"
rank_path = feature_path + "rank_consine.csv"

# Slice of train.csv the similarity file belongs to: the second-to-last block of `chunksize` rows
samples = 100000000
chunksize = 5000000
skip_num = samples // chunksize - 2
print(skip_num)

# Only the first similarity column ("consine") is ranked.
consine_names = ["consine","2","3","4","5","6"]
consine_df = pd.read_csv(
    consine_feature_path, names=consine_names, header=None, lineterminator="\n"
)["consine"]
df_qid = pd.read_csv(
    train_data, names=colnames, header=None,
    skiprows=chunksize*skip_num, nrows=chunksize, lineterminator="\n"
)["query_id"]
df_combine = pd.concat((df_qid, consine_df), axis=1)
print(df_combine.shape)
print(df_combine.head(5))

# Rank of each row's similarity within its query_id group (highest similarity first).
df_combine = df_combine.assign(
    Auction_Rank=df_combine.groupby('query_id')['consine'].rank(ascending=False)
)
df_combine.to_csv(rank_path,columns=["Auction_Rank"],header=None,index=None)

18
(5000000, 2)
   query_id  consine
0  13556849  0.70962
1  13556849  0.71586
2  13556849  0.75645
3  13556849  0.74697
4  13556849  0.92063
time: 53.7 s


In [13]:
#################################### test ##########################
feature_path = "/home/kesci/work/similarity_feature/test/"
test_data = "/home/kesci/input/bytedance/first-round/test.csv"
consine_feature_path = feature_path + "consine_sentence_sim.csv"
rank_path = feature_path + "rank_consine.csv"

# Only the first similarity column ("consine") is ranked.
consine_names = ["consine","2","3","4","5","6"]
consine_df = pd.read_csv(
    consine_feature_path, names=consine_names, header=None, lineterminator="\n"
)["consine"]
df_qid = pd.read_csv(
    test_data, names=colnames, header=None, lineterminator="\n"
)["query_id"]
df_combine = pd.concat((df_qid, consine_df), axis=1)
print(df_combine.shape)
print(df_combine.head(5))

# Rank of each row's similarity within its query_id group (highest similarity first).
df_combine = df_combine.assign(
    Auction_Rank=df_combine.groupby('query_id')['consine'].rank(ascending=False)
)
df_combine.to_csv(rank_path,columns=["Auction_Rank"],header=None,index=None)

(5000000, 2)
   query_id  consine
0         1  0.80942
1         1  0.80083
2         1  0.76771
3         1  0.77930
4         2  0.77215
time: 28.8 s


In [18]:
# !cat /home/kesci/work/similarity_feature/train/rank_consine.csv | head -n 10
# !cat /home/kesci/work/similarity_feature/valid/rank_consine.csv | head -n 10
# !cat /home/kesci/work/similarity_feature/test/rank_consine.csv | head -n 10

time: 845 µs


In [17]:
# Sanity check: line count of the combined click-count feature file for each split.
!cat /home/kesci/work/counting_feature/train/click_num_12dim.csv | wc -l
!cat /home/kesci/work/counting_feature/valid/click_num_12dim.csv | wc -l
!cat /home/kesci/work/counting_feature/test/click_num_12dim.csv | wc -l

# Show the last line of each file to eyeball the number of columns and the values.
!cat /home/kesci/work/counting_feature/train/click_num_12dim.csv | tail -n 1
!cat /home/kesci/work/counting_feature/valid/click_num_12dim.csv | tail -n 1
!cat /home/kesci/work/counting_feature/test/click_num_12dim.csv | tail -n 1

5000000
5000000
5000000
8.00000,4.00000,8.00000,12.00000,10.50000,10.25000,1.47902,2.00000,7.00000,4.00000,4.12500,1.55265
20.00000,1.00000,20.00000,20.00000,20.00000,20.00000,0.00000,1.00000,64.00000,4.00000,11.00000,16.13691
8.00000,1.00000,8.00000,8.00000,8.00000,8.00000,0.00000,1.00000,3.00000,1.00000,1.25000,0.70711
time: 41.8 s


In [9]:
# Sanity check: line count of basic_feature.csv for each split.
!cat /home/kesci/work/counting_feature/train/basic_feature.csv | wc -l
!cat /home/kesci/work/counting_feature/valid/basic_feature.csv | wc -l
!cat /home/kesci/work/counting_feature/test/basic_feature.csv | wc -l

5000000
5000000
5000000
time: 5.22 s
