## 构造特征

构造user_pop,item_pop,user_rate,item_rate,lfm_reco

In [1]:
from sklearn.tree import DecisionTreeClassifier
from lightgbm.sklearn import LGBMClassifier
import pandas as pd
import numpy as np
import pickle

In [2]:
# Directory prefix (with trailing slash) that is string-concatenated in front
# of every data/pickle file name read or written below.
dpath = "./data/"

### 1.构造lfm_reco

In [3]:
def lfm_train(train_data, F, alpha, beta, step):
    """
    Train the LFM model with per-sample gradient updates and pickle the
    resulting latent vectors.

    Args:
        train_data: file name (appended to ``dpath``) of a comma-separated
            training file whose first line is a header; column 0 is the
            user id, column 1 the item id, the last column the integer target
        F: user vector len, item vector len
        alpha: regularization factor
        beta: initial learning rate; multiplied by 0.95 after every
            100000 processed samples
        step: number of passes over the training file
    Return:
        None. The learned vectors are written to disk instead:
        ``dpath + "user_vec.pkl"``: dict, key userid, value np.ndarray
        ``dpath + "item_vec.pkl"``: dict, key itemid, value np.ndarray
    """
    user_vec = {}
    item_vec = {}
    count = 0
    for _ in range(step):
        # Re-read the file on every pass; the context manager guarantees the
        # handle is released before the next pass opens it again.
        with open(dpath + train_data, "r") as fin:
            next(fin, None)  # skip the header line
            for line in fin:
                cols = line.strip().split(",")
                userid, itemid, target = cols[0], cols[1], cols[-1]
                # Lazily create a random vector the first time an id is seen.
                if userid not in user_vec:
                    user_vec[userid] = np.random.randn(F)
                if itemid not in item_vec:
                    item_vec[itemid] = np.random.randn(F)
                u = user_vec[userid]
                v = item_vec[itemid]
                # target is read as str and must be converted to int
                delta = int(target) - lfm_score(u, v)
                # Vectorized form of the per-component update. As before, the
                # item vector is updated with the already-updated user vector.
                # NOTE(review): textbook SGD would use the pre-update user
                # vector here -- confirm which is intended.
                u += beta * (delta * v - alpha * u)
                v += beta * (delta * u - alpha * v)
                count += 1
                # decay the learning rate once every 100000 samples
                if count % 100000 == 0:
                    beta *= 0.95
    with open(dpath + "user_vec.pkl", "wb") as fout:
        pickle.dump(user_vec, fout)
    with open(dpath + "item_vec.pkl", "wb") as fout:
        pickle.dump(item_vec, fout)

In [4]:
def lfm_score(user_vector,item_vector):
    """
    Cosine similarity between a user vector and an item vector.

    Args:
        user_vector: lfm model produce user vector
        item_vector: lfm model produce item vector
    Return:
        lfm recommend score: dot(user, item) / (|user| * |item|).
        Returns 0.0 when either vector has zero norm, because the cosine is
        undefined there and the plain division would yield NaN.
    """
    norm_product = np.linalg.norm(user_vector) * np.linalg.norm(item_vector)
    # Guard the 0/0 case so a degenerate vector cannot inject NaN downstream.
    if norm_product == 0:
        return 0.0
    return np.dot(user_vector, item_vector) / norm_product

In [5]:
# Train on train_merge.csv with F=30, alpha=0.01, beta=0.1 and 10 passes.
lfm_train("train_merge.csv", 30, 0.01, 0.1, 10)  

### 2.构造user_pop和item_pop，user_rate和item_rate  
user_record = {userid: [该用户的记录数, 该用户target之和],...}  
item_record = {itemid: [该item的记录数, 该item的target之和],...}

In [8]:
def get_record_file(input_file):
    """
    Build the per-user and per-item counters and pickle them.

    Each dict maps an id to ``[n_records, target_sum]``: the number of rows
    the id appears in and the sum of the integer target over those rows.

    Args:
        input_file: file name (appended to ``dpath``) of a comma-separated
            file whose first line is a header; column 0 is the user id,
            column 1 the item id, the last column the integer target
    Return:
        None. Writes ``dpath + "user_record.pkl"`` and
        ``dpath + "item_record.pkl"``.
    """
    user_record = {}
    item_record = {}
    # The context manager closes the input even if a row fails to parse.
    with open(dpath + input_file, "r") as fin:
        next(fin, None)  # skip the header line
        for line in fin:
            cols = line.strip().split(",")
            userid, itemid = cols[0], cols[1]
            # target is read as str; it must be an int before accumulating
            target = int(cols[-1])
            user_stats = user_record.setdefault(userid, [0, 0])
            user_stats[0] += 1
            user_stats[1] += target
            item_stats = item_record.setdefault(itemid, [0, 0])
            item_stats[0] += 1
            item_stats[1] += target
    with open(dpath + "user_record.pkl", "wb") as fout:
        pickle.dump(user_record, fout)
    with open(dpath + "item_record.pkl", "wb") as fout:
        pickle.dump(item_record, fout)

In [None]:
# Build and pickle the user/item counters from the merged training file.
get_record_file("train_merge.csv")

In [None]:
# Reload the pickled counters written by get_record_file.
user_record = pickle.load(open(dpath+"user_record.pkl","rb"))
item_record = pickle.load(open(dpath+"item_record.pkl","rb"))

In [None]:
# Number of distinct user ids and item ids in the counters.
len(user_record),len(item_record)

In [None]:
# Display the user counters as the cell output.
user_record

In [9]:
def get_mean_std(user_record,item_record):
    """
    Mean and standard deviation of the second counter (index 1) of every
    user record and of every item record.

    Args:
        user_record: dict whose values are sequences; element 1 is used
        item_record: dict whose values are sequences; element 1 is used
    Return:
        tuple: (user_pop_mean, user_pop_std, item_pop_mean, item_pop_std)
    """
    # Pull out the index-1 value of each record once per dict.
    user_values = np.array([record[1] for record in user_record.values()])
    item_values = np.array([record[1] for record in item_record.values()])

    return (
        user_values.mean(),
        user_values.std(),
        item_values.mean(),
        item_values.std(),
    )

In [None]:
# Mean/std of the index-1 counter over all users and over all items.
user_pop_mean,user_pop_std,item_pop_mean,item_pop_std = get_mean_std(user_record,item_record)

In [None]:
# Display the four statistics as the cell output.
user_pop_mean,user_pop_std,item_pop_mean,item_pop_std

### 3.把user_pop,item_pop,user_rate,item_rate,lfm_reco写入文件保存，生成最终训练文件

问题记录：user_rate和item_rate的取值为何一样？  
原因：构造outcols时把item_rate误写成了user_rate；下面的代码已分别写入user_rate和item_rate。

In [10]:
def generate_train_final(input_file,output_file):
    """
    Generate the final train file by inserting five feature columns
    (user_pop, item_pop, user_rate, item_rate, lfm_reco) in front of the
    last column of every row.

    Reads ``user_vec.pkl``, ``item_vec.pkl``, ``user_record.pkl`` and
    ``item_record.pkl`` from ``dpath``. Rows whose user id or item id has no
    trained vector are dropped from the output.

    Args:
        input_file: file name (appended to ``dpath``) of a comma-separated
            file whose first line is a header; column 0 is the user id and
            column 1 the item id
        output_file: file name (appended to ``dpath``) to write
    Return:
        None
    """
    def _load_pickle(name):
        # Close the handle as soon as the object has been read.
        with open(dpath + name, "rb") as handle:
            return pickle.load(handle)

    user_vec = _load_pickle("user_vec.pkl")
    item_vec = _load_pickle("item_vec.pkl")
    user_record = _load_pickle("user_record.pkl")
    item_record = _load_pickle("item_record.pkl")
    user_pop_mean, user_pop_std, item_pop_mean, item_pop_std = get_mean_std(
        user_record, item_record
    )
    new_names = ["user_pop", "item_pop", "user_rate", "item_rate", "lfm_reco"]

    # Both files are closed even if a row raises part-way through.
    with open(dpath + input_file, "r") as fin, \
            open(dpath + output_file, "w") as fout:
        header = next(fin, None)
        if header is None:
            # Empty input: nothing to write (the output file stays empty).
            return
        # write the column names
        cols = header.strip().split(",")
        fout.write(",".join(cols[:-1] + new_names + [cols[-1]]) + "\n")

        for line in fin:
            cols = line.strip().split(",")
            userid, itemid = cols[0], cols[1]
            # Skip rows without a trained vector before doing any feature work.
            if userid not in user_vec or itemid not in item_vec:
                continue
            # Records are [n_records, target_sum].
            user_count, user_hits = user_record[userid]
            item_count, item_hits = item_record[itemid]
            # user_pop / item_pop: z-score of the target sum
            user_pop = round((user_hits - user_pop_mean) / user_pop_std, 3)
            item_pop = round((item_hits - item_pop_mean) / item_pop_std, 3)
            # user_rate / item_rate: target sum divided by record count
            user_rate = round(user_hits / user_count, 3)
            item_rate = round(item_hits / item_count, 3)
            # lfm_reco: model score of the (user, item) pair
            lfm_reco = np.around(
                lfm_score(user_vec[userid], item_vec[itemid]), decimals=5
            )
            outcols = cols[:-1] + [
                str(user_pop), str(item_pop),
                str(user_rate), str(item_rate),
                str(lfm_reco),
            ] + [cols[-1]]
            # write the row
            fout.write(",".join(outcols) + "\n")

In [11]:
# Write train_final.csv from train_merge.csv with the five feature columns added.
generate_train_final("train_merge.csv","train_final.csv")

In [12]:
# Load the generated file and preview its first rows.
train_final = pd.read_csv(dpath+"train_final.csv")
train_final.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,song_length,genre_ids,language,mult_genre,city,...,gender,registered_via,registration_init_time,expiration_date,user_pop,item_pop,user_rate,item_rate,lfm_reco,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,1,5,4,-0.25183,10,8,0,1,...,2,7,8,13,15.581,0.795,0.506,0.474,0.59038,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,3,6,3,0.23361,4,8,0,13,...,0,9,7,13,1.991,-0.081,0.743,1.0,0.21555,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,3,6,3,-0.13422,4,8,0,13,...,0,9,7,13,1.991,-0.072,0.743,0.5,-0.08262,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,3,6,3,0.05294,17,0,0,13,...,0,9,7,13,1.991,-0.081,0.743,1.0,0.12441,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,1,5,4,-0.36784,1,8,0,1,...,2,7,8,13,15.581,1.211,0.506,0.364,0.55454,1


In [13]:
# (rows, columns) of the final training frame.
train_final.shape

(7377403, 21)