## Kaggle event 推荐比赛 — 特征工程
By Johnkle
* 数据清洗与预处理
* 构建特征(包括协同过滤推荐度等复杂特征)

## 0.引入包

In [1]:
from __future__ import division

import itertools
import pickle
import datetime
import hashlib
import locale
import numpy as np
import pycountry
import scipy.io as sio
import scipy.sparse as ss
import scipy.spatial.distance as ssd

from collections import defaultdict
from sklearn.preprocessing import normalize

## 1.数据清洗类

In [19]:
class DataCleaner():
    """
    Common utilities for converting strings to equivalent numbers
    or number buckets.
    """
    def __init__(self):
        # 载入 locales
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
        # 载入 countries
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            if c.name.lower() == "usa":
                ctryIdx["US"] = i
            if c.name.lower() == "canada":
                ctryIdx["CA"] = i
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
            # 载入 gender id 字典
            self.genderIdMap = defaultdict(int, {"male":1, "female":2})

    def getLocaleId(self, locstr):
        return self.localeIdMap[locstr.lower()]

    def getGenderId(self, genderStr):
        return self.genderIdMap[genderStr]

    def getJoinedYearMonth(self, dateString):
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join([str(dttm.year), str(dttm.month)])

    def getCountryId(self, location):
        if (isinstance(location, str)
            and len(location.strip()) > 0
            and location.rfind("  ") > -1):
            return self.countryIdMap[location[location.rindex("  ") + 2:].lower()]
        else:
            return 0

    def getBirthYearInt(self, birthYear):
        try:
            return 0 if birthYear == "None" else int(birthYear)
        except:
            return 0

    def getTimezoneInt(self, timezone):
        try:
            return int(timezone)
        except:
            return 0

    def getFeatureHash(self, value):
        if len(value.strip()) == 0:
            return -1
        else:
            return int(hashlib.sha224(value).hexdigest()[0:4], 16)

    def getFloatValue(self, value):
        if len(value.strip()) == 0:
            return 0.0
        else:
            return float(value)

## 2.处理user和event关联数据

In [20]:
class ProgramEntities():
  """
  我们只关心train和test中出现的user和event，因此重点处理这部分关联数据
  """
  def __init__(self):
    # 统计训练集中有多少独立的用户的events
    dpath = "./data/"
    uniqueUsers = set()
    uniqueEvents = set()
    eventsForUser = defaultdict(set)
    usersForEvent = defaultdict(set)
    for filename in [dpath+"train.csv", dpath+"test.csv"]:
        f = open(filename, 'r')
        f.readline().strip().split(",")
        for line in f:
            cols = line.strip().split(",")
            uniqueUsers.add(cols[0])
            uniqueEvents.add(cols[1])
            eventsForUser[cols[0]].add(cols[1])
            usersForEvent[cols[1]].add(cols[0])
        f.close()
    pickle.dump(uniqueUsers, open("./package/PE_uniqueUsers.pkl", 'wb'))
    pickle.dump(uniqueEvents, open("./package/PE_uniqueEvents.pkl", 'wb'))
    pickle.dump(eventsForUser, open("./package/PE_eventsForUser.pkl", 'wb'))
    pickle.dump(usersForEvent, open("./package/PE_usersForEvent.pkl", 'wb'))
    self.userEventScores = ss.dok_matrix((len(uniqueUsers), len(uniqueEvents)))
    self.userIndex = dict()
    self.eventIndex = dict()
    for i, u in enumerate(uniqueUsers):
        self.userIndex[u] = i
    for i, e in enumerate(uniqueEvents):
        self.eventIndex[e] = i
    ftrain = open(dpath+"train.csv", 'r')
    ftrain.readline()
    for line in ftrain:
        cols = line.strip().split(",")
        i = self.userIndex[cols[0]]
        j = self.eventIndex[cols[1]]
        self.userEventScores[i, j] = int(cols[4]) - int(cols[5])
    ftrain.close()
    pickle.dump(self.userIndex, open("./package/PE_userIndex.pkl", 'wb'))
    pickle.dump(self.eventIndex, open("./package/PE_eventIndex.pkl", 'wb'))
    sio.mmwrite("./package/PE_userEventScores", self.userEventScores)
    # 为了防止不必要的计算，我们找出来所有关联的用户 或者 关联的event
    # 所谓的关联用户，指的是至少在同一个event上有行为的用户pair
    # 关联的event指的是至少同一个user有行为的event pair
    self.uniqueUserPairs = set()
    self.uniqueEventPairs = set()
    for event in uniqueEvents:
        users = usersForEvent[event]
        if len(users) > 2:
            self.uniqueUserPairs.update(itertools.combinations(users, 2))
    for user in uniqueUsers:
        events = eventsForUser[user]
        if len(events) > 2:
            self.uniqueEventPairs.update(itertools.combinations(events, 2))
    pickle.dump(self.uniqueUserPairs, open("./package/PE_uniqueUserPairs.pkl", 'wb'))
    pickle.dump(self.uniqueEventPairs, open("./package/PE_uniqueEventPairs.pkl", 'wb'))

In [3]:
%%time
pe = ProgramEntities()

Wall time: 1.4 s


## 3.用户与用户相似度矩阵

In [31]:
class Users():
    """
    构建 user/user 相似度矩阵
    """
    def __init__(self,sim=ssd.correlation):
        userIndex = pickle.load(open("./package/PE_userIndex.pkl", 'rb'))
        uniqueUserPairs = pickle.load(open("./package/PE_uniqueUserPairs.pkl", 'rb'))
        cleaner = DataCleaner()
        nusers = len(userIndex.keys())
        fin = open("./data/users.csv", 'r')
        colnames = fin.readline().strip().split(",")
        self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
        for line in fin:
            cols = line.strip().split(",")
          # 只考虑train.csv中出现的用户
            if cols[0] in userIndex:
                i = userIndex[cols[0]]
                self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
                self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
                self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
                self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])
                self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])
        fin.close()
        # 归一化用户矩阵
        self.userMatrix = normalize(self.userMatrix, norm="l1", axis=0, copy=False)
        sio.mmwrite("./package/US_userMatrix", self.userMatrix)
        # 计算用户相似度矩阵，之后会用到
        self.userSimMatrix = ss.dok_matrix((nusers, nusers))
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0
        for u1, u2 in uniqueUserPairs:
            i = userIndex[u1]
            j = userIndex[u2]
            if (i,j) not in self.userSimMatrix:
                usim = sim(self.userMatrix.getrow(i).todense(),\
                           self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim
                self.userSimMatrix[j, i] = usim
        sio.mmwrite("./package/US_userSimMatrix", self.userSimMatrix)

In [32]:
Users()

<__main__.Users at 0x244872f5828>

## 4.用户社交关系挖掘

In [38]:
class UserFriends():
    """
    找出某用户的那些朋友，想法非常简单
    1)如果你有更多的朋友，可能你性格外向，更容易参加各种活动
    2)如果你朋友会参加某个活动，可能你也会跟随去参加一下
    """
    def __init__(self):
        userIndex = pickle.load(open("./package/PE_userIndex.pkl", 'rb'))
        eventsForUser = pickle.load(open("./package/PE_eventsForUser.pkl", 'rb'))
        userEventScores = sio.mmread("./package/PE_userEventScores")
        nusers = len(userIndex.keys())
        self.numFriends = np.zeros((nusers))
        self.userFriends = ss.dok_matrix((nusers, nusers))
        fin = open("./data/user_friends.csv", 'r')
        fin.readline()                # skip header
        ln = 0
        for line in fin:
            if ln % 200 == 0:
                print ("Loading line: ", ln)
            cols = line.strip().split(",")
            user = cols[0]
            if user in userIndex:
                friends = cols[1].split(" ")
                i = userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in userIndex:
                        j = userIndex[friend]
                        #eventsForUser不是变量，为什么赋值，导致userFriends无值
                        eventsForUser = userEventScores.getrow(j).todense()
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
            ln += 1
        fin.close()
        # 归一化数组
        sumNumFriends = self.numFriends.sum(axis=0)
        self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite("./package/UF_numFriends", np.matrix(self.numFriends))
        self.userFriends = normalize(self.userFriends, norm="l1", axis=0, copy=False)
        sio.mmwrite("./package/UF_userFriends", self.userFriends)

In [39]:
UserFriends()

Loading line:  0
Loading line:  200
Loading line:  400
Loading line:  600
Loading line:  800
Loading line:  1000
Loading line:  1200
Loading line:  1400
Loading line:  1600
Loading line:  1800
Loading line:  2000
Loading line:  2200
Loading line:  2400
Loading line:  2600
Loading line:  2800
Loading line:  3000
Loading line:  3200
Loading line:  3400
Loading line:  3600
Loading line:  3800
Loading line:  4000
Loading line:  4200
Loading line:  4400
Loading line:  4600
Loading line:  4800
Loading line:  5000
Loading line:  5200
Loading line:  5400
Loading line:  5600
Loading line:  5800
Loading line:  6000
Loading line:  6200
Loading line:  6400
Loading line:  6600
Loading line:  6800
Loading line:  7000
Loading line:  7200
Loading line:  7400
Loading line:  7600
Loading line:  7800
Loading line:  8000
Loading line:  8200
Loading line:  8400
Loading line:  8600
Loading line:  8800
Loading line:  9000
Loading line:  9200
Loading line:  9400
Loading line:  9600
Loading line:  9800
Loading

<__main__.UserFriends at 0x244859630f0>

## 5.构造event和event相似度数据

In [53]:
class Events():
    """
    构建event-event相似度，注意这里有2种相似度：
    1）由用户-event行为，类似协同过滤算出的相似度
    2）由event本身的内容(event信息)计算出的event-event相似度
    """
    def __init__(self, psim=ssd.correlation, csim=ssd.cosine):
        eventIndex = pickle.load(open("./package/PE_eventIndex.pkl", 'rb'))
        uniqueEventPairs = pickle.load(open("./package/PE_uniqueEventPairs.pkl", 'rb'))
        cleaner = DataCleaner()
        fin = open("./data/events.csv", 'r')
        fin.readline() # skip header
        nevents = len(eventIndex.keys())
        self.eventPropMatrix = ss.dok_matrix((nevents, 7))
        self.eventContMatrix = ss.dok_matrix((nevents, 100))
        ln = 0
        for line in fin.readlines():
    #      if ln > 10:
    #        break
            cols = line.strip().split(",")
            eventId = cols[0]
            if eventId in eventIndex:
                i = eventIndex[eventId]
                self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2]) # start_time
                #Unicode-objects must be encoded before hashing
                self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3].encode('utf-8')) # city
                self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4].encode('utf-8')) # state
                self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5].encode('utf-8')) # zip
                self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6].encode('utf-8')) # country
                self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7]) # lat
                self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8]) # lon
                for j in range(9, 109):
                    self.eventContMatrix[i, j-9] = cols[j]
            ln += 1
        fin.close()
        self.eventPropMatrix = normalize(self.eventPropMatrix,
            norm="l1", axis=0, copy=False)
        sio.mmwrite("./package/EV_eventPropMatrix", self.eventPropMatrix)
        self.eventContMatrix = normalize(self.eventContMatrix,
            norm="l1", axis=0, copy=False)
        sio.mmwrite("./package/EV_eventContMatrix", self.eventContMatrix)
        # calculate similarity between event pairs based on the two matrices    
        self.eventPropSim = ss.dok_matrix((nevents, nevents))
        self.eventContSim = ss.dok_matrix((nevents, nevents))
        for e1, e2 in uniqueEventPairs:
            i = eventIndex[e1]
            j = eventIndex[e2]
            if (i,j) not in self.eventPropSim:
                epsim = psim(self.eventPropMatrix.getrow(i).todense(),
                  self.eventPropMatrix.getrow(j).todense())
                self.eventPropSim[i, j] = epsim
                self.eventPropSim[j, i] = epsim
            if (i,j) not in self.eventContSim:
                ecsim = csim(self.eventContMatrix.getrow(i).todense(),\
                             self.eventContMatrix.getrow(j).todense())
                self.eventContSim[i, j] = epsim
                self.eventContSim[j, i] = epsim
        sio.mmwrite("./package/EV_eventPropSim", self.eventPropSim)
        sio.mmwrite("./package/EV_eventContSim", self.eventContSim)

In [54]:
%%time
Events()

  dist = 1.0 - uv / np.sqrt(uu * vv)


Wall time: 7min 59s


<__main__.Events at 0x24484e786d8>

## 6.活跃度/event热度 数据

In [29]:
class EventAttendees():
    """
    统计某个活动，参加和不参加的人数，从而为活动活跃度做准备
    """
    def __init__(self):
        eventIndex = pickle.load(open("./package/PE_eventIndex.pkl", 'rb'))
        nevents = len(eventIndex.keys())
        self.eventPopularity = ss.dok_matrix((nevents, 1))
        f = open("./data/event_attendees.csv", 'rb')
        f.readline() # skip header
        for line in f:
            cols = line.strip().split(",")
            eventId = cols[0]
            if eventId in eventIndex:
                i = eventIndex[eventId]
                self.eventPopularity[i, 0] = \
                len(cols[1].split(" ")) - len(cols[4].split(" "))
        f.close()
        self.eventPopularity = normalize(self.eventPopularity, norm="l1",
                                         axis=0, copy=False)
        sio.mmwrite("./package/EA_eventPopularity", self.eventPopularity)

In [59]:
EventAttendees()

<__main__.EventAttendees at 0x244850c8240>

## LFM隐语义推荐度

In [6]:
def lfm_train(train_path, F, alpha, beta, step):
    """
    train LFM model,get latent factor user_vec and event_vec
    Args:
        train_data: train_data for lfm
        F: user vector len, event vector len
        alpha:regularization factor
        beta: learning rate
        step: iteration number
    Return:
        dict: key eventid, value:np.ndarray
        dict: key userid, value:np.ndarray
    """
    user_vec = {}
    event_vec = {}
    count = 0
    for step in range(step):
        fin = open(train_path,"r+")
        start = 0
        #每次取一行，随机梯度下降？
        for line in fin:
            if start == 0:
                start += 1
                continue
            cols = line.strip().split(",")
            userid,eventid,label = cols[0],cols[1],cols[-2]
            if userid not in user_vec:
                user_vec[userid] = np.random.randn(F)
            if eventid not in event_vec:
                event_vec[eventid] = np.random.randn(F)
            #label是str，需转换为int
            delta = int(label)-lfm_score(user_vec[userid],event_vec[eventid])
            for i in range(F):
                user_vec[userid][i] += beta*(delta*event_vec[eventid][i]\
                                            -alpha*user_vec[userid][i])
                event_vec[eventid][i] += beta*(delta*user_vec[userid][i]\
                                            -alpha*event_vec[eventid][i])
            count += 1
            #第1轮不更新学习率
            if step == 0:
                continue
            #每2000个样本更新一次学习率
            if count%2000==0:
                beta *= 0.95
            if count%5000==0:
                print("step %d,count %d,learning rate %g:"%(step, count, beta))
    pickle.dump(user_vec,open("./package/LFM_user_vec.pkl","wb"))
    pickle.dump(event_vec,open("./package/LFM_event_vec.pkl","wb"))

def lfm_score(user_vector,event_vector):
    """
    user_vector and event_vector distance
    Args:
        user_vector: lfm model produce user vector
        event_vector: lfm model produce event vector
    Return:
         lfm recommend score
    """
    score = np.dot(user_vector, event_vector)/\
                (np.linalg.norm(user_vector)*np.linalg.norm(event_vector))
    return score

In [7]:
%%time
lfm_train("./data/train.csv", 40, 0.01, 0.1, 20)  

step 1,count 20000,learning rate 0.0857375:
step 1,count 25000,learning rate 0.0773781:
step 1,count 30000,learning rate 0.066342:
step 2,count 35000,learning rate 0.0598737:
step 2,count 40000,learning rate 0.0513342:
step 2,count 45000,learning rate 0.0463291:
step 3,count 50000,learning rate 0.0397214:
step 3,count 55000,learning rate 0.0358486:
step 3,count 60000,learning rate 0.0307357:
step 4,count 65000,learning rate 0.027739:
step 4,count 70000,learning rate 0.0237827:
step 4,count 75000,learning rate 0.0214639:
step 5,count 80000,learning rate 0.0184026:
step 5,count 85000,learning rate 0.0166083:
step 5,count 90000,learning rate 0.0142396:
step 6,count 95000,learning rate 0.0128512:
step 6,count 100000,learning rate 0.0110183:
step 6,count 105000,learning rate 0.00994403:
step 7,count 110000,learning rate 0.00852576:
step 7,count 115000,learning rate 0.0076945:
step 7,count 120000,learning rate 0.00659707:
step 8,count 125000,learning rate 0.00595386:
step 8,count 130000,lear

## 7.串起所有的数据处理和准备流程

In [30]:
%%time
def data_prepare():
    """
    计算生成所有的数据，用矩阵或者其他形式存储方便后续提取特征和建模
    """
    #print "第1步：统计user和event相关信息..."
    #pe = ProgramEntities()
    #print "第1步完成...\n"
    
    print ("第2步：计算用户相似度信息，并用矩阵形式存储...")
    Users()
    print ("第2步完成...\n")
    
    print ("第3步：计算用户社交关系信息，并存储...")
    UserFriends()
    print ("第3步完成...\n")
    
    print ("第4步：计算event相似度信息，并用矩阵形式存储...")
    Events()
    print ("第4步完成...\n")
    
    print ("第5步：计算event热度信息...")
    EventAttendees()
    print ("第5步完成...\n")

# 运行进行数据准备
data_prepare()

第2步：计算用户相似度信息，并用矩阵形式存储...


AttributeError: has_key not found

## 8.构建特征

In [18]:
# 这是构建特征部分
from __future__ import division

import pickle
import numpy as np
import scipy.io as sio

class DataRewriter:
    def __init__(self):
        # 读入数据做初始化
        self.userIndex = pickle.load(open("./package/PE_userIndex.pkl", 'rb'))
        self.eventIndex = pickle.load(open("./package/PE_eventIndex.pkl", 'rb'))
        self.userEventScores = sio.mmread("./package/PE_userEventScores").todense()
        self.userSimMatrix = sio.mmread("./package/US_userSimMatrix").todense()
        self.eventPropSim = sio.mmread("./package/EV_eventPropSim").todense()
        self.eventContSim = sio.mmread("./package/EV_eventContSim").todense()
        self.numFriends = sio.mmread("./package/UF_numFriends")
        self.userFriends = sio.mmread("./package/UF_userFriends").todense()
        self.eventPopularity = sio.mmread("./package/EA_eventPopularity").todense()
        self.user_vec = pickle.load(open("./package/LFM_user_vec.pkl", 'rb'))
        self.event_vec = pickle.load(open("./package/LFM_event_vec.pkl", 'rb'))
    
    def userReco(self, userId, eventId):
        """
        根据User-based协同过滤，得到event的推荐度
        基本的伪代码思路如下：
        for item i
          for every other user v that has a preference for i
            compute similarity s between u and v
            incorporate v's preference for i weighted by s into running aversge
        return top items ranked by weighted average
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        vs = self.userEventScores[:, j]
        sims = self.userSimMatrix[i, :]
        prod = sims * vs
        try:
            return prod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            return 0

    def eventReco(self, userId, eventId):
        """
        根据基于物品的协同过滤，得到Event的推荐度
        基本的伪代码思路如下：
        for item i 
          for every item j tht u has a preference for
            compute similarity s between i and j
            add u's preference for j weighted by s to a running average
        return top items, ranked by weighted average
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        js = self.userEventScores[i, :]
        psim = self.eventPropSim[:, j]
        csim = self.eventContSim[:, j]
        pprod = js * psim
        cprod = js * csim
        pscore = 0
        cscore = 0
        try:
            pscore = pprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
        try:
            cscore = cprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
        return pscore, cscore

    def userPop(self, userId):
        """
        基于用户的朋友个数来推断用户的社交程度
        主要的考量是如果用户的朋友非常多，可能会更倾向于参加各种社交活动
        """
        if userId in self.userIndex:
            i = self.userIndex[userId]
            try:
                return self.numFriends[0, i]
            except IndexError:
                return 0
        else:
            return 0

    def friendInfluence(self, userId):
        """
        朋友对用户的影响
        主要考虑用户所有的朋友中，有多少是非常喜欢参加各种社交活动/event的
        用户的朋友圈如果都积极参与各种event，可能会对当前用户有一定的影响
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        return (self.userFriends[i, :].sum(axis=0) / nusers)[0,0]

    def eventPop(self, eventId):
        """
        本活动本身的热度
        主要是通过参与的人数来界定的
        """
        i = self.eventIndex[eventId]
        return self.eventPopularity[i, 0]
    
    def lfmReco(self,userId,eventId):
        """
        基于LFM的推荐度
        """
        #为什么不加这句会报错
        lfm_res = 0.
        if userId in self.user_vec and eventId in self.event_vec:
            lfm_res = lfm_score(self.user_vec[userId],self.event_vec[eventId])
            lfm_res = np.around(lfm_res,decimals=5)
        return lfm_res

    def rewriteData(self, start=1, train=True, header=True):
        """
        把前面user-based协同过滤 和 item-based协同过滤，以及各种热度和影响度作为特征组合在一起
        生成新的训练数据，用于分类器分类使用
        """
        dpath = "./data/"
        fn = "train.csv" if train else "test.csv"
        fin = open(dpath+fn, 'r')
        fout = open(dpath + "data_" + fn, 'w')
        # write output header
        if header:
            ocolnames = ["invited", "user_reco", "evt_p_reco","evt_c_reco",\
                         "user_pop", "frnd_infl", "evt_pop", "lfm_reco"]
            if train:
                ocolnames.append("interested")
                ocolnames.append("not_interested")
            fout.write(",".join(ocolnames) + "\n")
        ln = 0
        for line in fin:
            ln += 1
            if ln < start:
                continue
            cols = line.strip().split(",")
            userId = cols[0]
            eventId = cols[1]
            invited = cols[2]
            if ln%500 == 0:
                print ("%s:%d (userId, eventId)=(%s, %s)" % (fn, ln, userId, eventId))
            user_reco = self.userReco(userId, eventId)
            evt_p_reco, evt_c_reco = self.eventReco(userId, eventId)
            user_pop = self.userPop(userId)
            frnd_infl = self.friendInfluence(userId)
            evt_pop = self.eventPop(eventId)
            lfm_reco = self.lfmReco(userId,eventId)
            ocols = [invited, user_reco, evt_p_reco,evt_c_reco,\
                     user_pop, frnd_infl, evt_pop, lfm_reco]
            if train:
                ocols.append(cols[4]) # interested
                ocols.append(cols[5]) # not_interested
            fout.write(",".join(map(lambda x: str(x), ocols)) + "\n")
        fin.close()
        fout.close()

    def rewriteTrainingSet(self):
        self.rewriteData(True)

    def rewriteTestSet(self):
        self.rewriteData(False)

In [19]:
%%time
dr = DataRewriter()
print ("生成训练数据...\n")
dr.rewriteData(train=True, start=2, header=True)
print ("生成预测数据...\n")
dr.rewriteData(train=False, start=2, header=True)

生成训练数据...

train.csv:500 (userId, eventId)=(123290209, 1887085024)
train.csv:1000 (userId, eventId)=(272886293, 199858305)
train.csv:1500 (userId, eventId)=(395305791, 1582270949)
train.csv:2000 (userId, eventId)=(527523423, 3272728211)
train.csv:2500 (userId, eventId)=(651258472, 792632006)
train.csv:3000 (userId, eventId)=(811791433, 524756826)
train.csv:3500 (userId, eventId)=(985547042, 1269035551)
train.csv:4000 (userId, eventId)=(1107615001, 173949238)
train.csv:4500 (userId, eventId)=(1236336671, 3849306291)
train.csv:5000 (userId, eventId)=(1414301782, 2652356640)
train.csv:5500 (userId, eventId)=(1595465532, 955398943)
train.csv:6000 (userId, eventId)=(1747091728, 2131379889)
train.csv:6500 (userId, eventId)=(1914182220, 955398943)
train.csv:7000 (userId, eventId)=(2071842684, 1076364848)
train.csv:7500 (userId, eventId)=(2217853337, 3051438735)
train.csv:8000 (userId, eventId)=(2338481531, 2525447278)
train.csv:8500 (userId, eventId)=(2489551967, 520657921)
train.csv:9000 (us