### LFM Model

In [12]:
from sklearn.tree import DecisionTreeClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import lightgbm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import scipy.io as sio

In [3]:
def lfm_train(train_path, F, alpha, beta, step):
    """
    Train the LFM model with per-sample (stochastic) gradient updates and
    pickle the learned latent factors user_vec and event_vec.

    The file at train_path is re-read once per epoch. Its first line is
    skipped as a header; every other non-blank line is split on "," and read
    as userid = first field, eventid = second field, label = second-to-last
    field (converted with int()).

    Args:
        train_path: path of the comma-separated training file
        F: user vector len, event vector len
        alpha: regularization factor
        beta: initial learning rate; from the second epoch on it is multiplied
            by 0.95 whenever the running sample count is a multiple of 2000
        step: iteration number (passes over the training file)
    Return:
        None. The results are written as pickles:
        "./package/LFM_user_vec.pkl": dict, key userid, value np.ndarray
        "./package/LFM_event_vec.pkl": dict, key eventid, value np.ndarray
    """
    user_vec = {}
    event_vec = {}
    # Running sample counter; it is NOT reset between epochs.
    count = 0
    # `epoch` instead of re-using `step`, so the parameter is not shadowed.
    for epoch in range(step):
        # Read-only mode is enough, and `with` closes the handle every epoch.
        with open(train_path, "r") as fin:
            # Skip the header line (no-op on an empty file).
            next(fin, None)
            # One line at a time: stochastic gradient descent.
            for line in fin:
                stripped = line.strip()
                if not stripped:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                cols = stripped.split(",")
                userid, eventid, label = cols[0], cols[1], cols[-2]
                # Unseen ids get a random standard-normal starting vector.
                if userid not in user_vec:
                    user_vec[userid] = np.random.randn(F)
                if eventid not in event_vec:
                    event_vec[eventid] = np.random.randn(F)
                u = user_vec[userid]
                e = event_vec[eventid]
                # label is a str and must be converted to int.
                delta = int(label) - lfm_score(u, e)
                # In-place vector updates (they mutate the arrays stored in the
                # dicts). Same element-wise arithmetic as updating one
                # component at a time: the event update deliberately still
                # sees the already-updated user vector.
                # NOTE(review): textbook SGD would use the pre-update user
                # vector for the event step - confirm which one is intended.
                u += beta * (delta * e - alpha * u)
                e += beta * (delta * u - alpha * e)
                count += 1
                # The learning rate is not decayed (and nothing is logged)
                # during the first epoch.
                if epoch == 0:
                    continue
                # Decay the learning rate every 2000 samples.
                if count % 2000 == 0:
                    beta *= 0.95
                if count % 5000 == 0:
                    print("step %d,count %d,learning rate %g:"%(epoch, count, beta))
    # `with` guarantees the pickles are flushed and the handles closed.
    with open("./package/LFM_user_vec.pkl", "wb") as user_out:
        pickle.dump(user_vec, user_out)
    with open("./package/LFM_event_vec.pkl", "wb") as event_out:
        pickle.dump(event_vec, event_out)

def lfm_score(user_vector,event_vector):
    """
    Cosine similarity between user_vector and event_vector:
    dot(u, e) / (||u|| * ||e||).

    Args:
        user_vector: lfm model produce user vector
        event_vector: lfm model produce event vector
    Return:
        lfm recommend score; 0.0 when either vector has zero norm, since the
        cosine is undefined there and dividing would yield nan
    """
    norm_product = np.linalg.norm(user_vector) * np.linalg.norm(event_vector)
    # Guard the division: a zero-norm vector would otherwise produce nan
    # (plus a RuntimeWarning), which then poisons every later gradient step.
    if norm_product == 0:
        return 0.0
    return np.dot(user_vector, event_vector) / norm_product

In [13]:
# Load the feature table from CSV into the notebook-global `data_train`,
# which the following cells read.
data_train = pd.read_csv("./data/data_train.csv")
# Trailing expression: the notebook displays the (rows, columns) tuple.
data_train.shape

(15398, 10)

In [14]:
# Preview the first rows of the table to check the column layout.
data_train.head()

Unnamed: 0,invited,user_reco,evt_p_reco,evt_c_reco,user_pop,frnd_infl,evt_pop,lfm_reco,interested,not_interested
0,0,0.0,0.980097,0.980097,0.000231,0.0,-3.9e-05,0.00548,0,0
1,0,0.0,0.1821861,0.1821861,0.000231,0.0,1.8e-05,-0.12674,0,0
2,0,143.279558,-1.0,-1.0,0.000231,0.0,0.000173,0.68928,1,0
3,0,0.0,1.175019,1.175019,0.000231,0.0,1.6e-05,0.04961,0,0
4,0,32.328494,2.239399e-07,2.239399e-07,0.000231,0.0,6.4e-05,0.1832,0,0


In [15]:
# Columns are picked by position from the end of the table.
# NOTE(review): per the head() output shown in this notebook these are
# presumably `interested` (-2) and `lfm_reco` (-3) - confirm that the column
# order of the CSV is stable.
y_train = np.array(data_train.iloc[:,-2])
lfm_proba = np.array(data_train.iloc[:,-3])

In [17]:
# ROC AUC of the values in `lfm_proba` used directly as ranking scores
# against the labels in `y_train`.
# NOTE(review): both arrays come from data_train.csv; if the LFM factors were
# fitted on the same rows this is an in-sample figure - confirm.
auc = roc_auc_score(y_train,lfm_proba)
print(auc)

0.9992725911668442


In [16]:
# Persist the LFM scores and the labels as pickles. `with` closes each file,
# so the data is flushed to disk and no handle is left open in the kernel.
with open("./result/lfm_pred.pkl", "wb") as pred_file:
    pickle.dump(lfm_proba, pred_file)
with open("./result/y_train.pkl", "wb") as label_file:
    pickle.dump(y_train, label_file)