## df2libffm

将DataFrame数据格式转换为FM类算法的libffm数据格式，以Criteo's Kaggle display advertising challenge数据集为例。  
Reference: https://www.kaggle.com/scirpus/libffm-generator-lb-280  
Dataset：  http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset

In [1]:
import numpy as np
import pandas as pd

### 数据读取

In [2]:
# The full dataset is too large to load, so only the first 1000 rows are read.
data = pd.read_table(
    '../raw/train.txt',
    sep='\t',
    header=None,
    nrows=1000,
)

In [3]:
# Column 0 is the class label, columns 1-13 are numeric features and
# columns 14-39 are categorical features.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [9]:
# Fill missing values: every missing cell is replaced with 0, in place.
data.fillna(value=0, inplace=True)

### df2libffm类

In [15]:
class df2libffm:
    """Convert pandas DataFrames into libffm text files.

    Every output line has the form ``label field:feature:value ...``.
    Columns are split into three kinds, recorded in ``catdict``:

    * ``0`` - numeric: one field and one feature per column; the cell value
      is written as the feature value.
    * ``1`` - single-valued categorical: one field per column and one
      feature per distinct value seen by ``build``; written with value 1.
    * ``2`` - one-hot encoded multi-valued categorical: columns sharing the
      text before the first ``_`` of their name share one field, every
      column is its own feature, and it is written only when the cell
      equals 1.

    Attributes:
        catdict: column name -> kind code (0, 1 or 2), in insertion order.
        field_ids: field key -> integer field id.
        feat_ids: column name -> feature id (kinds 0 and 2), or
            column name -> {category value -> feature id} (kind 1).
        fieldcode: next unused field id.
        featcode: next unused feature id.
    """

    # Kind codes stored in ``catdict``.
    _NUMERIC = 0
    _CATEGORICAL = 1
    _ONE_HOT = 2

    # Values accepted for the ``dtype`` argument of ``gen``.
    _DTYPES = ('train', 'valid', 'test')

    def __init__(self, feas_n, feas_c, feas_oh):
        """Record the kind of every feature column.

        Args:
            feas_n: names of the numeric feature columns.
            feas_c: names of the single-valued categorical columns.
            feas_oh: names of the one-hot encoded columns (strings; the part
                before the first ``_`` identifies the shared field).
        """
        self.catdict = {}
        for x in feas_n:
            self.catdict[x] = self._NUMERIC
        for x in feas_c:
            self.catdict[x] = self._CATEGORICAL
        for x in feas_oh:
            self.catdict[x] = self._ONE_HOT
        self.field_ids = {}
        self.feat_ids = {}
        self.fieldcode = 0
        self.featcode = 0

    @staticmethod
    def _one_hot_field(col):
        """Return the field key shared by one-hot columns: text before the first '_'."""
        return col.split('_')[0]

    def _register_column(self, col, kind):
        """Assign the field id (and any value-independent feature id) of ``col`` once."""
        if kind == self._NUMERIC:
            if col not in self.field_ids:
                self.field_ids[col] = self.fieldcode
                self.fieldcode += 1
                self.feat_ids[col] = self.featcode
                self.featcode += 1
        elif kind == self._CATEGORICAL:
            if col not in self.field_ids:
                self.field_ids[col] = self.fieldcode
                self.fieldcode += 1
                self.feat_ids[col] = {}
        elif kind == self._ONE_HOT:
            field = self._one_hot_field(col)
            if field not in self.field_ids:
                self.field_ids[field] = self.fieldcode
                self.fieldcode += 1
            # Every one-hot column needs its own feature id, not only the
            # first column that introduces the shared field.
            if col not in self.feat_ids:
                self.feat_ids[col] = self.featcode
                self.featcode += 1

    def _register_value(self, col, value):
        """Assign a feature id to ``value`` of categorical column ``col`` if it is new."""
        codes = self.feat_ids[col]
        if value not in codes:
            codes[value] = self.featcode
            self.featcode += 1

    def build(self, train, test):
        """Assign field and feature ids from the train and test frames.

        Ids are handed out in row order and, within a row, in ``catdict``
        order. Calling ``build`` again only adds ids for columns or category
        values that have not been seen before.

        Args:
            train: DataFrame containing every categorical column.
            test: DataFrame containing every categorical column.
        """
        # Only categorical columns need their cell values inspected; the
        # column lists come from the instance, not from module globals.
        value_cols = [col for col, kind in self.catdict.items()
                      if kind == self._CATEGORICAL]
        frame = pd.concat([train[value_cols], test[value_cols]], axis=0)
        for _, row in frame.iterrows():
            values = row.to_dict()
            for col, kind in self.catdict.items():
                self._register_column(col, kind)
                if kind == self._CATEGORICAL:
                    self._register_value(col, values[col])
        # With no rows the loop above never runs; still give every column
        # its field id (and numeric / one-hot feature id).
        for col, kind in self.catdict.items():
            self._register_column(col, kind)

    def gen(self, df, path, dtype, label=0):
        """Write ``df`` to ``path`` in libffm format, one line per row.

        Args:
            df: DataFrame holding every feature column (and the label column
                unless ``dtype`` is ``'test'``).
            path: output file path; the file is overwritten.
            dtype: ``'train'`` or ``'valid'`` to write the label cast to
                int, ``'test'`` to write a constant 0 label.
            label: name of the label column. Defaults to ``0``.

        Raises:
            ValueError: if ``dtype`` is not one of the accepted values.
            KeyError: if a column or a categorical value was not registered
                by ``build``.
        """
        if dtype not in self._DTYPES:
            # Any other value would produce lines without a label.
            raise ValueError(
                "dtype must be one of {}, got {!r}".format(self._DTYPES, dtype))
        with open(path, "w") as text_file:
            for _, row in df.iterrows():
                datarow = row.to_dict()
                # First token: the target.
                if dtype == 'test':
                    parts = ['0']
                else:
                    parts = [str(int(datarow[label]))]
                # Remaining tokens: encoded features, in catdict order.
                for col, kind in self.catdict.items():
                    if kind == self._NUMERIC:
                        parts.append("{}:{}:{}".format(
                            self.field_ids[col], self.feat_ids[col],
                            datarow[col]))
                    elif kind == self._CATEGORICAL:
                        parts.append("{}:{}:1".format(
                            self.field_ids[col],
                            self.feat_ids[col][datarow[col]]))
                    elif kind == self._ONE_HOT:
                        if datarow[col] == 1:
                            parts.append("{}:{}:1".format(
                                self.field_ids[self._one_hot_field(col)],
                                self.feat_ids[col]))
                text_file.write(" ".join(parts) + "\n")

### execute

In [16]:
# Split the feature columns by kind: columns 1-13 are numeric, the remaining
# columns are single-valued categorical, and there are no one-hot columns.
feas_n = data.columns[1:14].tolist()
feas_c = data.columns[14:].tolist()
feas_oh = []

# Build the id mappings, passing the first 500 rows as "train" and the rest
# as "test".
df_ffm = df2libffm(feas_n=feas_n, feas_c=feas_c, feas_oh=feas_oh)
df_ffm.build(train=data.iloc[:500], test=data.iloc[500:])

In [19]:
# Write the whole frame, labels included, as a libffm training file.
path = '../raw/libffm/train.txt'
df_ffm.gen(df=data, path=path, dtype='train')