## 在使用xLearn前，要先把資料轉成LibFFM的格式

In [6]:
import pandas as pd

In [7]:
# Load the sample data. No index column is requested, so pandas builds a default
# 0..n-1 index and the file's first, unnamed column appears as 'Unnamed: 0'.
df = pd.read_csv('train.tiny.csv')

In [8]:
# Preview the first rows to check the column layout and where values are missing.
df.head()

Unnamed: 0,Id,Label,I1,I2,I3,I4,I5,I6,I7,I8,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,10000000,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,10000001,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,10000002,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,10000003,0,,893,,,4392.0,,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,10000004,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [9]:
# All column names except the first one.
# NOTE(review): `column` is not referenced by any later cell visible here — confirm it is still needed.
column = df.columns.tolist()[1:]

In [10]:
def FillNa(df):
    """Return a copy of ``df`` with missing values replaced.

    Columns selected by the 'float'/'int' dtypes have missing entries set to 0.
    Columns of dtype 'object' have missing entries set to the string 'NAN', and
    the literal 'XNA' is mapped to 'NAN' as well. The input frame is not modified.
    """
    filled = df.copy()
    numeric_cols = filled.select_dtypes(['float', 'int']).columns.tolist()
    text_cols = filled.select_dtypes(['object']).columns.tolist()

    filled[numeric_cols] = filled[numeric_cols].fillna(0)
    for name in text_cols:
        # Fill first, then fold the 'XNA' placeholder into the same 'NAN' bucket.
        filled[name] = filled[name].fillna('NAN').replace('XNA', 'NAN')
    return filled

In [11]:
# Fill missing values before encoding. `df` is rebound here, so the raw frame
# can only be recovered by re-running the load cell.
df = FillNa(df)

In [13]:
# Columns that must not be encoded as model inputs: the target plus the two
# row-identifier columns. 'Unnamed: 0' looks like a saved row index and 'Id' is a
# per-sample id; left in, they would be written as numeric features with raw id
# magnitudes (e.g. 10000000).
_non_features = {'Label', 'Unnamed: 0', 'Id'}
Categories = df.select_dtypes('object').columns.tolist() # categorical columns
Features = [i for i in df.columns.tolist() if i not in _non_features] # model input columns (target and ids excluded)
_categorical = set(Categories)  # set lookup instead of rebuilding a list on every iteration
Numerics = [i for i in Features if i not in _categorical] # numeric columns

In [14]:
# Last feature id handed out so far; starts at the number of numeric columns and is
# incremented before each new categorical value is assigned an id.
current_code = len(Numerics)
# column name -> kind flag (0 = numeric, 1 = categorical); populated in the next cell.
catdict = {}
# categorical column name -> {category value: feature id}; populated while rows are written.
catcodes = {}

In [15]:
# Flag every column by kind: 0 = numeric, 1 = categorical.
# Numeric columns are inserted first, so they come first when catdict is iterated.
catdict.update(dict.fromkeys(Numerics, 0))
catdict.update(dict.fromkeys(Categories, 1))

In [16]:
# Count of input columns (not read again by the cells visible here).
columns = len(Features)
# Number of data rows to serialise.
nrows = len(df.index)

In [17]:
# Serialise every row of `df` as one libffm line:
#   <label> <field>:<feature>:<value> <field>:<feature>:<value> ...
# NOTE(review): `df` was loaded from 'train.tiny.csv' but is written to "test_ffm.txt",
# while the training cell reads "train_ffm.txt" — confirm the intended file name.
with open("test_ffm.txt", 'w') as text_file:
    for pos in range(nrows):
        # Positional access: `pos` is a row counter, not an index label, so this
        # keeps working when the frame's index is not the default 0..n-1.
        datarow = df.iloc[pos].to_dict()
        tokens = [str(int(datarow['Label']))]
        for field, name in enumerate(catdict):
            if catdict[name] == 0:
                # Numerical variable: the field id doubles as the feature id and
                # the raw value is written as-is.
                tokens.append(f"{field}:{field}:{datarow[name]!s}")
            else:
                # Categorical variable: one-hot style, each distinct value of the
                # column gets its own feature id the first time it is seen.
                codes = catcodes.setdefault(name, {})
                level = datarow[name]
                if level not in codes:
                    current_code += 1  # ids continue after the numeric columns
                    codes[level] = current_code
                tokens.append(f"{field}:{int(codes[level])}:1")
        text_file.write(" ".join(tokens) + '\n')

## 開始使用xLearn建模 

Reference: https://xlearn-doc.readthedocs.io/en/latest/python_api.html

In [18]:
import xlearn as xl

# Training task
ffm_model = xl.create_ffm() # Use field-aware factorization machine
# NOTE(review): no cell visible in this notebook writes "train_ffm.txt" (the conversion
# cell writes "test_ffm.txt") — confirm the file exists before running this cell.
ffm_model.setTrain("train_ffm.txt")  # Training data
# Validation is currently disabled; uncomment the next line to evaluate during training.
#ffm_model.setValidate("test_ffm.txt")  # Validation data

# param:
#  0. task: binary classification
#  1. learning rate(lr): 0.2
#  2. regular lambda(lambda): 0.002
#  3. evaluation metric(metric): acc, prec, f1, auc (classification) rmse, mae, mape (regression)
#  4. optimization(opt): sgd, adagrad, ftrl
#     ('opt' is not set in the dict below, so the library's default optimizer is used)
param = {'task':'binary', 'lr':0.2,
         'lambda':0.002, 'metric':'acc'}

# Start to train
# The trained model will be stored in model.out
ffm_model.fit(param, './model.out')

# Cross-validation with the same hyper-parameters; the number of folds is not set
# here, so the library default applies.
ffm_model.cv(param)

In [19]:
# Prediction task
# Reuses `ffm_model` from the training cell and the model file it saved.
ffm_model.setTest("./test_ffm.txt")  # Test data

# Two output conversions are offered; only setSign is active.
#ffm_model.setSigmoid()  # Convert output to 0-1
ffm_model.setSign() # Convert output to 0 or 1

# Start to predict
# The output result will be stored in output.txt
ffm_model.predict("./model.out", "./output.txt")

## Using xLearn's scikit-learn-style API

In [5]:
import numpy as np
import xlearn as xl

# Hyper-parameters passed to FFMModel below:
#  0. task: binary classification
#  1. learning rate: 0.2
#  2. epoch number: 10 (the original note says early stopping is automatic — TODO confirm)
#  3. regularisation lambda: 0.002
#  4. evaluation metric: accuracy
# No `opt` argument is passed, so the optimizer is whatever xLearn uses by default.
ffm_model = xl.FFMModel(task='binary', 
                        lr=0.2, 
                        epoch=10, 
                        reg_lambda=0.002,
                        metric='acc')
# Start to train
# Directly use string to specify data source
# (file paths are given for both the training data and the evaluation set)
ffm_model.fit('train_ffm.txt', 
              eval_set='test_ffm.txt')

# print model weights
print(ffm_model.weights)

# Generate predictions
y_pred = ffm_model.predict('test_ffm.txt')

(array([-1.23794e+00, -7.47523e-01,  1.04232e-05, ..., -4.39428e-09,
       -4.39428e-09, -4.39428e-09]), array([[1.12387e-06, 4.25162e-03, 3.00676e-02, ..., 1.00801e-02,
        1.37930e-02, 6.82052e-03],
       [2.06583e-02, 1.26570e-03, 5.31453e-03, ..., 2.10824e-02,
        1.32065e-02, 8.49536e-03],
       [1.31720e-02, 1.10965e-02, 7.27028e-03, ..., 2.10455e-02,
        2.92958e-03, 2.06397e-02],
       ...,
       [2.04417e-02, 3.52581e-02, 2.57519e-02, ..., 1.42209e-02,
        3.33449e-02, 3.81100e-02],
       [4.28868e-02, 1.81048e-02, 3.72769e-02, ..., 3.32392e-02,
        3.21346e-02, 3.84002e-02],
       [6.91316e-03, 3.84712e-02, 3.76509e-02, ..., 1.13450e-02,
        4.81112e-03, 3.05756e-02]]))
