In [1]:
import numpy as np 
import pandas as pd 
from sklearn.datasets import make_classification  # to generate fake data
import subprocess  # to execute shell command
import os

In [2]:
# Dependency:
# Make sure you have compiled the `ffm-train` and `ffm-predict` executables and placed them in ../bin/
# Source code: https://github.com/guestwalk/libffm
# Clone the repository to your local machine, then follow its instructions to build the executables.

In [3]:
class FFMFormatPandas:
    """Helper to fit and transform a pandas DataFrame into FFM-required format data.

    Each row becomes one text line ``<label> <field>:<feature>:<value> ...``.
    Every non-label column is a *field*.  Object-like (categorical) columns
    emit one binary feature per distinct value; integer, unsigned, boolean and
    float columns emit a single numerical feature.  Missing values and
    categorical values that were not seen during ``fit`` are omitted from the
    line.

    Adapted from https://www.kaggle.com/mpearmain/pandas2libffm/

    Parameters
    ----------
    decimal: int
        Number of decimal places to keep while transforming `float` values.

    Attributes
    ----------
    field_index_: dict
        Dictionary that maps field names to distinct integer indices.

    feature_index_: dict
        Dictionary that maps feature names to distinct integer indices.
        Categorical values are stored as ``"<column>_<value>"``; every column
        (the label column included) also owns an entry under its own name.

    y: string
        Name of the label column in the DataFrame.

    Examples
    --------
    >>> df = pd.read_csv("train.csv")
    >>> y_col = "price"
    >>> ffm_formatter = FFMFormatPandas()
    >>> ffm_data = ffm_formatter.fit_transform(df, y=y_col)
    """

    # dtype kinds that are written as plain integers
    _INTEGER_KINDS = ('i', 'u', 'b')

    def __init__(self, decimal=4):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None
        self.decimal = decimal

    @staticmethod
    def _is_categorical(dtype):
        """Return True for object-like dtypes (object, category, string)."""
        return getattr(dtype, 'kind', 'O') == 'O'

    def fit(self, df, y=None):
        """Learn (or extend) the field and feature indices from ``df``.

        Calling ``fit`` again on the same instance keeps every index that was
        already assigned and only appends indices for new fields/features.

        Parameters
        ----------
        df: pandas.DataFrame
            Data containing the feature columns and, optionally, the label.
        y: string, optional
            Name of the label column.

        Returns
        -------
        self
        """
        self.y = y
        # Index.difference sorts the remaining labels, so field ids follow the
        # sorted column names rather than the DataFrame's column order.
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = dict()
        for col in df_ffm:
            if col not in self.field_index_:
                self.field_index_[col] = len(self.field_index_)

        if self.feature_index_ is None:
            self.feature_index_ = dict()
        # Continue numbering after the highest index in use; starting *at* the
        # maximum would hand an already-taken index to the next new feature.
        last_idx = max(self.feature_index_.values(), default=-1) + 1

        for col, dtype in df.dtypes.to_dict().items():
            if self._is_categorical(dtype):
                for val in df[col].unique():
                    if pd.isnull(val):
                        continue
                    name = '{}_{}'.format(col, val)
                    if name not in self.feature_index_:
                        self.feature_index_[name] = last_idx
                        last_idx += 1
            # Every column reserves an index under its own name (numerical
            # columns use it as their feature id).  Only assign it once so a
            # refit never renumbers an existing feature.
            if col not in self.feature_index_:
                self.feature_index_[col] = last_idx
                last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its FFM-formatted rows (see ``transform``)."""
        self.fit(df, y)
        return self.transform(df)

    def _format_label(self, row, t):
        """Return the label token for ``row``; ``'0'`` when no label is available."""
        if self.y is None or self.y not in row.index:
            # e.g. prediction data that carries no label column
            return str(0)
        label = row.loc[self.y]
        label_type = t.get(self.y)
        if (label_type is not None and not pd.isnull(label)
                and getattr(label_type, 'kind', None) in self._INTEGER_KINDS):
            # iterrows() may upcast an integer label to float in all-numeric frames
            label = int(label)
        return str(label)

    def transform_row_(self, row, t):
        """Format one row as an FFM line.

        Parameters
        ----------
        row: pandas.Series
            One row of the DataFrame, indexed by column name.
        t: dict
            Maps column name to the column's dtype.

        Returns
        -------
        str
            ``"<label> <field>:<feature>:<value> ..."``

        Raises
        ------
        KeyError
            If a column was not present when the formatter was fitted.
        TypeError
            If a column has a dtype that cannot be encoded (e.g. datetime).
        """
        ffm = [self._format_label(row, t)]

        for col, val in row.items():
            if col == self.y:
                continue
            col_type = t[col]
            field_id = self.field_index_[col]
            if pd.isnull(val):
                # fit() never indexes missing values; a missing feature is
                # simply left out of the line.
                continue
            kind = getattr(col_type, 'kind', 'O')
            if kind == 'O':
                feat_id = self.feature_index_.get('{}_{}'.format(col, val))
                if feat_id is None:
                    # category not seen during fit: no feature to emit
                    continue
                val = 1
            elif kind in self._INTEGER_KINDS:
                feat_id = self.feature_index_[col]
                # undo any float upcast from iterrows(); booleans become 0/1
                val = int(val)
            elif kind == 'f':
                feat_id = self.feature_index_[col]
                val = round(val, self.decimal)
            else:
                raise TypeError(
                    "Column {!r} has unsupported dtype {}".format(col, col_type))
            ffm.append('{}:{}:{}'.format(field_id, feat_id, val))
        return ' '.join(ffm)

    def transform(self, df):
        """Return a Series of FFM lines, one per row of ``df``.

        The result keeps ``df``'s index, so rows that share an index label are
        all preserved.

        Raises
        ------
        ValueError
            If the formatter has not been fitted yet.
        """
        if self.field_index_ is None or self.feature_index_ is None:
            raise ValueError("FFMFormatPandas must be fitted before transform() is called")
        t = df.dtypes.to_dict()
        lines = [self.transform_row_(row, t) for _, row in df.iterrows()]
        return pd.Series(lines, index=df.index, dtype=object)

In [4]:
# Generate a synthetic binary-classification problem to act as demo data.
features, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=2,
    n_redundant=2,
    n_classes=2,
    random_state=42,
)

# Note that in actual use, rescaling numerical features in advance will be better
train = pd.DataFrame(features, columns=['int1', 'int2', 'int3', 's1', 's2'])
train['int3'] = train['int3'].map(int)  # convert to int
for cat_col in ('s1', 's2'):
    # collapse the continuous values into a few rounded log-levels and
    # stringify them, giving object columns that act as fake categoricals
    train[cat_col] = np.log((train[cat_col] + 1).abs()).round().map(str)
train['clicked'] = y

# Learn the field/feature indices and render every row as an FFM text line.
ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(train, y='clicked')

In [5]:
# Persist the FFM-formatted rows as a plain text file, one row per line.
out_folder, out_file = "../data/ffm/", "sklearn-example.train.dffm"
os.makedirs(out_folder, exist_ok=True)  # create the target directory when missing
out_path = os.path.join(out_folder, out_file)
# No header and no index column: the file holds only the formatted lines.
ffm_train_data.to_csv(out_path, header=False, index=False)

In [6]:
# Train an FFM model by running the external `ffm-train` executable on the
# training file at `out_path`; the fitted model is written to `model_path`.
model_folder = "../model/ffm/"
model_file = "sklearn-example.ffm"
model_path = os.path.join(model_folder, model_file)
os.makedirs(model_folder, exist_ok=True)

# ffm-train options used here: -t iterations, -k latent factors, -s threads
args = ["../bin/ffm-train", "-t", "20", "-k", "3", "-s", "4", out_path, model_path]
# Capture stderr as well: without piping it, `err` is always None and any
# error message from the trainer never reaches the notebook output.
p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
(output, err) = p.communicate()  # drains both pipes and waits for the process to exit
p_status = p.returncode

if output:
    print(output.decode(errors="replace"))
if err:
    print(err.decode(errors="replace"))

# Fail loudly instead of silently continuing without a trained model.
if p_status != 0:
    raise subprocess.CalledProcessError(p_status, args, output=output, stderr=err)

First check if the text file has already been converted to binary format (0.0 seconds)
Binary file NOT found. Convert text file to binary file (0.0 seconds)
iter   tr_logloss      tr_time
   1      0.39803          0.0
   2      0.30753          0.0
   3      0.28940          0.0
   4      0.28243          0.0
   5      0.27929          0.0
   6      0.27652          0.0
   7      0.27364          0.0
   8      0.27196          0.0
   9      0.27009          0.0
  10      0.26772          0.0
  11      0.26713          0.0
  12      0.26600          0.0
  13      0.26394          0.0
  14      0.26363          0.0
  15      0.26236          0.0
  16      0.26142          0.0
  17      0.26022          0.0
  18      0.26012          0.0
  19      0.25857          0.0
  20      0.25843          0.0

