In [1]:
import pandas as pd
import numpy as np
import datetime

import xlearn as xl

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [4]:
h_trs = pd.read_csv("../input/historical_transactions.csv")

In [5]:
merchant = pd.read_csv("../input/merchants.csv")

In [6]:
# Drop every transaction that has a missing value in any column, then
# renumber the index so it is contiguous again.
h_trs = h_trs.dropna().reset_index(drop=True)

In [7]:
h_trs.purchase_date = pd.to_datetime(h_trs.purchase_date)

In [8]:
# Keep only the calendar date of each purchase. The vectorized `.dt.date`
# accessor yields the same `datetime.date` objects as calling `.date()` on
# every element, without a Python-level function call per row.
h_trs["purchase_date"] = h_trs["purchase_date"].dt.date

In [9]:
# Whole days between each purchase and 2017-01-01 (negative for earlier
# purchases). The column is converted back to datetime64 first so that the
# subtraction produces a timedelta64 Series and `.dt.days` is always available,
# whether `purchase_date` currently holds `date` objects or timestamps.
h_trs["elapsed"] = (
    pd.to_datetime(h_trs["purchase_date"]) - pd.Timestamp(2017, 1, 1)
).dt.days

In [10]:
h_trs.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,elapsed
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25,1.0,16,37,175
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15,1.0,16,16,195
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09,1.0,16,37,220
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02,1.0,16,34,244
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10,1.0,16,37,68


In [11]:
merchant.columns

Index(['merchant_id', 'merchant_group_id', 'merchant_category_id',
       'subsector_id', 'numerical_1', 'numerical_2', 'category_1',
       'most_recent_sales_range', 'most_recent_purchases_range',
       'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3',
       'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6',
       'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12',
       'category_4', 'city_id', 'state_id', 'category_2'],
      dtype='object')

In [12]:
# Attach merchant attributes to every transaction. The join key is the full
# set of columns that appear in both tables, so none of them is duplicated
# with _x/_y suffixes; transactions without a matching merchant row are kept
# (left join).
df = h_trs.merge(
    merchant,
    how="left",
    on=[
        "merchant_id",
        "merchant_category_id",
        "subsector_id",
        "category_1",
        "city_id",
        "state_id",
        "category_2",
    ],
)

In [13]:
# The left merge leaves NaN in the merchant columns of transactions that had
# no matching merchant row; drop those rows (and any row with a missing
# merchant attribute) and renumber the index.
df = df.dropna().reset_index(drop=True)

In [14]:
df.shape

(17019922, 30)

In [15]:
# Columns written to the FFM file with their raw numeric value.
numerics = [
    # from the transactions table
    "installments", "month_lag", "elapsed",
    # from the merchants table
    "numerical_1", "numerical_2",
    "avg_sales_lag3", "avg_purchases_lag3", "active_months_lag3",
    "avg_sales_lag6", "avg_purchases_lag6", "active_months_lag6",
    "avg_sales_lag12", "avg_purchases_lag12", "active_months_lag12",
]

# Columns whose distinct values are each mapped to their own feature id.
categories = [
    # from the transactions table (several are also merge keys)
    "authorized_flag", "card_id", "city_id", "category_1", "category_3",
    "merchant_category_id", "merchant_id", "category_2", "state_id",
    "subsector_id",
    # from the merchants table
    "merchant_group_id", "most_recent_sales_range",
    "most_recent_purchases_range", "category_4",
]

# Every model input: numeric columns first, categorical columns after.
features = [*numerics, *categories]

In [17]:
def convert_to_ffm(df, type, numerics, categories, features,
                   target="purchase_amount", out_dir="../input/"):
    """Write ``df`` to ``<out_dir>/<type>_ffm.txt`` in libffm text format.

    Every row becomes one line ``label field:feature:value ...`` where

    * the label is ``float(row[target])``;
    * fields are numbered in the order ``numerics`` followed by
      ``categories``;
    * a numeric column uses its field number as the feature id and the raw
      cell value as the value;
    * a categorical column gets a fresh integer feature id for every distinct
      value it takes (ids keep counting up across all categorical columns,
      starting after ``len(numerics)``) and always has the value ``1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target`` and every column named in ``numerics`` and
        ``categories``.
    type : str
        Prefix of the output file name, e.g. ``"train"``. The name shadows the
        builtin but is kept so existing calls keep working.
    numerics : list of str
        Columns written with their raw value.
    categories : list of str
        Columns that are one-hot encoded. A name listed in both ``numerics``
        and ``categories`` is treated as categorical.
    features : list of str
        Not used for the encoding; accepted for backward compatibility.
    target : str, default "purchase_amount"
        Column used as the label.
    out_dir : str, default "../input/"
        Directory the file is written to; it must already exist.

    Returns
    -------
    None

    Notes
    -----
    The category-to-id mapping is built on the fly and discarded when the
    function returns, so feature ids are only consistent within one call.
    A progress line is printed every 1,000,000 rows.
    """
    import os

    # Field layout: numerics first, then categories (dicts keep insertion
    # order; a later assignment only changes the flag, not the position).
    catdict = {}
    for x in numerics:
        catdict[x] = 0
    for x in categories:
        catdict[x] = 1
    columns = list(catdict)
    is_categorical = [catdict[x] == 1 for x in columns]

    currentcode = len(numerics)  # last feature id handed out so far
    catcodes = {}                # column name -> {category value -> feature id}

    out_path = os.path.join(out_dir, str(type) + "_ffm.txt")

    # Stream plain tuples of just the needed columns instead of building a
    # Series and a dict for every row. Values are read column by column, so
    # each one keeps its own column's dtype (an int column is written as "3").
    rows = df[[target] + columns].itertuples(index=False, name=None)

    with open(out_path, "w") as text_file:
        for n, row in enumerate(rows):
            parts = [str(float(row[0]))]

            for i, value in enumerate(row[1:]):
                if not is_categorical[i]:
                    # Numeric field: the field number doubles as feature id.
                    parts.append("{}:{}:{}".format(i, i, str(value)))
                    continue

                # Categorical field: look up the id of this value, assigning
                # the next free one the first time the value is seen.
                codes = catcodes.setdefault(columns[i], {})
                if value not in codes:
                    currentcode += 1
                    codes[value] = currentcode
                parts.append("{}:{}:1".format(i, int(codes[value])))

            text_file.write(" ".join(parts) + "\n")

            if n % 1000000 == 0:
                print("=== {} rows completed ===".format(n))

In [18]:
convert_to_ffm(df, "train", numerics, categories, features)

=== 0 rows completed ===
=== 1000000 rows completed ===
=== 2000000 rows completed ===
=== 3000000 rows completed ===
=== 4000000 rows completed ===
=== 5000000 rows completed ===
=== 6000000 rows completed ===
=== 7000000 rows completed ===
=== 8000000 rows completed ===
=== 9000000 rows completed ===
=== 10000000 rows completed ===
=== 11000000 rows completed ===
=== 12000000 rows completed ===
=== 13000000 rows completed ===
=== 14000000 rows completed ===
=== 15000000 rows completed ===
=== 16000000 rows completed ===
=== 17000000 rows completed ===


In [3]:
ffm_model = xl.create_ffm()

In [4]:
ffm_model.setTrain("../input/train_ffm.txt")

In [6]:
# Hyper-parameters for the regression FFM. Only a training file has been set
# on `ffm_model` in the cells shown, so no validation metric is monitored.
param = {
    'task':'reg', # 'binary' for classification, 'reg' for regression
    'k':2,           # Size of latent factor
    'lr':0.1,        # Learning rate for GD
    'lambda':0.0002, # L2 Regularization Parameter
    'epoch':25       # Maximum number of Epochs
}

# Train and write the fitted model to "sample_model.out".
ffm_model.fit(param, "sample_model.out")

In [7]:
# Print a triangle of "=" signs: one line per width from 1 to 10.
for width in range(1, 11):
    print("=" * width)

=
==
===
====
=====


In [10]:
# Display the model object (the dangling "." was a syntax error).
ffm_model

<xlearn.xlearn.XLearn at 0x10ece27b8>

In [None]:
# Binary-classification template on the criteo sample files. Note that this
# rebinds `ffm_model`, replacing the regression model created earlier.
ffm_model = xl.create_ffm()
ffm_model.setTrain("criteo.tr.r100.gbdt0.ffm")
ffm_model.setValidate("criteo.va.r100.gbdt0.ffm")

param = {'task':'binary', # 'binary' for classification, 'reg' for regression
         'k':2,           # Size of latent factor
         'lr':0.1,        # Learning rate for GD
         'lambda':0.0002, # L2 Regularization Parameter
         'metric':'auc',  # Metric for monitoring validation set performance
                          # (xLearn's key is lowercase `metric`)
         'epoch':25       # Maximum number of Epochs
        }

# Train, writing the fitted model to "model.out".
ffm_model.fit(param, "model.out")

# Score the validation file again as the test set; setSigmoid() is called
# first so that scores are squashed to (0, 1) - see the xLearn Python API docs.
ffm_model.setTest("criteo.va.r100.gbdt0.ffm")
ffm_model.setSigmoid()
ffm_model.predict("model.out", "output.txt")
