<h3> Importing the dependencies </h3>

In [1]:
import pandas as pd
import xlearn as xl
train = pd.read_csv('SnackDataset.csv', encoding= 'unicode_escape')
import warnings
warnings.filterwarnings('ignore')

<h3> Creating dataframe for our model training </h3>

In [2]:
cols = ['Category','Price','Month','Purchase Made']
train_sub = train[cols]
dict_ls = {'Y':1, 'N':0}
train_sub['Purchase Made'].replace(dict_ls, inplace = True)

<h3> Splitting our model for training & testing </h3> 

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(train_sub, test_size = 0.3, random_state = 5)

In [5]:
numerics =['Price','Purchase Made']
categories = ['Category', 'Purchase Made', 'Month']
features = ['Price', 'Category', 'Month' ]

<h3> Converting our dataframe to libffm format for our model </h3>

In [6]:
def convert_to_ffm(df,type,numerics,categories,features):
    currentcode = len(numerics)
    catdict = {}
    catcodes = {}
    
    # Flagging categorical and numerical fields
    for x in numerics:
         catdict[x] = 0
    for x in categories:
         catdict[x] = 1
    
    nrows = df.shape[0]
    ncolumns = len(features)
    with open(str(type) + "_ffm.txt", "w") as text_file:
    
    # Looping over rows to convert each row to libffm format
        for n, r in enumerate(range(nrows)):
            datastring = ""
            datarow = df.iloc[r].to_dict()
            datastring += str(int(datarow['Purchase Made'])) # Set Target Variable here
             
            # For numerical fields, we are creating a dummy field here
            for i, x in enumerate(catdict.keys()):
                if(catdict[x]==0):
                    datastring = datastring + " "+str(i)+":"+ str(i)+":"+ str(datarow[x])
                else:
            
            # For a new field appearing in a training example
                    if(x not in catcodes):
                        catcodes[x] = {}
                        currentcode +=1
                        catcodes[x][datarow[x]] = currentcode #encoding the feature
             
            # For already encoded fields
                    elif(datarow[x] not in catcodes[x]):
                         currentcode +=1
                         catcodes[x][datarow[x]] = currentcode #encoding the feature
                     
                    code = catcodes[x][datarow[x]]
                    datastring = datastring + " "+str(i)+":"+ str(int(code))+":1"

            datastring += '\n'
            text_file.write(datastring)

<h3> Converting train dataset to libffm format </h3>

In [8]:
convert_to_ffm(X_train, 'train', numerics, categories, features )

<h3> Model Prediction using xlearn </h3>

In [9]:
ffm_model = xl.create_ffm()

ffm_model.setTrain("train_ffm.txt")

param = {'task':'binary', 
         'lr':0.2,
         'lambda':0.002, 
         'metric':'acc'}

# Start to train
# The trained model will be stored in model.out
ffm_model.fit(param, './model.out')

<h3> Converting test dataset to libffm format </h3>

In [12]:
convert_to_ffm(X_train, 'test', numerics, categories, features )

In [13]:
# Prediction task
ffm_model.setTest("test_ffm.txt") # Test data
ffm_model.setSign() # Convert output to 0-1

# Start to predict
# The output result will be stored in output.txt
ffm_model.predict("./model.out", "./output.txt")

<h3> Model Evaluation - AUC Score </h3>

In [14]:
import numpy as np
from sklearn.metrics import roc_auc_score

In [15]:
with open("./output.txt") as f:
    predictions = f.readlines()

with open("test_ffm.txt") as f:
    truths = f.readlines()

truths = np.array([float(truth.split(' ')[0]) for truth in truths])
predictions = np.array([float(prediction.strip('')) for prediction in predictions])

auc_score = roc_auc_score(truths, predictions)

In [16]:
auc_score

0.5