In [1]:
import pandas as pd
import numpy as np
import xlearn as xl
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Read data
# User data
header_user = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
data_user = pd.read_csv('data/u.user', sep='|', names=header_user)
data_user = data_user.drop(['zip_code','age'], axis=1)

# Item data
header_item = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
        'Thriller', 'War', 'Western']
data_item = pd.read_csv('data/u.item', sep='|', names=header_item, encoding = "ISO-8859-1")
data_item = data_item.drop(['release_date', 'title', 'video_release_date', 'IMDb_URL'], axis=1)

# Training data
header_data = ['user_id', 'item_id', 'rating', 'timestamp']
data_training = pd.read_csv('data/ub.base', sep='\t', names=header_data)
# Test data
data_test = pd.read_csv('data/ub.test', sep='\t', names=header_data)

In [3]:
data_item.head()

Unnamed: 0,item_id,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [4]:
data_user.head()

Unnamed: 0,user_id,gender,occupation
0,1,M,technician
1,2,F,other
2,3,M,writer
3,4,M,technician
4,5,F,other


In [5]:
# One hot encoding
data_user_onehot = pd.get_dummies(data_user)
data_item_onehot = data_item

In [6]:
data_user_onehot.head()

Unnamed: 0,user_id,gender_F,gender_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [7]:
data_item_onehot.head()

Unnamed: 0,item_id,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [8]:
# Data merge and label separation
data_training_merge = data_training.merge(data_user_onehot, left_on="user_id", right_on="user_id")
data_training_merge = data_training_merge.merge(data_item_onehot, left_on="item_id", right_on="item_id")
data_training_merge["rating"] = np.where(data_training_merge["rating"]==5, 1, 0)
X_train = data_training_merge.drop(["timestamp", "user_id", "item_id"], axis=1)
X_train.to_csv("data/data_fm_training.csv", index=None, header=None)

data_test_merge = data_test.merge(data_user_onehot, left_on="user_id", right_on="user_id")
data_test_merge = data_test_merge.merge(data_item_onehot, left_on="item_id", right_on="item_id")
data_test_merge["rating"] = np.where(data_test_merge["rating"]==5, 1, 0)
X_test = data_test_merge.drop(["timestamp", "user_id", "item_id"], axis=1)
X_test.to_csv("data/data_fm_test.csv",index=None, header=None)

# Factorization Machine

In [36]:
fm_model = xl.create_fm()  # Use factorization machine

fm_model.setTrain("data/data_fm_training.csv")  # Training data
fm_model.setValidate("data/data_fm_test.csv")  # Validation data

# param:
#  0. Binary classification task
#  1. learning rate: 0.2
#  2. lambda: 0.002
#  3. metric: accuracy
# param = {'task':'binary', 'lr':0.2, 'lambda':0.002, 'metric': 'acc'}
# param = {'task':'binary', 'lr':0.2, 'lambda':0.002, 'metric': 'prec'}
param = {'task':'binary', 'lr':0.8, 'lambda':0, 'metric': 'f1', 'k':10, 'opt':'sgd', 'epoch':200, 'stop_window':10}
# param = {'task':'binary', 'lr':0.2, 'lambda':0.002, 'metric': 'auc'}

# Start to train
# The trained model will be stored in model.out
fm_model.fit(param, 'model/fm_model.out')

# Prediction task
fm_model.setTest("data/data_fm_test.csv")  # Test data
# fm_model.setSign()     # Convert prediction to 0 and 1
fm_model.setSigmoid() 

# Start to predict
# The output result will be stored in output.txt
fm_model.predict("model/fm_model.out", "model/fm_output.txt")

# Field aware Factorization Machine

In [22]:
class FFMFormatPandas:
    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is not None:
            last_idx = max(list(self.feature_index_.values()))

        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0

        for col in df.columns:
            vals = df[col].unique()
            for val in vals:
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        ffm = []
        if self.y != None:
            ffm.append(str(row.loc[row.index == self.y][0]))
        if self.y is None:
            ffm.append(str(0))

        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind ==  'O':
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})

In [23]:
########################### Libffm format conversion ############################

ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(X_train, y='rating')
ffm_test_data = ffm_train.transform(X_test)

In [24]:
ffm_train_data

0        1 41:74:0 0:77:0 1:80:0 2:83:1 3:86:1 4:89:1 5...
1        0 41:74:0 0:77:0 1:80:0 2:83:1 3:86:1 4:89:1 5...
2        0 41:74:0 0:77:0 1:80:0 2:83:1 3:86:1 4:89:1 5...
3        0 41:74:0 0:77:0 1:80:0 2:83:1 3:86:1 4:89:1 5...
4        0 41:74:0 0:77:0 1:80:0 2:83:1 3:86:1 4:89:1 5...
                               ...                        
90565    0 41:74:0 0:77:0 1:80:0 2:83:0 3:86:0 4:89:0 5...
90566    0 41:74:0 0:77:0 1:80:0 2:83:0 3:86:0 4:89:0 5...
90567    0 41:74:0 0:77:0 1:80:0 2:83:0 3:86:0 4:89:0 5...
90568    0 41:74:0 0:77:0 1:80:0 2:83:0 3:86:0 4:89:1 5...
90569    0 41:74:0 0:77:0 1:80:0 2:83:0 3:86:0 4:89:0 5...
Length: 90570, dtype: object

In [25]:
ffm_test_data

0       0 41:74:0 0:77:1 1:80:0 2:83:0 3:86:0 4:89:1 5...
1       0 41:74:0 0:77:1 1:80:0 2:83:0 3:86:0 4:89:1 5...
2       0 41:74:0 0:77:1 1:80:0 2:83:0 3:86:0 4:89:1 5...
3       1 41:74:0 0:77:1 1:80:0 2:83:0 3:86:0 4:89:1 5...
4       0 41:74:0 0:77:1 1:80:0 2:83:0 3:86:0 4:89:1 5...
                              ...                        
9425    0 41:74:0 0:77:0 1:80:0 2:83:0 3:86:0 4:89:0 5...
9426    0 41:74:0 0:77:0 1:80:0 2:83:0 3:86:0 4:89:0 5...
9427    0 41:74:0 0:77:0 1:80:0 2:83:0 3:86:0 4:89:0 5...
9428    0 41:74:0 0:77:0 1:80:0 2:83:0 3:86:0 4:89:1 5...
9429    0 41:74:0 0:77:0 1:80:0 2:83:0 3:86:0 4:89:0 5...
Length: 9430, dtype: object

In [26]:
ffm_train_data.to_csv("data/data_ffm_training.txt", index=False, header=None, sep=',')
ffm_test_data.to_csv("data/data_ffm_test.txt", index=False, header=None, sep=',')

In [28]:
# Training task
ffm_model = xl.create_ffm() # Use field-aware factorization machine
ffm_model.setTrain("data/data_ffm_training.txt")  # Training data
ffm_model.setValidate("data/data_ffm_test.txt")  # Validation data

# param:
#  0. Binary classification task
#  1. learning rate: 0.2
#  2. lambda: 0.002
#  3. metric: accuracy
# param = {'task':'binary', 'lr':0.2, 'lambda':0.002, 'metric': 'acc'}
# param = {'task':'binary', 'lr':0.2, 'lambda':0.002, 'metric': 'prec'}
param = {'task':'binary', 'lr':0.2, 'lambda':0.02, 'metric': 'f1', 'k':8, 'opt':'sgd', 'epoch':100, 'stop_window':10}
# param = {'task':'binary', 'lr':0.2, 'lambda':0.002, 'metric': 'auc'}

# Start to train
# The trained model will be stored in model.out
ffm_model.fit(param, 'model/ffm_model.out')

# Prediction task
ffm_model.setTest("data/data_ffm_test.txt")  # Test data
# fm_model.setSign()     # Convert prediction to 0 and 1
ffm_model.setSigmoid() 

# Start to predict
# The output result will be stored in output.txt
ffm_model.predict("model/ffm_model.out", "model/ffm_output.txt")

KeyboardInterrupt: 