# LightFM model for influence marketing

Description

### Model theoretical explanation
This model is based in ...

### 1. Import Libraries

In [766]:
# Install all the libraries in requirements.txt
import sys
import os

import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scrapbook as sb
import requests
import io

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation

# Import LightFM's evaluation metrics
from lightfm.evaluation import precision_at_k as lightfm_prec_at_k
from lightfm.evaluation import recall_at_k as lightfm_recall_at_k

# Import repo's evaluation metrics
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.models.lightfm.lightfm_utils import (
    track_model_metrics, prepare_test_df, prepare_all_predictions,
    compare_metric, similar_users, similar_items)

print("System version: {}".format(sys.version))
print("LightFM version: {}".format(lightfm.__version__))

System version: 3.8.7 (tags/v3.8.7:6503f05, Dec 21 2020, 17:59:51) [MSC v.1928 64 bit (AMD64)]
LightFM version: 1.16


### 2. Defining Variables

In [767]:
# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 20
# no of epochs to fit model
NO_EPOCHS = 20
# no of threads to fit model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6

# seed for pseudonumber generations
SEED = 42

### 3. Retrive Data

In [768]:
df_co = pd.read_csv('df_co.csv')
df_co

Unnamed: 0,CompanyID,Name_co,Category_co,Hashtags_co,Country_co,Followers_co
0,0,The Little Shop,Cars,#instadaily#food#followback#instatravel,Brazil,195608
1,1,The Cozy Kitchen,Sports with a ball,#instalike#fashion#holidayseason#skiing#loveit,Egypt,415795
2,2,The Roost,Art,#blessings#foodlife#endurance#fitnessmotivatio...,Turkey,116164
3,3,The Wooden Spoon,Computers,#foodislifee#familytime,Iran,524384
4,4,Sunflower Fields,Management,#health#trendy,Germany,307127
...,...,...,...,...,...,...
95,95,Copper Kettle,Undefined,#baseball#friendshipgoals#skiing,Thailand,354362
96,96,The Crusty Baguette,Food,#instamood#education#beautifuldestinations,China,74286
97,97,The Plaid Pail,Luxury,#goodmorning#cricket#transportation#media#makeup,Nigeria,108265
98,98,The Sugar House,Business,#muscle#lifestyle#instatravel#instadaily,Germany,82077


In [769]:
df_influ = pd.read_csv('df_inf.csv')
df_influ

Unnamed: 0,AccountID,Account,Link,Followers,Audience Country,Authentic engagement,Engagement avg,Category1,Hashtags,Cost Story,Cost Post
0,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,#football#entrepreneur#foodstylist#love#travel...,468000.0,1092000.0
1,2,kyliejenner,https://www.instagram.com/kyliejenner/,308800000.0,United States,6200000.0,10100000.0,Fashion,#blessed#fashionstyle#foodexperience#quotes#fo...,372000.0,868000.0
2,3,leomessi,https://www.instagram.com/leomessi/,306300000.0,Argentina,4800000.0,6500000.0,Sports with a ball,#marathon#foodlover#fashionblogger#motivation#...,288000.0,672000.0
3,4,kendalljenner,https://www.instagram.com/kendalljenner/,217800000.0,United States,3400000.0,5400000.0,Modeling,#food#fitlife#fitnessmodel#vacation#familytime...,204000.0,476000.0
4,5,selenagomez,https://www.instagram.com/selenagomez/,295800000.0,United States,2700000.0,3600000.0,Music,#blackandwhite#fashionblogger#fashion#holidays...,162000.0,378000.0
...,...,...,...,...,...,...,...,...,...,...,...
995,996,senoritasaeva,https://www.instagram.com/senoritasaeva/,7700000.0,Russia,246600.0,318200.0,Lifestyle,#sunset#likeforlike#instagood,14796.0,34524.0
996,997,manuelneuer,https://www.instagram.com/manuelneuer/,11500000.0,Germany,146500.0,210200.0,Sports with a ball,#summer2022#cricket#sundayfunday#beautifulday#...,8790.0,20510.0
997,998,sahilkhan,https://www.instagram.com/sahilkhan/,10100000.0,India,176500.0,239800.0,Fitness,#fitlife#tbt#fashion#fashionable#smilemore#str...,10590.0,24710.0
998,999,mohanshakti,https://www.instagram.com/mohanshakti/,13700000.0,India,146400.0,175500.0,Art,#foodlife#colorful#fit#lifeisgood#telecom#fami...,8784.0,20496.0


### 4. Prepare Data

In [770]:
df2_co = pd.concat([df_co]*1000, ignore_index=True)

In [771]:
df2_influ = pd.DataFrame(np.repeat(df_influ.values, 100, axis=0), columns=["AccountID", "Account", "Link", "Followers", "Audience Country", "Authentic engagement", "Engagement avg", "Category1", "Hashtags", "Cost Story", "Cost Post"])


In [772]:
df = pd.concat([df2_influ, df2_co], axis=1)

In [773]:
def Convert(string):
    li = list(string.split("#"))
    return li

In [774]:
df['Hashtags'] = df['Hashtags'].astype(str)

In [775]:
h = []
h_co = []

for i in range(len(df['Hashtags'])):
    h.append(Convert(df['Hashtags'][i]))
    h_co.append(Convert(df['Hashtags_co'][i]))

df['Hashtags'] = h
df['Hashtags_co'] = h_co

In [776]:
for i in range(len(df['Hashtags'])):
    df['Hashtags'][i].pop(0)
    df['Hashtags_co'][i].pop(0)

In [777]:
df

Unnamed: 0,AccountID,Account,Link,Followers,Audience Country,Authentic engagement,Engagement avg,Category1,Hashtags,Cost Story,Cost Post,CompanyID,Name_co,Category_co,Hashtags_co,Country_co,Followers_co
0,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,0,The Little Shop,Cars,"[instadaily, food, followback, instatravel]",Brazil,195608
1,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,1,The Cozy Kitchen,Sports with a ball,"[instalike, fashion, holidayseason, skiing, lo...",Egypt,415795
2,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,2,The Roost,Art,"[blessings, foodlife, endurance, fitnessmotiva...",Turkey,116164
3,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,3,The Wooden Spoon,Computers,"[foodislifee, familytime]",Iran,524384
4,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,4,Sunflower Fields,Management,"[health, trendy]",Germany,307127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000.0,Mexico,305600.0,391900.0,Lifestyle,[photo],18336.0,42784.0,95,Copper Kettle,Undefined,"[baseball, friendshipgoals, skiing]",Thailand,354362
99996,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000.0,Mexico,305600.0,391900.0,Lifestyle,[photo],18336.0,42784.0,96,The Crusty Baguette,Food,"[instamood, education, beautifuldestinations]",China,74286
99997,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000.0,Mexico,305600.0,391900.0,Lifestyle,[photo],18336.0,42784.0,97,The Plaid Pail,Luxury,"[goodmorning, cricket, transportation, media, ...",Nigeria,108265
99998,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000.0,Mexico,305600.0,391900.0,Lifestyle,[photo],18336.0,42784.0,98,The Sugar House,Business,"[muscle, lifestyle, instatravel, instadaily]",Germany,82077


In [778]:
num_coincidente = df.apply(lambda row: len(set(row['Hashtags']).intersection(set(row['Hashtags_co']))), axis=1)
points_cat = np.where((df['Category1'] == df['Category_co']), 10, 0)
points_country = np.where((df['Audience Country'] == df['Country_co']), 5, 0)
points_eng = df['Authentic engagement']/df['Followers']
df['num_coincidentes'] = num_coincidente
df['points_eng'] = points_eng
df['points_cat'] = points_cat
df['points_country'] = points_country
df['Puntos'] = df['points_cat'] + df['points_country'] + df["num_coincidentes"] + df['points_eng']

df = df.drop(['points_cat', 'points_country', 'num_coincidentes', 'Engagement avg', 'points_eng'], axis=1)


In [779]:
df.dtypes

AccountID               object
Account                 object
Link                    object
Followers               object
Audience Country        object
Authentic engagement    object
Category1               object
Hashtags                object
Cost Story              object
Cost Post               object
CompanyID                int64
Name_co                 object
Category_co             object
Hashtags_co             object
Country_co              object
Followers_co             int64
Puntos                  object
dtype: object

In [780]:
df['Followers'] = df['Followers'].astype(int)
df['Authentic engagement'] = df['Authentic engagement'].astype(float)
df['Cost Story'] = df['Cost Story'].astype(float)
df['Followers_co'] = df['Followers_co'].astype(int)
df['Puntos'] = df['Puntos'].astype(float)
df.dtypes

AccountID                object
Account                  object
Link                     object
Followers                 int32
Audience Country         object
Authentic engagement    float64
Category1                object
Hashtags                 object
Cost Story              float64
Cost Post                object
CompanyID                 int64
Name_co                  object
Category_co              object
Hashtags_co              object
Country_co               object
Followers_co              int32
Puntos                  float64
dtype: object

In [781]:
# Change the name of columns to fit the model.
df = df.rename(columns={"AccountID": "itemID", "CompanyID": "userID", "Puntos": "rating"})

In [782]:
df['Audience Country'] = df['Audience Country'].fillna('Unknown')

In [783]:
df

Unnamed: 0,itemID,Account,Link,Followers,Audience Country,Authentic engagement,Category1,Hashtags,Cost Story,Cost Post,userID,Name_co,Category_co,Hashtags_co,Country_co,Followers_co,rating
0,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,0,The Little Shop,Cars,"[instadaily, food, followback, instatravel]",Brazil,195608,0.019495
1,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,1,The Cozy Kitchen,Sports with a ball,"[instalike, fashion, holidayseason, skiing, lo...",Egypt,415795,10.019495
2,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,2,The Roost,Art,"[blessings, foodlife, endurance, fitnessmotiva...",Turkey,116164,0.019495
3,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,3,The Wooden Spoon,Computers,"[foodislifee, familytime]",Iran,524384,0.019495
4,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,4,Sunflower Fields,Management,"[health, trendy]",Germany,307127,0.019495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,95,Copper Kettle,Undefined,"[baseball, friendshipgoals, skiing]",Thailand,354362,0.049290
99996,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,96,The Crusty Baguette,Food,"[instamood, education, beautifuldestinations]",China,74286,0.049290
99997,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,97,The Plaid Pail,Luxury,"[goodmorning, cricket, transportation, media, ...",Nigeria,108265,0.049290
99998,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,98,The Sugar House,Business,"[muscle, lifestyle, instatravel, instadaily]",Germany,82077,0.049290


In [784]:
remove_n = 3246
drop_indices = np.random.choice(df.index, remove_n, replace=False)
df = df.drop(drop_indices)

In [785]:
new_row = {'itemID':1001, 'Account':'jionast', 'Link':'https://www.instagram.com/jionast/', 'Followers':173571, 'Audience Country':'Spain', 'Authentic engagement':113456, 'Category1':'Lifestyle', 'Hashtags':['football', 'entrepreneur'], 'Cost Story':1111, 'Cost Post':1233, 'userID':101, 'Name_co':'Mcdonalds', 'Category_co':'Food', 'Hashtags_co':['football', 'food'], 'Country_co':'Spain', 'Followers_co':123445, 'rating':3}
df = df.append(new_row, ignore_index=True)

  df = df.append(new_row, ignore_index=True)


In [786]:
df

Unnamed: 0,itemID,Account,Link,Followers,Audience Country,Authentic engagement,Category1,Hashtags,Cost Story,Cost Post,userID,Name_co,Category_co,Hashtags_co,Country_co,Followers_co,rating
0,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,0,The Little Shop,Cars,"[instadaily, food, followback, instatravel]",Brazil,195608,0.019495
1,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,1,The Cozy Kitchen,Sports with a ball,"[instalike, fashion, holidayseason, skiing, lo...",Egypt,415795,10.019495
2,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,2,The Roost,Art,"[blessings, foodlife, endurance, fitnessmotiva...",Turkey,116164,0.019495
3,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,3,The Wooden Spoon,Computers,"[foodislifee, familytime]",Iran,524384,0.019495
4,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,4,Sunflower Fields,Management,"[health, trendy]",Germany,307127,0.019495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96750,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,96,The Crusty Baguette,Food,"[instamood, education, beautifuldestinations]",China,74286,0.049290
96751,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,97,The Plaid Pail,Luxury,"[goodmorning, cricket, transportation, media, ...",Nigeria,108265,0.049290
96752,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,98,The Sugar House,Business,"[muscle, lifestyle, instatravel, instadaily]",Germany,82077,0.049290
96753,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,99,Urban Bites,Shows,"[cricket, lifeisgood]",Tanzania,445319,0.049290


Before fitting the LightFM model, we need to create an instance of Dataset which holds the interaction matrix.

In [787]:
dataset = Dataset()

The fit method creates the user(companies)/item(influencers) mappings.

In [599]:
dataset.fit(users=df['userID'], 
            items=df['itemID'])

# quick check to determine the number of unique users and items in the data
num_users, num_topics = dataset.interactions_shape()
print(f'Num users: {num_users}, num_topics: {num_topics}.')

Num users: 100, num_topics: 1000.


Next is to build the interaction matrix. The build_interactions method returns 2 COO sparse matrices, namely the interactions and weights matrices.

In [600]:
(interactions, weights) = dataset.build_interactions(df.iloc[:, [10,0,16]].values)

We will use cross_validation.random_train_test_split method to split the interaction data and splits it into two disjoint training and test sets.

In [601]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED))

Double check the size of both the train and test sets.

In [602]:
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (100, 1000)
Shape of test interactions: (100, 1000)


### 5. Fit the LightFM model

The LightFM model will be using the weighted Approximate-Rank Pairwise (WARP) as the loss. It maximises the rank of positive examples by repeatedly sampling negative examples until a rank violation has been located. This approach is recommended when only positive interactions are present, as in our case.

In [603]:
model1 = LightFM(loss='warp', no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE,                 
                 random_state=np.random.RandomState(SEED))

The LightFM model can be fitted with the following code:

In [604]:
model1.fit(interactions=train_interactions,
          epochs=NO_EPOCHS);

### 6. Prepare model evaluation data

Before we can evaluate the fitted model and to get the data into a format which is compatible with the existing evaluation methods within this repo, the data needs to be treated slightly.

First the train/test indices need to be extracted from the lightfm.cross_validation method as follows:

In [605]:
uids, iids, interaction_data = cross_validation._shuffle(
    interactions.row, interactions.col, interactions.data, 
    random_state=np.random.RandomState(SEED))

cutoff = int((1.0 - TEST_PERCENTAGE) * len(uids))
test_idx = slice(cutoff, None)

Then the the mapping between internal and external representation of the user and item are extracted as follows:

In [606]:
uid_map, ufeature_map, iid_map, ifeature_map = dataset.mapping()

Once the train/test indices and mapping are ready, the test dataframe can be constructed as follows:

In [607]:
with Timer() as test_time:
    test_df = prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights)
print(f"Took {test_time.interval:.1f} seconds for prepare and predict test data.")  
time_reco1 = test_time.interval

Took 3.7 seconds for prepare and predict test data.


And samples of the test dataframe:

In [608]:
test_df.sample(5, random_state=SEED)

Unnamed: 0,userID,itemID,rating
6868,84,71,0.011572
24016,69,261,0.099099
9668,25,330,0.045171
13640,3,478,0.026713
14018,72,264,0.017619


In addition, the predictions of all unseen user-item pairs (e.g. removing those seen in the training data) can be prepared as follows:

In [609]:
with Timer() as test_time:
    all_predictions = prepare_all_predictions(df, uid_map, iid_map, 
                                              interactions=train_interactions,
                                              model=model1, 
                                              num_threads=NO_THREADS)
print(f"Took {test_time.interval:.1f} seconds for prepare and predict all data.")
time_reco2 = test_time.interval

Took 20.6 seconds for prepare and predict all data.


Samples of the all_predictions dataframe:

In [610]:
all_predictions.sample(5, random_state=SEED)

Unnamed: 0,userID,itemID,prediction
6868,27,47,-50.41325
24016,96,91,-49.483288
9668,38,29,-46.989388
13640,54,319,-45.82608
14018,55,774,-47.910198


The raw prediction values from the LightFM model are for ranking purposes only, they should not be used directly. The magnitude and sign of these values do not have any specific interpretation.

### 7. Model evaluation

Once the evaluation data are ready, they can be passed into to the repo's evaluation methods as follows. The performance of the model will be tracked using both Precision@K and Recall@K.

In addition, the results have also being compared with those computed from LightFM's own evaluation methods to ensure accuracy.

In [611]:
with Timer() as test_time:
    eval_precision = precision_at_k(rating_true=test_df, 
                                rating_pred=all_predictions, k=K)
    eval_recall = recall_at_k(test_df, all_predictions, k=K)
time_reco3 = test_time.interval

with Timer() as test_time:
    eval_precision_lfm = lightfm_prec_at_k(model1, test_interactions, 
                                           train_interactions, k=K).mean()
    eval_recall_lfm = lightfm_recall_at_k(model1, test_interactions, 
                                          train_interactions, k=K).mean()
time_lfm = test_time.interval
    
print(
    "------ Using Repo's evaluation methods ------",
    f"Precision@K:\t{eval_precision:.6f}",
    f"Recall@K:\t{eval_recall:.6f}",
    "\n------ Using LightFM evaluation methods ------",
    f"Precision@K:\t{eval_precision_lfm:.6f}",
    f"Recall@K:\t{eval_recall_lfm:.6f}", 
    sep='\n')

------ Using Repo's evaluation methods ------
Precision@K:	1.000000
Recall@K:	0.040104

------ Using LightFM evaluation methods ------
Precision@K:	1.000000
Recall@K:	0.040104


### Using also explicit feedbacks and additional item and user features.

As the LightFM was designed to incorporates both user and item metadata, the model can be extended to include additional features such as movie genres and user occupations.

### 8. Extract and prepare item features.

The influencers' hashtags will be used as the item metadata.

In [612]:
all_hashtags = []
for x in df['Hashtags']:
  all_hashtags.append(x)

all_hashtags = sorted(list(set(itertools.chain.from_iterable(all_hashtags))))
all_hashtags

['amazing',
 'art',
 'athlete',
 'baby',
 'baseball',
 'basketball',
 'beach',
 'beautiful',
 'beautifulday',
 'beautifuldestinations',
 'beauty',
 'beautyblogger',
 'beautyful',
 'bestoftheday',
 'blackandwhite',
 'blessed',
 'blessings',
 'blogger',
 'boxing',
 'cardio',
 'cars',
 'coffee',
 'colorful',
 'construction',
 'cool',
 'couplesgoals',
 'cricket',
 'crossfit',
 'cute',
 'cycling',
 'dog',
 'doglover',
 'doglovers',
 'dogsofinstagram',
 'dogstagram',
 'education',
 'endurance',
 'energy',
 'entertainment',
 'entrepreneur',
 'extremesports',
 'family',
 'familytime',
 'fashion',
 'fashionable',
 'fashionblogger',
 'fashionista',
 'fashionphotography',
 'fashionstyle',
 'fit',
 'fitfam',
 'fitlife',
 'fitness',
 'fitnessmodel',
 'fitnessmotivation',
 'fitspo',
 'flex',
 'follow',
 'follow4follow',
 'followback',
 'followme',
 'food',
 'foodaddict',
 'foodblogfeed',
 'foodbloggerlife',
 'foodbloggersofinstagram',
 'foodcoma',
 'foodexperience',
 'foodexplorer',
 'foodfeed',
 'f

In [613]:
all_categories = sorted(list(set(df['Category1'])))
all_categories

['Accessories',
 'Adult content',
 'Animals',
 'Art',
 'Beauty',
 'Business',
 'Cars',
 'Cinema',
 'Clothing',
 'Computers',
 'Extreme',
 'Family',
 'Fashion',
 'Finance',
 'Fitness',
 'Food',
 'Humor',
 'Kids',
 'Lifestyle',
 'Literature',
 'Luxury',
 'Machinery',
 'Management',
 'Modeling',
 'Music',
 'Nature  landscapes',
 'Photography',
 'Racing',
 'Science',
 'Shows',
 'Sports with a ball',
 'Undefined']

In [614]:
all_countries = sorted(list(set(df['Audience Country'])))
all_countries

['Algeria',
 'Argentina',
 'Brazil',
 'Chile',
 'China',
 'Colombia',
 'Egypt',
 'France',
 'Germany',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Italy',
 'Japan',
 'Kazakhstan',
 'Mexico',
 'Morocco',
 'Nigeria',
 'Philippines',
 'Poland',
 'Russia',
 'Saudi Arabia',
 'South Korea',
 'Spain',
 'Syria',
 'Thailand',
 'Turkey',
 'United Arab Emirates',
 'United Kingdom',
 'United States',
 'Unknown']

### 8. Extract and prepare user features.

In [615]:
all_hashtagsCo = []
for x in df['Hashtags_co']:
  all_hashtagsCo.append(x)

all_hashtagsCo = sorted(list(set(itertools.chain.from_iterable(all_hashtagsCo))))
all_hashtagsCo

['amazing',
 'athlete',
 'baby',
 'baseball',
 'basketball',
 'beach',
 'beautiful',
 'beautifulday',
 'beautifuldestinations',
 'beauty',
 'beautyblogger',
 'beautyful',
 'blackandwhite',
 'blessings',
 'blogger',
 'boxing',
 'cardio',
 'cars',
 'coffee',
 'colorful',
 'construction',
 'couplesgoals',
 'cricket',
 'crossfit',
 'cute',
 'cycling',
 'dog',
 'doglover',
 'dogsofinstagram',
 'dogstagram',
 'education',
 'endurance',
 'energy',
 'entertainment',
 'entrepreneur',
 'extremesports',
 'familytime',
 'fashion',
 'fashionable',
 'fashionblogger',
 'fashionphotography',
 'fashionstyle',
 'fitfam',
 'fitlife',
 'fitness',
 'fitnessmodel',
 'fitnessmotivation',
 'fitspo',
 'follow',
 'follow4follow',
 'followback',
 'followme',
 'food',
 'foodaddict',
 'foodblogfeed',
 'foodbloggerlife',
 'foodbloggersofinstagram',
 'foodcoma',
 'foodexperience',
 'foodexplorer',
 'foodgawker',
 'foodgrammer',
 'foodheaven',
 'foodie',
 'foodieforlife',
 'foodinspo',
 'foodisart',
 'foodislifee',
 

The companies' categories will be used as the user metadata.

In [616]:
all_categoriesCo = sorted(list(set(df['Category_co'])))
all_categoriesCo

['Accessories',
 'Adult content',
 'Art',
 'Beauty',
 'Business',
 'Cars',
 'Cinema',
 'Clothing',
 'Computers',
 'Family',
 'Fashion',
 'Finance',
 'Fitness',
 'Food',
 'Humor',
 'Kids',
 'Lifestyle',
 'Luxury',
 'Machinery',
 'Management',
 'Modeling',
 'Music',
 'Nature  landscapes',
 'Photography',
 'Racing',
 'Science',
 'Shows',
 'Sports with a ball',
 'Undefined']

In [617]:
all_countriesCo = sorted(list(set(df['Country_co'])))
all_countriesCo

['Bangladesh',
 'Brazil',
 'China',
 'Colombia',
 'DR Congo',
 'Egypt',
 'Ethiopia',
 'France',
 'Germany',
 'India',
 'Indonesia',
 'Iran',
 'Italy',
 'Japan',
 'Kenya',
 'Mexico',
 'Myanmar',
 'Nigeria',
 'Pakistan',
 'Philippines',
 'Russia',
 'South Africa',
 'South Korea',
 'Spain',
 'Tanzania',
 'Thailand',
 'Turkey',
 'United Kingdom',
 'United States',
 'Vietnam']

The data is required to be converted into a Dataset instance and then create a user/item id mapping with the fit method.

In [636]:
dataset2 = Dataset()
dataset2.fit(df['userID'], 
            df['itemID'], 
            item_features=all_hashtags,
            user_features=all_categoriesCo)

The hashtags are then converted into a item feature matrix using the build_item_features method as follows:

In [620]:
item_features = dataset2.build_item_features((x, y) for x,y in zip(df.itemID, df.Hashtags))

The user occupations are then converted into an user feature matrix using the build_user_features method as follows:

In [621]:
user_features = dataset2.build_user_features((x, [y]) for x,y in zip(df.userID, df['Category_co']))

Once the item and user features matrices have been completed, we build the interaction matrix and split the interactions into train and test sets as follows:

In [622]:
interactions2, weights2 = dataset2.build_interactions(df.iloc[:, [10,0,16]].values)

train_interactions2, test_interactions2 = cross_validation.random_train_test_split(
    interactions2, 
    test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED)
)

### 9. Fit the LightFM model with additional user and item features

In [623]:
model2 = LightFM(loss='warp', no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE, 
                 item_alpha=ITEM_ALPHA,
                 user_alpha=USER_ALPHA,
                 random_state=np.random.RandomState(SEED)
                )

In [624]:
model2.fit(interactions=train_interactions2,
           user_features=user_features,
           item_features=item_features,
           epochs=NO_EPOCHS
           )

<lightfm.lightfm.LightFM at 0x28d8fd9ff10>

### 10. Prepare model evaluation data

The evaluation data needs to be prepared in order to get them into a format consumable with this repo's evaluation methods. Firstly the train/test indices and id mappings are extracted using the new interations matrix as follows:

In [625]:
uids, iids, interaction_data = cross_validation._shuffle(
    interactions2.row, 
    interactions2.col, 
    interactions2.data, 
    random_state=np.random.RandomState(SEED)
)

uid_map, ufeature_map, iid_map, ifeature_map = dataset2.mapping()

The test dataframe is then constructed as follows:

In [626]:
with Timer() as test_time:
    test_df2 = prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights2)
print(f"Took {test_time.interval:.1f} seconds for prepare and predict test data.") 

Took 3.1 seconds for prepare and predict test data.


The predictions of all unseen user-item pairs can be prepared as follows:

In [628]:
with Timer() as test_time:
    all_predictions2 = prepare_all_predictions(df, uid_map, iid_map, 
                                              interactions=train_interactions2,
                                               user_features=user_features,
                                               item_features=item_features,
                                               model=model2,
                                               num_threads=NO_THREADS)

print(f"Took {test_time.interval:.1f} seconds for prepare and predict all data.")

Took 10.2 seconds for prepare and predict all data.


### 11. Model evaluation and comparison

The predictive performance of the new model can be computed and compared with the previous model (which used only the explicit rating) as follows:

In [629]:
eval_precision2 = precision_at_k(rating_true=test_df2, 
                                rating_pred=all_predictions2, k=K)
eval_recall2 = recall_at_k(test_df2, all_predictions2, k=K)

print(
    "------ Using only explicit ratings ------",
    f"Precision@K:\t{eval_precision:.6f}",
    f"Recall@K:\t{eval_recall:.6f}",
    "\n------ Using both implicit and explicit ratings ------",
    f"Precision@K:\t{eval_precision2:.6f}",
    f"Recall@K:\t{eval_recall2:.6f}",
    sep='\n')

------ Using only explicit ratings ------
Precision@K:	1.000000
Recall@K:	0.040104

------ Using both implicit and explicit ratings ------
Precision@K:	1.000000
Recall@K:	0.040104


### Similar users and items

As the LightFM package operates based on latent embeddings, these can be retrieved once the model has been fitted to assess user-user and/or item-item affinity.

#### User affinity

The user-user affinity can be retrieved with the get_user_representations method from the fitted model as follows:

In [630]:
_, user_embeddings = model2.get_user_representations(features=user_features)
user_embeddings

array([[-0.45531923, -0.43289188, -0.03967326, ...,  0.01572942,
        -0.5512873 ,  0.5807483 ],
       [-0.274342  ,  0.34890315, -0.45042866, ...,  0.2794578 ,
         0.65751046,  0.09403343],
       [ 0.94743156, -0.8897798 ,  0.23471047, ..., -0.5223647 ,
        -0.1830869 ,  0.41828644],
       ...,
       [-0.11428593,  0.2650502 ,  0.14150776, ...,  0.7554304 ,
         0.29907694,  1.2574228 ],
       [ 0.15544437, -0.24788855,  0.29809886, ...,  0.5129658 ,
        -0.38094768, -0.5596343 ],
       [-0.57772243, -0.54959005,  0.7289521 , ..., -0.535204  ,
        -0.49054462, -0.34571618]], dtype=float32)

In order to retrieve the top N similar users, we can use the similar_users from recommenders. For example, if we want to choose top 10 users most similar to the user 1:

In [631]:
similar_users(user_id=1, 
              user_features=user_features, 
              model=model2)

Unnamed: 0,userID,score
0,38,0.999999
1,35,0.999999
2,78,0.999999
3,97,0.131179
4,40,0.130995
5,16,0.130821
6,45,0.078236
7,41,0.078094
8,93,0.078038
9,18,0.07801


In [644]:
df[df['userID']==38].max()

itemID                                                               1000
Account                                                           zoesugg
Link                                   https://www.instagram.com/zoesugg/
Followers                                                       469600000
Audience Country                                                  Unknown
Authentic engagement                                           13400000.0
Category1                                                       Undefined
Hashtags                [yum, styleblogger, holidayvibes, trainhard, c...
Cost Story                                                       804000.0
Cost Post                                                       1876000.0
userID                                                                 38
Name_co                                                     Garden Grille
Category_co                                            Sports with a ball
Hashtags_co                           

#### Item affinity

The item-item affinity can be retrieved with the get_item_representations method using the fitted model.

In [639]:
similar_items(item_id=10, 
              item_features=item_features, 
              model=model2)

Unnamed: 0,itemID,score
0,498,0.765805
1,297,0.667022
2,537,0.656052
3,931,0.6519
4,52,0.636744
5,585,0.634449
6,334,0.601899
7,795,0.594444
8,339,0.591483
9,624,0.578858
