## Libraries Installation

In [None]:
%pip install pandas spacy networkx matplotlib scipy lightfm

## Libraries Import

In [3]:
import os
import random
import numpy as np
import pandas as pd
import lightfm
import warnings

from lightfm import LightFM, cross_validation
from lightfm.evaluation import precision_at_k, auc_score
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

warnings.simplefilter(action='ignore', category=FutureWarning)

### Read Data

In [7]:
df_playlist = pd.read_csv('./sample/post/view_data.csv', error_bad_lines=False, warn_bad_lines=False, skiprows=lambda i: i > 0 and random.random() > 0.50)
df_playlist = df_playlist.drop('time_stamp', axis=1)
df_playlist

Unnamed: 0,user_id,post_id
0,5eece14efc13ae660900003c,43094523
1,5eece14ffc13ae66090001d4,76472880
2,5eece14ffc13ae66090001bd,104702447
3,5eece14ffc13ae660900012c,387648862
4,5eece14ffc13ae660900018c,618411064
...,...,...
35959,5eece14ffc13ae66090000a8,40799413
35960,5eece14ffc13ae6609000192,469745748
35961,5eece14ffc13ae660900018c,615389604
35962,5eece14ffc13ae6609000190,619052165


In [8]:
df_playlist['user_id'].value_counts()
df_playlist['post_id'].value_counts()

408118198    16
512471515    16
440573675    15
401589099    15
517475247    15
             ..
56800053      1
840807769     1
730038714     1
994466500     1
653075852     1
Name: post_id, Length: 5980, dtype: int64

In [9]:
df_playlist['rating'] = 1
df_rating = df_playlist.groupby('post_id', as_index=False).sum()
post_id = df_rating['post_id'].tolist()
rating = df_rating['rating'].tolist()
rating_mapping = dict(zip(post_id, rating))
df_playlist['rating'] = df_playlist['post_id'].map(rating_mapping)
display(df_playlist)

Unnamed: 0,user_id,post_id,rating
0,5eece14efc13ae660900003c,43094523,10
1,5eece14ffc13ae66090001d4,76472880,7
2,5eece14ffc13ae66090001bd,104702447,9
3,5eece14ffc13ae660900012c,387648862,4
4,5eece14ffc13ae660900018c,618411064,6
...,...,...,...
35959,5eece14ffc13ae66090000a8,40799413,6
35960,5eece14ffc13ae6609000192,469745748,7
35961,5eece14ffc13ae660900018c,615389604,6
35962,5eece14ffc13ae6609000190,619052165,5


### Data Preprocessing

In [10]:
df_playlist = df_playlist.groupby('post_id').filter(lambda x: len(x) >= 10)
df_playlist

Unnamed: 0,user_id,post_id,rating
0,5eece14efc13ae660900003c,43094523,10
12,5eece14efc13ae660900001f,293468411,10
14,5eece14efc13ae6609000017,660867347,10
16,5eece14ffc13ae66090000ef,285046033,11
23,5eece14efc13ae6609000024,140572936,10
...,...,...,...
35929,5eece14efc13ae660900000d,762752770,10
35944,5eece14ffc13ae660900011c,92186453,10
35951,5eece14ffc13ae6609000118,135453406,10
35954,5eece14efc13ae660900004b,38831519,12


In [11]:
df_playlist = df_playlist[df_playlist.groupby('user_id').post_id.transform('nunique') >= 10]
df_playlist

Unnamed: 0,user_id,post_id,rating
0,5eece14efc13ae660900003c,43094523,10
12,5eece14efc13ae660900001f,293468411,10
14,5eece14efc13ae6609000017,660867347,10
16,5eece14ffc13ae66090000ef,285046033,11
23,5eece14efc13ae6609000024,140572936,10
...,...,...,...
35906,5eece14ffc13ae6609000152,332043391,12
35929,5eece14efc13ae660900000d,762752770,10
35944,5eece14ffc13ae660900011c,92186453,10
35951,5eece14ffc13ae6609000118,135453406,10


In [12]:
df_title = pd.read_csv('./sample/post/post_data.csv')
titles = df_title['title'].tolist()
post_id = df_title['post_id'].tolist()
normal_mapping = dict(zip(titles, post_id))
reverse_mapping = dict(zip(post_id, titles))

### Define Functions

In [13]:
def create_interaction_matrix(df, user_col, item_col, rating_col, norm=False, threshold=None):
    interactions = df.groupby([user_col, item_col])[rating_col].sum().unstack().reset_index().fillna(0).set_index(user_col)
    if norm:
        interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    return interactions

In [14]:
def create_user_dict(interactions):
    user_id = list(interactions.index)
    user_dict = {}
    counter = 0
    for i in user_id:
        user_dict[i] = counter
        counter += 1
    return user_dict

In [15]:
def create_item_dict(df, id_col, name_col):
    item_dict = {}
    for i in range(df.shape[0]):
        item_dict[(df.loc[i, id_col])] = df.loc[i, name_col]
    return item_dict

In [16]:
def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30, n_jobs=4):
    # x = sparse.csr_matrix(interactions.values)
    model = LightFM(loss=loss, k=k, no_components=n_components)
    model = model.fit(x, epochs=epoch, num_threads=n_jobs)
    return model

In [17]:
def sample_recommendation_user(model, interactions, user_id, user_dict, item_dict, threshold=0, nrec_items=10, show=True):
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x, np.arange(n_items)))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))

    known_items = list(pd.Series(interactions.loc[user_id, :][interactions.loc[user_id, :] > threshold].index).sort_values(ascending=False))
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))

    if show:
        print("User {}".format(user_id))
        print("Known Likes:")

        counter = 1
        for i in known_items:
            print("{}- {}".format(counter, i))
            counter += 1

        print("Recommended Items:")
        counter = 1
        for i in scores:
            print("{}- {}".format(counter, i))
            counter += 1

### Create Model Inputs

In [18]:
df_playlist

Unnamed: 0,user_id,post_id,rating
0,5eece14efc13ae660900003c,43094523,10
12,5eece14efc13ae660900001f,293468411,10
14,5eece14efc13ae6609000017,660867347,10
16,5eece14ffc13ae66090000ef,285046033,11
23,5eece14efc13ae6609000024,140572936,10
...,...,...,...
35906,5eece14ffc13ae6609000152,332043391,12
35929,5eece14efc13ae660900000d,762752770,10
35944,5eece14ffc13ae660900011c,92186453,10
35951,5eece14ffc13ae6609000118,135453406,10


In [19]:
interactions = create_interaction_matrix(df=df_playlist, user_col='user_id', item_col='post_id', rating_col='rating', norm=False, threshold=None)
interactions.head()


post_id,10164988,12702125,13211110,15166848,16940159,20696144,20760164,21753916,21851527,23622541,...,976265938,976973324,979521918,983474575,984594176,988703947,990844883,994171620,994506044,995833095
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5eece14efc13ae6609000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5eece14efc13ae6609000006,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5eece14efc13ae6609000008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5eece14efc13ae660900000a,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5eece14efc13ae660900000c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
user_dict = create_user_dict(interactions=interactions)
user_dict

{'5eece14efc13ae6609000003': 0,
 '5eece14efc13ae6609000006': 1,
 '5eece14efc13ae6609000008': 2,
 '5eece14efc13ae660900000a': 3,
 '5eece14efc13ae660900000c': 4,
 '5eece14efc13ae660900000d': 5,
 '5eece14efc13ae660900000e': 6,
 '5eece14efc13ae660900000f': 7,
 '5eece14efc13ae6609000010': 8,
 '5eece14efc13ae6609000012': 9,
 '5eece14efc13ae6609000014': 10,
 '5eece14efc13ae6609000017': 11,
 '5eece14efc13ae660900001b': 12,
 '5eece14efc13ae660900001c': 13,
 '5eece14efc13ae660900001d': 14,
 '5eece14efc13ae660900001e': 15,
 '5eece14efc13ae660900001f': 16,
 '5eece14efc13ae6609000022': 17,
 '5eece14efc13ae6609000024': 18,
 '5eece14efc13ae6609000025': 19,
 '5eece14efc13ae6609000026': 20,
 '5eece14efc13ae660900002a': 21,
 '5eece14efc13ae660900002b': 22,
 '5eece14efc13ae660900002c': 23,
 '5eece14efc13ae660900002d': 24,
 '5eece14efc13ae660900002e': 25,
 '5eece14efc13ae6609000030': 26,
 '5eece14efc13ae6609000031': 27,
 '5eece14efc13ae6609000032': 28,
 '5eece14efc13ae6609000033': 29,
 '5eece14efc13ae6609

In [21]:
df_playlist

Unnamed: 0,user_id,post_id,rating
0,5eece14efc13ae660900003c,43094523,10
12,5eece14efc13ae660900001f,293468411,10
14,5eece14efc13ae6609000017,660867347,10
16,5eece14ffc13ae66090000ef,285046033,11
23,5eece14efc13ae6609000024,140572936,10
...,...,...,...
35906,5eece14ffc13ae6609000152,332043391,12
35929,5eece14efc13ae660900000d,762752770,10
35944,5eece14ffc13ae660900011c,92186453,10
35951,5eece14ffc13ae6609000118,135453406,10


In [22]:
item_dict = reverse_mapping

In [23]:
x = sparse.csr_matrix(interactions.values)
train, test = lightfm.cross_validation.random_train_test_split(x, test_percentage=0.2, random_state=None)

### Build a Model

In [24]:
%time
model = runMF(interactions=train, n_components=30, loss='warp', k=15, epoch=30, n_jobs=4)

CPU times: total: 0 ns
Wall time: 988 µs


In [25]:
train_auc = auc_score(model, train, num_threads=4).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

Collaborative filtering train AUC: 0.9900784


In [26]:
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

In [27]:
print('train precision: %.2f, test precision %.2f' % (train_precision, test_precision))

train precision: 0.70, test precision 0.31


### Examples

In [28]:
rec_list = sample_recommendation_user(model=model, interactions=interactions, user_id='5eece14ffc13ae66090001ba', user_dict=user_dict, item_dict=item_dict, threshold=0, nrec_items=10, show=True)

User 5eece14ffc13ae66090001ba
Known Likes:
1-  Winning Tactics For DANCE
2-  Sick And Tired Of Doing GST The Old Way? Read This
3-  Never Changing GST Will Eventually Destroy You
4-  Is ART Worth [$] To You?
5-  How To Start A Business With FASHION DESIGN
6-  Secrets To SCIENCE – Even In This Down Economy
7- HR MANAGEMENT: Do You Really Need It? This Will Help You Decide!
8-  10 Funny PROGRAMMING Quotes
9-  How To Make Your PAINTING Look Amazing In 5 Days
10-  3 Ways To Master FASHION DESIGN Without Breaking A Sweat
11-  Why You Never See PROGRAMMING That Actually Works
12-  Quick and Easy Fix For Your POLITICAL
13-  Take 10 Minutes to Get Started With HR MANAGEMENT
14- CRAFT And The Chuck Norris Effect
15-  The Ultimate Deal On PAINTING
Recommended Items:
1-  Fascinating SCIENCE Tactics That Can Help Your Business Grow
2-  Here Is What You Should Do For Your HR MANAGEMENT
3-  Everything You Wanted to Know About PAINTING and Were Afraid To Ask
4-  27 Ways To Improve HR MANAGEMENT
5- PA

In [29]:
rec_list = sample_recommendation_user(model=model, interactions=interactions, user_id = '5eece14efc13ae6609000047', user_dict=user_dict, item_dict=item_dict, threshold=0, nrec_items=10, show=True)

User 5eece14efc13ae6609000047
Known Likes:
1-  Who Else Wants To Be Successful With BANKING
2-  5 Ways To Get Through To Your BUSINESS
3-  Need More Time? Read These Tips To Eliminate BUSINESS
4-  The Ultimate Secret Of ZOOLOGY
5-  How You Can (Do) POLITICS In 24 Hours Or Less For Free
6-  The Death Of PHOTOGRAPHY And How To Avoid It
7-  Does BUSINESS Sometimes Make You Feel Stupid?
8-  In 10 Minutes, I'll Give You The Truth About DANCE
9-  Can You Pass The PROGRAMMING Test?
10-  Guaranteed No Stress ART
11-  The Lazy Way To CRAFT
12-  How To Lose Money With PAINTING
13-  5 Simple Steps To An Effective SCIENCE Strategy
14-  Find A Quick Way To PAINTING
15-  Are You Embarrassed By Your MATHEMATICS Skills? Here's What To Do
Recommended Items:
1-  These 5 Simple FASHION DESIGN Tricks Will Pump Up Your Sales Almost Instantly
2-  Got Stuck? Try These Tips To Streamline Your CRAFT
3-  Who Else Wants To Enjoy DRAWING
4-  The Number One Reason You Should (Do) BANKING
5-  How We Improved Our AR