In [1]:
import pandas as pd
import numpy as np
import re
import recommender as r
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
nlp = spacy.load('en_core_web_sm')

%matplotlib inline

df = pd.read_csv('user-item-interactions.csv')
df_content = pd.read_csv('articles_community.csv')
del df['Unnamed: 0']
del df_content['Unnamed: 0']

df = df.fillna('no_email')

# make sure all articles from df are in df_content
df = pd.merge(df, df_content[['article_id']], on='article_id', how='inner')

pre_data = dict(df.groupby(['email', 'article_id'])['article_id'].count())

list_email = []
list_article_id = []
list_nb_interactions = []

for key, val in pre_data.items():
    list_email.append(key[0])
    list_article_id.append(key[1])
    list_nb_interactions.append(val)
    
zipped_list = list(zip(list_email, list_article_id, list_nb_interactions))
interaction_user = pd.DataFrame(zipped_list, columns=['email','article_id','nb_interactions'])

df = pd.merge(df, interaction_user, on=['email', 'article_id'])

df_content = df_content.drop(labels='doc_status', axis=1)
df_content = df_content.drop_duplicates()

df = df.replace('no_email', np.nan)

def email_mapper():
    coded_dict = dict()
    cter = 1
    email_encoded = []
    
    for val in df['email']:
        if val not in coded_dict:
            coded_dict[val] = cter
            cter+=1
        
        email_encoded.append(coded_dict[val])
    return email_encoded

email_encoded = email_mapper()
del df['email']
df['user_id'] = email_encoded

#dict_views = dict(df.article_id.value_counts())
#df['popularity_article_nb_views'] = df.article_id.map(dict_views)


dict_interactions_total = dict(df.groupby('user_id')['article_id'].count())
df['nb_interactions_total'] = df.user_id.map(dict_interactions_total)

df['importance_article'] = np.where(df.nb_interactions_total > 19,
                                    df.nb_interactions/df.nb_interactions_total,
                                    np.nan)

# just for the module (later), I'll change the name doc_full_name to title
df_content['title'] = df_content['doc_full_name']
df_content = df_content.drop(labels='doc_full_name', axis=1)

# just for the module (later), I'll a column called date, set to 0
df['date'] = 0

df.article_id = df.article_id.astype('int64')
df_content.article_id = df_content.article_id.astype('int64')

In [2]:
df.head()

Unnamed: 0,article_id,title,nb_interactions,user_id,nb_interactions_total,importance_article,date
0,593,upload files to ibm data science experience us...,1,1,40,0.025,0
1,593,upload files to ibm data science experience us...,1,2,11,,0
2,593,upload files to ibm data science experience us...,2,3,3,,0
3,593,upload files to ibm data science experience us...,2,3,3,,0
4,593,upload files to ibm data science experience us...,1,4,1,,0


In [3]:
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)"
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX


#### Engineering the feature Using NLP

In [4]:
def nlp_task(data):
    sent = data
    doc=nlp(sent)
    sub_toks = [tok for tok in doc if ((tok.dep_ == "compound") or (tok.dep_ == "dobj") or (tok.dep_ == "pobj")) ]
    insight_words = [str(i).lower() for i in sub_toks]
    insight_words = ','.join(insight_words)
    return insight_words

df_content['insights'] = df_content.title.apply(nlp_task)
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title,insights
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...,"detect,malfunctioning,iot,streaming,analytics"
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...,"data,science,work"
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)","data,science"
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...,"datalayer,performance,database"
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX,"analyze,ny,restaurant,spark,dsx"


##### Create list of insights

In [5]:
insights_list = []

for i in df_content.insights:
    try:
        insights_list.extend(i.split(','))
    except AttributeError:
        pass

#normalize
insights_list = [i.lower() for i in insights_list]

#remove all non character from the list
regex = re.compile('[^a-zA-Z]')
insights_list = [regex.sub('non_charac', i) for i in insights_list]
insights_list = set(insights_list)
insights_list.remove('non_charac')

##### Fill df_content with columns of insight

In [6]:
def split_insights(val):
    try:
        if val.find(insight) > -1:
            return 1
        else:
            return 0
    except AttributeError:
        return 0
    
for insight in insights_list:
    df_content[insight] = df_content['insights'].apply(split_insights)

In [7]:
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title,insights,Unnamed: 6,translating,campaign,ops,brewery,...,hashtags,apps,foundational,action,artificial,adversarial,summit,prem,serverless,datasets
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)",0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Remove some columns
df_content = df_content.drop(labels=['insights', 'doc_body', 'doc_description', ''], axis=1)
df = df.drop(labels=['nb_interactions', 'nb_interactions_total'], axis=1)

#### Recommendations

In [139]:
rec = r.Recommender(df_items=df_content,
                    df_reviews=df,
                    item_name_colname='title',
                    user_id_colname='user_id',
                    item_id_colname='article_id',
                    rating_col_name='importance_article',
                    date_col_name='date')

In [140]:
rec.fit(iters=200, latent_features=15)

Create User-Item matrix...
Train data with Funk Sigular Value Decomposition...
Iterations 		 Mean Squared Error 
	1 		 13.012184135500059 
	2 		 12.057805437759072 
	3 		 11.211637970456854 
	4 		 10.457326576568285 
	5 		 9.7815609964612 
	6 		 9.173410199553405 
	7 		 8.623822262571538 
	8 		 8.125244001829591 
	9 		 7.671328423152714 
	10 		 7.256707381182806 
	11 		 6.876813221918509 
	12 		 6.527737614322864 
	13 		 6.206118897140071 
	14 		 5.90905149203967 
	15 		 5.634012539623515 
	16 		 5.378802086040513 
	17 		 5.141494011225916 
	18 		 4.920395532237928 
	19 		 4.7140135976223725 
	20 		 4.521026854132409 
	21 		 4.340262146090047 
	22 		 4.170674722255931 
	23 		 4.0113314913171525 
	24 		 3.8613967967642537 
	25 		 3.720120283711936 
	26 		 3.5868265105999066 
	27 		 3.4609060225605086 
	28 		 3.341807654236752 
	29 		 3.229031870778695 
	30 		 3.122124988785418 
	31 		 3.02067414574675 
	32 		 2.92430290835504 
	33 		 2.832667427905891 
	34 		 2.745453065665916 
	35 		 2

#### investigate the user-Item matrix created

In [141]:
rec.user_item_df.head()

article_id,0,2,4,8,9,12,14,15,16,18,...,1028,1030,1035,1038,1042,1043,1044,1047,1048,1050
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,0.025,,,,...,,,,,0.025,,,0.025,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [142]:
def info():
    user_item = rec.user_item_df
    nb_user = rec.n_users
    nb_items = rec.n_items
    item_name = rec.item_name_colname
    item_id = rec.item_id_colname
    u_mat = rec.user_mat
    i_mat = rec.item_mat
    user_high_rate = list(dict(user_item.mean(axis=1).sort_values(ascending=False).head(1)).keys())[0]
    movie_id_high_rate = list(dict(user_item.mean(axis=0).sort_values(ascending=False).head(1)).keys())[0]
    movie_name_high_rate = tuple(df_content[df_content[item_id] == movie_id_high_rate][item_name])[0]
    

    print(f"Nb of users: {nb_user}")
    print(f"Nb of items: {nb_items}")
    print(f"The user_id with the highest avg rating given: {user_high_rate}")
    print(f"The article_id with the highest avg rating received: {movie_id_high_rate}")
    print(f"The article name with the highest avg rating received: {movie_name_high_rate}")
    print(f"Shape of the U matrix: {u_mat.shape}")
    print(f"Shape of the V(transpose) matrix: {i_mat.shape}")

In [143]:
info()

Nb of users: 4258
Nb of items: 437
The user_id with the highest avg rating given: 954
The article_id with the highest avg rating received: 366
The article name with the highest avg rating received: Clustering: A Guide for the Perplexed
Shape of the U matrix: (4258, 15)
Shape of the V(transpose) matrix: (15, 437)


#### Imagine that we have an article that we want to promote, but we want the top 10 users who may interested by this offer, the movie id is 984.

In [158]:
def may_interested_by(item_id, top_n=10):
    pred = {}
    # iterate over each users and predict the rate it will give to this movie
    for user in user_item.index:
        pred[user] = rec.predict_rating(user_id=user, item_id=item_id)

    top_10_pairs = sorted(pred.items(), key=lambda x: x[1], reverse=True)[:top_n]
    top_10_user_ids = []

    for i in top_10_pairs:
        top_10_user_ids.append(i[0])

    return top_10_user_ids

In [159]:
may_interested_by(984, 10)

[3358, 3889, 1405, 3402, 4193, 4192, 1149, 2896, 750, 2974]

#### To make recommendation: given an item id we want to find similar items to this item. Similarity are found by computing the dot product of items with its transpose, the more the result of an item-item pair is high, the more they have in common

In [149]:
print('prepare dot matrice for recommendations...')

def prep_get_similar_items():
    item_content = np.array(df_content.iloc[:,2:])
    item_content_transpose = np.transpose(item_content)
    dot_prod = item_content.dot(item_content_transpose)
    return dot_prod

dot_product_matrix = prep_get_similar_items()
print('Ok')

prepare dot matrice for recommendations...
Ok


In [150]:
dot_product_matrix

array([[9, 1, 1, ..., 1, 0, 1],
       [1, 4, 3, ..., 1, 0, 1],
       [1, 3, 3, ..., 1, 0, 1],
       ...,
       [1, 1, 1, ..., 5, 0, 1],
       [0, 0, 0, ..., 0, 2, 0],
       [1, 1, 1, ..., 1, 0, 2]])

##### As we can see, maximum values are in the diagonal because the most similar item to an item is itself

In [185]:
def display_recommendations(rec_ids, rec_names, message):
    
    if type(rec_ids) == type(None):
        print(f"{message}")
    
    else:
        dict_id_name = dict(zip(rec_ids, rec_names))
        
        print(f"{message}")
        
        for key, val  in dict_id_name.items():
            print(f"ID items: {key}")
            print(f"Name: {val}\n")

***
#### Existing user

- Because it is an existing user, recommendations are made using FunkSVD (matrix factorisation), it will predict the rating it will give to all items and get back the items associate with the the top predicted rate. The dot_product_matrix will not be used but is requiered in case of you want to find a similar item to another instead of finding best item for a user. 

In [186]:
rec_ids, rec_names, message = rec.make_recommendations(_id=3358,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='user',
                                                       rec_num=5)

display_recommendations(rec_ids, rec_names, message)

Glad to see you again! recommended for you:

ID items: 984
Name: Building a business that combines human experts and data science

ID items: 662
Name: Build Deep Learning Architectures With Neural Network Modeler

ID items: 763
Name: Load data into RStudio for analysis in DSX

ID items: 973
Name: Recent trends in recommender systems

ID items: 586
Name: The Data Processing Inequality



***
#### New User

- Because it is new user, recommendations are given using ranked based method, which simply return back the most popular items according to the ratings given by users, the number of ratings, the recency of the ratings. The dot_product_matrix will not be used but is requiered in case of you want to find a similar item to another instead of finding best item for a user.

In [187]:
rec_ids, rec_names, message = rec.make_recommendations(_id=8000,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='user',
                                                       rec_num=5)

display_recommendations(rec_ids, rec_names, message)

Hey, you are new here, this is for you:

ID items: 251
Name: Data science expert interview: Dez Blanchfield, Craig Brown, David Mathison, Jennifer Shin and Mike Tamir part 2

ID items: 1018
Name: Clustering: A Guide for the Perplexed

ID items: 977
Name: Rapidly build Machine Learning flows with DSX

ID items: 366
Name: Apache Spark as the New Engine of Genomics

ID items: 732
Name: 7 types of job profiles that makes you a Data Scientist



****
#### Existing items

- Here we enter an item id and would like to find similar items using the dot_product_matrix computed earlier.

In [188]:
rec_ids, rec_names, message = rec.make_recommendations(_id=100,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='item',
                                                       rec_num=5, window=1)

display_recommendations(rec_ids, rec_names, message)

Similar items for id:100, corresponding to Use data assets in a project using IBM Data Catalog:

ID items: 5
Name: Browsing PostgreSQL Data with Compose

ID items: 16
Name: Higher-order Logistic Regression for Large Datasets

ID items: 30
Name: How open API economy accelerates the growth of big data and analytics

ID items: 43
Name: Deep Learning With Tensorflow Course by Big Data University

ID items: 49
Name: GeoFile: Using OpenStreetMap Data in Compose PostgreSQL - Part II



#### Simulate an error by passing a non existing item id

In [189]:
rec_ids, rec_names, message = rec.make_recommendations(_id=187600,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='item',
                                                       rec_num=5, window=1)

display_recommendations(rec_ids, rec_names, message)

We can't make recommendation for this item, please makesure the data was updated with this item.

