### Some Cleaning steps

In [20]:
import pandas as pd
import numpy as np
import re
import recommender as r
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
nlp = spacy.load('en_core_web_sm')

%matplotlib inline

df = pd.read_csv('user-item-interactions.csv')
df_content = pd.read_csv('articles_community.csv')
del df['Unnamed: 0']
del df_content['Unnamed: 0']

# make sure all articles from df are in df_content
df = pd.merge(df, df_content[['article_id']], on='article_id', how='inner')

df_content = df_content.drop(labels='doc_status', axis=1)
df_content = df_content.drop_duplicates()

#df = df.replace('no_email', np.nan)

def email_mapper():
    coded_dict = dict()
    cter = 1
    email_encoded = []
    
    for val in df['email']:
        if val not in coded_dict:
            coded_dict[val] = cter
            cter+=1
        
        email_encoded.append(coded_dict[val])
    return email_encoded

email_encoded = email_mapper()
del df['email']
df['user_id'] = email_encoded

# Rename my article name field to be the same in both dfs
df_content['title'] = df_content['doc_full_name']
df_content = df_content.drop(labels='doc_full_name', axis=1)

df.article_id = df.article_id.astype('int64')
df_content.article_id = df_content.article_id.astype('int64')

In [21]:
df.head()

Unnamed: 0,article_id,title,user_id
0,593,upload files to ibm data science experience us...,1
1,593,upload files to ibm data science experience us...,2
2,593,upload files to ibm data science experience us...,3
3,593,upload files to ibm data science experience us...,4
4,593,upload files to ibm data science experience us...,3


In [22]:
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)"
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX


#### Engineering the feature Using NLP

In [23]:
def nlp_task(data):
    sent = data
    doc=nlp(sent)
    sub_toks = [tok for tok in doc if ((tok.dep_ == "compound") or (tok.dep_ == "dobj") or (tok.dep_ == "pobj")) ]
    insight_words = [str(i).lower() for i in sub_toks]
    insight_words = ','.join(insight_words)
    return insight_words

df_content['insights'] = df_content.title.apply(nlp_task)
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title,insights
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...,"detect,malfunctioning,iot,streaming,analytics"
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...,"data,science,work"
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)","data,science"
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...,"datalayer,performance,database"
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX,"analyze,ny,restaurant,spark,dsx"


##### Create list of insights

In [24]:
insights_list = []

for i in df_content.insights:
    try:
        insights_list.extend(i.split(','))
    except AttributeError:
        pass

#normalize
insights_list = [i.lower() for i in insights_list]

#remove all non character from the list
regex = re.compile('[^a-zA-Z]')
insights_list = [regex.sub('non_charac', i) for i in insights_list]
insights_list = set(insights_list)
insights_list.remove('non_charac')

##### Fill df_content with columns of insight

In [25]:
def split_insights(val):
    try:
        if val.find(insight) > -1:
            return 1
        else:
            return 0
    except AttributeError:
        return 0
    
for insight in insights_list:
    df_content[insight] = df_content['insights'].apply(split_insights)

In [26]:
# Remove some columns
df_content = df_content.drop(labels=['insights', 'doc_body', 'doc_description', ''], axis=1)

df_content.head()

Unnamed: 0,article_id,title,scala,queries,seium,traffic,recommendation,dinesh,examples,eye,...,random,full,sets,red,chronic,tidyr,pis,simple,sector,web
0,0,Detect Malfunctioning IoT Sensors with Streami...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Communicating data science: A guide to present...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,"This Week in Data Science (April 18, 2017)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,DataLayer Conference: Boost the performance of...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,Analyze NY Restaurant data using Spark in DSX,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# number of times the user seen the article
pre_data = dict(df.groupby(['user_id', 'article_id'])['article_id'].count())

list_user = []
list_article_id = []
list_nb_interactions = []

for key, val in pre_data.items():
    list_user.append(key[0])
    list_article_id.append(key[1])
    list_nb_interactions.append(val)
    
zipped_list = list(zip(list_user, list_article_id, list_nb_interactions))
interaction_user = pd.DataFrame(zipped_list, columns=['user_id','article_id','nb_interactions_user_article'])

df = pd.merge(df, interaction_user, on=['user_id','article_id'])

# The nb interactions was shown implicitly with the number of rows
# Now I have information about the number of interactions by creating this column
# I can drop duplicated rows
df = df.drop_duplicates(keep='first')
df = df.reset_index().drop(labels='index', axis=1)
df.head()

Unnamed: 0,article_id,title,user_id,nb_interactions_user_article
0,593,upload files to ibm data science experience us...,1,1
1,593,upload files to ibm data science experience us...,2,1
2,593,upload files to ibm data science experience us...,3,2
3,593,upload files to ibm data science experience us...,4,1
4,593,upload files to ibm data science experience us...,5,1


In [37]:
art_nb_inter = dict(df.groupby('article_id')['nb_interactions_user_article'].count())
df['nb_views_of_the_article'] = df.article_id.map(art_nb_inter)
df.head()

Unnamed: 0,article_id,title,user_id,nb_interactions_user_article,nb_views_of_the_article
0,593,upload files to ibm data science experience us...,1,1,108
1,593,upload files to ibm data science experience us...,2,1,108
2,593,upload files to ibm data science experience us...,3,2,108
3,593,upload files to ibm data science experience us...,4,1,108
4,593,upload files to ibm data science experience us...,5,1,108


In [40]:
avg_nb_interactions_of_the_article = dict(df.groupby('article_id')['nb_interactions_user_article'].mean())
df['avg_nb_interactions_of_the_article'] = df.article_id.map(avg_nb_interactions_of_the_article)
df.head()

Unnamed: 0,article_id,title,user_id,nb_interactions_user_article,nb_views_of_the_article,avg_nb_interactions_of_the_article
0,593,upload files to ibm data science experience us...,1,1,108,1.185185
1,593,upload files to ibm data science experience us...,2,1,108,1.185185
2,593,upload files to ibm data science experience us...,3,2,108,1.185185
3,593,upload files to ibm data science experience us...,4,1,108,1.185185
4,593,upload files to ibm data science experience us...,5,1,108,1.185185


In [47]:
# Compute Weighted rate

v = df.nb_views_of_the_article
m = df.nb_views_of_the_article.quantile(0.80)
R = df.avg_nb_interactions_of_the_article
C = df.nb_interactions_user_article.mean()

df['rank_score'] = (v/(v+m) * R) + (m/(m+v) * C)

df.head()

Unnamed: 0,article_id,title,user_id,nb_interactions_user_article,nb_views_of_the_article,avg_nb_interactions_of_the_article,rank_score
0,593,upload files to ibm data science experience us...,1,1,108,1.185185,1.197335
1,593,upload files to ibm data science experience us...,2,1,108,1.185185,1.197335
2,593,upload files to ibm data science experience us...,3,2,108,1.185185,1.197335
3,593,upload files to ibm data science experience us...,4,1,108,1.185185,1.197335
4,593,upload files to ibm data science experience us...,5,1,108,1.185185,1.197335


In [58]:
ranked = df.sort_values(by=['rank_score'], ascending=False)



[221,
 50,
 232,
 43,
 651,
 732,
 12,
 29,
 943,
 237,
 108,
 510,
 213,
 486,
 398,
 684,
 977,
 725,
 969,
 542,
 241,
 268,
 20,
 251,
 353,
 1016,
 409,
 1018,
 600,
 936,
 729,
 448,
 477,
 390,
 865,
 250,
 910,
 812,
 33,
 795,
 437,
 1028,
 366,
 151,
 606,
 686,
 681,
 557,
 607,
 369,
 793,
 2,
 673,
 891,
 844,
 933,
 730,
 428,
 751,
 722,
 240,
 645,
 705,
 1000,
 949,
 1008,
 1025,
 678,
 164,
 825,
 641,
 766,
 655,
 109,
 53,
 555,
 521,
 429,
 609,
 122,
 930,
 316,
 588,
 679,
 788,
 362,
 323,
 760,
 1050,
 876,
 534,
 1038,
 857,
 517,
 961,
 853,
 616,
 303,
 153,
 482,
 887,
 965,
 76,
 355,
 302,
 881,
 463,
 749,
 299,
 846,
 399,
 668,
 492,
 422,
 724,
 653,
 662,
 417,
 984,
 974,
 675,
 404,
 903,
 444,
 412,
 508,
 346,
 124,
 870,
 972,
 940,
 364,
 575,
 708,
 384,
 636,
 778,
 443,
 758,
 430,
 416,
 644,
 757,
 677,
 947,
 499,
 504,
 376,
 586,
 379,
 935,
 544,
 0,
 266,
 194,
 996,
 631,
 363,
 395,
 375,
 880,
 98,
 78,
 420,
 985,
 800,
 997,
 610

I think a user-user CF is more interested, because we can't have ratings, so we can just say that a user is interested by an article only if he
has interacted with it.
So the whole interactions in the dataset means (OK I'm interested).

So by now, I'll create a col called interacted full of 1.
When the user-item matrix will be create, we will get a matrice with value 1 if a user has interacted with an article
np.nan otherwise.

I don't have any date of interraction, so just for the module, I'll simulate a date field set to 0. Popularity of an item is based on ratings, then number of rating, then, the recency on the rating. By given a date field set to the same number for all observations, the popularity of the item is not influenced by a fake date.

In [None]:
#df['interacted'] = 1
#df['date'] = 0
#df.head()

#### Recommendations

In [20]:
rec = r.Recommender(df_items=df_content,          # df that contains all the items with description and more
                    df_reviews=df,                # df that contains interactions between users and items
                    item_name_colname='title',    # The title column of the df (this can be use with the 1st df or the 2nd, that why I wanted the same name for both)
                    user_id_colname='user_id',    # The name of the user id column
                    item_id_colname='article_id', # The name of the item id column
                    rating_col_name='interacted', # The rating column
                    date_col_name='date')         # The date column

In [21]:
rec.fit(iters=100, latent_features=10)

Create User-Item matrix...
Train data with Funk Sigular Value Decomposition...
Iterations 		 Mean Squared Error 
	1 		 2.729241626990958 
	2 		 2.4508568219698725 
	3 		 2.217445275518026 
	4 		 2.019644867064396 
	5 		 1.8503764478111702 
	6 		 1.7042216616859707 
	7 		 1.5769862751279797 
	8 		 1.465389066539344 
	9 		 1.3668372158608009 
	10 		 1.2792622134197655 
	11 		 1.20099869049061 
	12 		 1.130694060855732 
	13 		 1.0672405207644828 
	14 		 1.0097234338216763 
	15 		 0.9573818320541022 
	16 		 0.9095779518561473 
	17 		 0.8657735602633064 
	18 		 0.8255114227699649 
	19 		 0.7884006920953767 
	20 		 0.7541053076878689 
	21 		 0.7223347225278133 
	22 		 0.6928364406800697 
	23 		 0.6653899727093928 
	24 		 0.6398019082948521 
	25 		 0.6159018745779214 
	26 		 0.5935392010067947 
	27 		 0.5725801510832773 
	28 		 0.5529056116767356 
	29 		 0.5344091537886855 
	30 		 0.5169953965637945 
	31 		 0.500578620233852 
	32 		 0.48508158450855543 
	33 		 0.47043451741008674 
	34 		 0.45

#### investigate the user-Item matrix created

In [22]:
rec.user_item_df.head()

article_id,0,2,4,8,9,12,14,15,16,18,...,1028,1030,1035,1038,1042,1043,1044,1047,1048,1050
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,1.0,,,,...,,,,,1.0,,,1.0,,
2,,,,,,1.0,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


##### Get some Infos

In [24]:
def info():
    user_item = rec.user_item_df
    nb_user = rec.n_users
    nb_items = rec.n_items
    item_name = rec.item_name_colname
    item_id = rec.item_id_colname
    u_mat = rec.user_mat
    i_mat = rec.item_mat
    user_high_rate = list(dict(user_item.mean(axis=1).sort_values(ascending=False).head(1)).keys())[0]
    movie_id_high_rate = list(dict(user_item.mean(axis=0).sort_values(ascending=False).head(1)).keys())[0]
    movie_name_high_rate = tuple(df_content[df_content[item_id] == movie_id_high_rate][item_name])[0]
    

    print(f"Nb of users: {nb_user}")
    print(f"Nb of items: {nb_items}")
    print(f"The user_id with the highest avg rating given: {user_high_rate}")
    print(f"The article_id with the highest avg rating received: {movie_id_high_rate}")
    print(f"The article name with the highest avg rating received: {movie_name_high_rate}")
    print(f"Shape of the U matrix: {u_mat.shape}")
    print(f"Shape of the V(transpose) matrix: {i_mat.shape}")

info()

Nb of users: 4258
Nb of items: 437
The user_id with the highest avg rating given: 4258
The article_id with the highest avg rating received: 1050
The article name with the highest avg rating received: Jupyter Notebooks with Scala, Python, or R Kernels
Shape of the U matrix: (4258, 10)
Shape of the V(transpose) matrix: (10, 437)


****
##### To make recommendation: given an item id we want to find similar items to this item. Similarity are found by computing the dot product of items with its transpose, the more the result of an item-item pair is high, the more they have in common.

In [25]:
def prep_get_similar_items():
    item_content = np.array(df_content.iloc[:,2:]) # subset of the df of items that contains only dummy variables that you have created
    item_content_transpose = np.transpose(item_content)
    dot_prod = item_content.dot(item_content_transpose)
    return dot_prod

dot_product_matrix = prep_get_similar_items()

##### The same here for user similarity (CF user-based)

In [26]:
df_user_similarity = rec.user_item_df.reset_index().replace(np.nan, 0)
def prep_get_similar_user():
    user_content = np.array(df_user_similarity.iloc[:,1:])
    user_content_transpose = np.transpose(user_content)
    dot_prod = user_content.dot(user_content_transpose)
    return dot_prod

dot_product_matrix_user = prep_get_similar_user()

#### Imagine that we have an article that we want to promote, but we want the top 10 users who may interested by this offer, the item id is 984.

In [29]:
user_item = rec.user_item_df
def may_interested_by(item_id, top_n=10):
    pred = {}
    # iterate over each users and predict the rate it will give to this movie
    for user in user_item.index:
        pred[user] = rec.predict_rating(user_id=user, item_id=item_id)

    top_10_pairs = sorted(pred.items(), key=lambda x: x[1], reverse=True)[:top_n]
    top_10_user_ids = []

    for i in top_10_pairs:
        top_10_user_ids.append(i[0])

    return top_10_user_ids

print('These users may interested by this item:')
for i in may_interested_by(984, 10):
    print(f"- {i}")

These users may interested by this item:
- 3845
- 2215
- 1937
- 2282
- 3710
- 871
- 3222
- 4209
- 1668
- 2196


##### Prepare a function to display results

In [30]:
def display_recommendations(rec_ids, rec_names, message, rec_ids_users, rec_user_articles):
    
    if type(rec_ids) == type(None):
        print(f"{message}")
    
    else:
        dict_id_name = dict(zip(rec_ids, rec_names))
        
        if type(rec_ids_users) != type(None):
            print('Matrix Factorisation SVD:')
            print(f"\t{message}")
            
            for key, val  in dict_id_name.items():
                print(f"\t- ID items: {key}")
                print(f"\tName: {val}\n")

            print('CF User Based:')
            print('\tUser that are similar to you also seen:\n')
            for i in rec_user_articles[:5]:
                print(f"\t- {i}")
        else:
            print(f"\t{message}")
            dict_id_name = dict(zip(rec_ids, rec_names))
            for key, val  in dict_id_name.items():
                print(f"\t- ID items: {key}")
                print(f"\tName: {val}\n")

***
#### Existing user

- Because it is an existing user, recommendations are made using FunkSVD (matrix factorisation), it will predict the rating it will give to all items and get back the items associate with the the top predicted rate. The dot_product_matrix will not be used but is requiered in case of you want to find a similar item to another instead of finding best item for a user. 

In [31]:
rec_ids, rec_names, message, rec_ids_users, rec_user_articles = rec.make_recommendations(_id=3,
                                                                                         dot_prod=dot_product_matrix,
                                                                                         dot_prod_user= dot_product_matrix_user,
                                                                                         _id_type='user',
                                                                                         rec_num=5)
display_recommendations(rec_ids, rec_names, message, rec_ids_users, rec_user_articles)

Matrix Factorisation SVD:
	Glad to see you again! recommended for you:

	- ID items: 961
	Name: Run DSX Notebooks on Amazon EMR

	- ID items: 675
	Name: Load Db2 Warehouse on Cloud data with Apache Spark in DSX

	- ID items: 443
	Name: Webinar: April 11 - Thinking inside the box: you can do that inside a data frame?!

	- ID items: 395
	Name: Don’t overlook simpler techniques and algorithms

	- ID items: 884
	Name: Beyond Parallelize and Collect

CF User Based:
	User that are similar to you also seen:

	- 0 to life-changing app: new apache systemml api on spark shell
	- jupyter notebook tutorial
	- developing for the ibm streaming analytics service
	- brunel 2.0 preview
	- tensorflow quick tips


***
#### New User

- Because it is new user, recommendations are given using ranked based method, which simply return back the most popular items according to the ratings given by users, the number of ratings, the recency of the ratings. The dot_product_matrix will not be used but is requiered in case of you want to find a similar item to another instead of finding best item for a user.

In [32]:
rec_ids, rec_names, message, rec_ids_users, rec_user_articles = rec.make_recommendations(_id=8000,
                                                                                         dot_prod=dot_product_matrix,
                                                                                         dot_prod_user= dot_product_matrix_user,
                                                                                         _id_type='user',
                                                                                         rec_num=5)

print('Ranked Based:')
display_recommendations(rec_ids, rec_names, message, rec_ids_users, rec_user_articles)

Ranked Based:
	Hey, you are new here, this is for you:

	- ID items: 43
	Name: Working interactively with RStudio and notebooks in DSX

	- ID items: 151
	Name: Deep Learning With Tensorflow Course by Big Data University

	- ID items: 124
	Name: Python Machine Learning: Scikit-Learn Tutorial

	- ID items: 390
	Name: Jupyter Notebook Tutorial

	- ID items: 20
	Name: Introducing IBM Watson Studio 



****
#### Existing items

- Here we enter an item id and would like to find similar items using the dot_product_matrix computed earlier.

In [41]:
rec_ids, rec_names, message, rec_ids_users, rec_user_articles = rec.make_recommendations(_id=100,
                                                                                         dot_prod=dot_product_matrix,
                                                                                         dot_prod_user= dot_product_matrix_user,
                                                                                         _id_type='item',
                                                                                         rec_num=5, window=3) #change the window arg if you don't have the rec_num declared 

print('CF Content Based:')
display_recommendations(rec_ids, rec_names, message, rec_ids_users, rec_user_articles)

CF Content Based:
	Similar items for id:100, corresponding to Use data assets in a project using IBM Data Catalog:

	- ID items: 16
	Name: Higher-order Logistic Regression for Large Datasets

	- ID items: 100
	Name: Use data assets in a project using IBM Data Catalog

	- ID items: 349
	Name: IBM Data Science Experience White paper - SparkR Transforming R into a tool for big data analytics

	- ID items: 417
	Name: IBM Data Catalog Overview

	- ID items: 453
	Name: Baby’s first IBM Graph app using Node.js – IBM Watson Data Lab



#### Simulate an error by passing a non existing item id

In [42]:
rec_ids, rec_names, message, rec_ids_users, rec_user_articles = rec.make_recommendations(_id=1087630,
                                                                                         dot_prod=dot_product_matrix,
                                                                                         dot_prod_user= dot_product_matrix_user,
                                                                                         _id_type='item',
                                                                                         rec_num=5, window=2)


display_recommendations(rec_ids, rec_names, message, rec_ids_users, rec_user_articles)

We can't make recommendation for this item, please makesure the data was updated with this item.

