In [1]:
import pandas as pd
import numpy as np
import recommender as r

# Nlp
import re
import nltk
from nltk.corpus import stopwords
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
#nlp = spacy.load('en_core_web_sm')

# Viz
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Data analysis + Cleaning steps

In [7]:
df = pd.read_csv('user-item-interactions.csv')
df_content = pd.read_csv('articles_community.csv')
del df['Unnamed: 0']
del df_content['Unnamed: 0']

# make sure all articles from df are in df_content
df = pd.merge(df, df_content[['article_id']], on='article_id', how='inner')

df_content = df_content.drop(labels='doc_status', axis=1)
df_content = df_content.drop_duplicates()

#df = df.replace('no_email', np.nan)

def email_mapper():
    coded_dict = dict()
    cter = 1
    email_encoded = []
    
    for val in df['email']:
        if val not in coded_dict:
            coded_dict[val] = cter
            cter+=1
        
        email_encoded.append(coded_dict[val])
    return email_encoded

email_encoded = email_mapper()
del df['email']
df['user_id'] = email_encoded

# Rename my article name field to be the same in both dfs
df_content['title'] = df_content['doc_full_name']
df_content = df_content.drop(labels='doc_full_name', axis=1)

df.article_id = df.article_id.astype('int64')
df_content.article_id = df_content.article_id.astype('int64')

In [8]:
df.head()

Unnamed: 0,article_id,title,user_id
0,593,upload files to ibm data science experience us...,1
1,593,upload files to ibm data science experience us...,2
2,593,upload files to ibm data science experience us...,3
3,593,upload files to ibm data science experience us...,4
4,593,upload files to ibm data science experience us...,3


In [9]:
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)"
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX


In [10]:
# number of times the user seen the article
pre_data = dict(df.groupby(['user_id', 'article_id'])['article_id'].count())

list_user = []
list_article_id = []
list_nb_interactions = []

for key, val in pre_data.items():
    list_user.append(key[0])
    list_article_id.append(key[1])
    list_nb_interactions.append(val)
    
zipped_list = list(zip(list_user, list_article_id, list_nb_interactions))
interaction_user = pd.DataFrame(zipped_list, columns=['user_id','article_id','nb_interactions_user_article'])

df = pd.merge(df, interaction_user, on=['user_id','article_id'])

# The nb interactions was shown implicitly with the number of rows
# Now I have information about the number of interactions by creating this column
# I can drop duplicated rows
df = df.drop_duplicates(keep='first')
df = df.reset_index().drop(labels='index', axis=1)
df.head()

Unnamed: 0,article_id,title,user_id,nb_interactions_user_article
0,593,upload files to ibm data science experience us...,1,1
1,593,upload files to ibm data science experience us...,2,1
2,593,upload files to ibm data science experience us...,3,2
3,593,upload files to ibm data science experience us...,4,1
4,593,upload files to ibm data science experience us...,5,1


I don't have any date of the interraction between user and article, so just for the module, I'll simulate a date field set to 0. Popularity of an item is computed by the weighted rating calcul shown in recommender_function.py, but in case of tie, the more recent date will be first, so by putting a same value for all the records, I'm not influence the importance of an article. Here I chose 0

In [11]:
df['date'] = 0

### Clean Pipeline

In [2]:
def clean_pipeline(df_reviews, df_items):
    
    df = df_reviews.copy()
    df_content = df_items.copy()
    
    if 'Unnamed: 0' in df.columns:
        del df['Unnamed: 0']
        
    if 'Unnamed: 0' in df_content.columns:
        del df_content['Unnamed: 0']

    # make sure all articles from df are in df_content
    df = pd.merge(df, df_content[['article_id']], on='article_id', how='inner')

    df_content = df_content.drop(labels='doc_status', axis=1)
    df_content = df_content.drop_duplicates()

    #df = df.replace('no_email', np.nan)

    def email_mapper():
        coded_dict = dict()
        cter = 1
        email_encoded = []

        for val in df['email']:
            if val not in coded_dict:
                coded_dict[val] = cter
                cter+=1

            email_encoded.append(coded_dict[val])
        return email_encoded

    email_encoded = email_mapper()
    del df['email']
    df['user_id'] = email_encoded

    # Rename my article name field to be the same in both dfs
    df_content['title'] = df_content['doc_full_name']
    df_content = df_content.drop(labels='doc_full_name', axis=1)

    df.article_id = df.article_id.astype('int64')
    df_content.article_id = df_content.article_id.astype('int64')

    # number of times the user seen the article
    pre_data = dict(df.groupby(['user_id', 'article_id'])['article_id'].count())

    list_user = []
    list_article_id = []
    list_nb_interactions = []

    for key, val in pre_data.items():
        list_user.append(key[0])
        list_article_id.append(key[1])
        list_nb_interactions.append(val)

    zipped_list = list(zip(list_user, list_article_id, list_nb_interactions))
    interaction_user = pd.DataFrame(zipped_list, columns=['user_id','article_id','nb_interactions_user_article'])

    df = pd.merge(df, interaction_user, on=['user_id','article_id'])

    # The nb interactions was shown implicitly with the number of rows
    # Now I have information about the number of interactions by creating this column
    # I can drop duplicated rows
    df = df.drop_duplicates(keep='first')
    df = df.reset_index().drop(labels='index', axis=1)

    df['date'] = 0
    
    return df, df_content

In [3]:
df = pd.read_csv('user-item-interactions.csv')
df_content = pd.read_csv('articles_community.csv')

df, df_content = clean_pipeline(df_reviews=df,
                                df_items=df_content)

##### Prepare TFIDF Matrice for Content Based Recommendations

In [4]:
def tokenize(text):
    """
    This function do the following steps:
    - Normalize, remove ponctuations
    - Tokenize the input (str)
    - Remove english stopwords
    - Lemmatize but keep the gramatical sense of the word
    - Clean White spaces
    """
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    words = word_tokenize(text)
    words = [w for w in words if w not in stopwords.words("english")]
    lemmed = [WordNetLemmatizer().lemmatize(w, pos='v') for w in words]
    words_clean = [word.strip() for word in words]
    return words_clean

In [5]:
tfidf = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 2))
df_content['doc_description'] = df_content['doc_description'].fillna('')
tfidf_matrix = tfidf.fit_transform(df_content['doc_description'])

##### Prepare User-Item Matrix to feed my recommender object

In [6]:
def create_user_item():
    user_item = df[['user_id',
                    'article_id',
                    'nb_interactions_user_article']]
    
    user_item_df=(
        user_item.groupby(['user_id',
                           'article_id'])['nb_interactions_user_article'].max().unstack()
    )
    
    return user_item_df

In [7]:
user_item_df = create_user_item()

****
****

#### Recommendations

In [8]:
rec = r.Recommender(df_items=df_content,                            # df that contains all the items with description and more
                    df_reviews=df,                                  # df that contains interactions between users and items
                    user_item_df=user_item_df,                      # A user item df
                    item_name_colname='title',                      # The title column of the df (this can be use with the 1st df or the 2nd, that why I wanted the same name for both)
                    user_id_colname='user_id',                      # The name of the user id column
                    item_id_colname='article_id',                   # The name of the item id column
                    rating_col_name='nb_interactions_user_article', # The rating column
                    date_col_name='date')                           # The date column

In [27]:
rec.fit(iters=100, latent_features=5)

Train data with Funk Sigular Value Decomposition...
Iterations 		 Mean Squared Error 
	1 		 0.6623863732481259 
	2 		 0.6544580615101401 
	3 		 0.6470075236738265 
	4 		 0.6399923516660931 
	5 		 0.6333746904568995 
	6 		 0.627120691241326 
	7 		 0.6212000355791115 
	8 		 0.6155855206583186 
	9 		 0.6102526973050068 
	10 		 0.6051795535851098 
	11 		 0.6003462378771757 
	12 		 0.5957348161680779 
	13 		 0.5913290590641385 
	14 		 0.5871142546395772 
	15 		 0.5830770437798646 
	16 		 0.5792052751353941 
	17 		 0.5754878771915503 
	18 		 0.5719147452967154 
	19 		 0.5684766417771546 
	20 		 0.5651651075148402 
	21 		 0.5619723835771527 
	22 		 0.5588913416708042 
	23 		 0.5559154223506516 
	24 		 0.5530385800510309 
	25 		 0.5502552341257603 
	26 		 0.5475602251856406 
	27 		 0.5449487761112689 
	28 		 0.5424164571966155 
	29 		 0.5399591549459256 
	30 		 0.537573044105049 
	31 		 0.535254562559597 
	32 		 0.5330003887765046 
	33 		 0.5308074215046945 
	34 		 0.5286727614843448 
	35 		 0

#### investigate the user-Item matrix

##### Get some Infos

In [28]:
def info():
    user_item = user_item_df
    nb_user = rec.n_users
    nb_items = rec.n_items
    item_name = rec.item_name_colname
    item_id = rec.item_id_colname
    u_mat = rec.user_mat
    i_mat = rec.item_mat
    user_high_rate = list(dict(user_item.mean(axis=1).sort_values(ascending=False).head(1)).keys())[0]
    movie_id_high_rate = list(dict(user_item.mean(axis=0).sort_values(ascending=False).head(1)).keys())[0]
    movie_name_high_rate = tuple(df_content[df_content[item_id] == movie_id_high_rate][item_name])[0]
    

    print(f"Nb of users: {nb_user}")
    print(f"Nb of items: {nb_items}")
    print(f"The user_id with the highest avg rating given: {user_high_rate}")
    print(f"The article_id with the highest avg rating received: {movie_id_high_rate}")
    print(f"The article name with the highest avg rating received: {movie_name_high_rate}")
    print(f"Shape of the U matrix: {u_mat.shape}")
    print(f"Shape of the V(transpose) matrix: {i_mat.shape}")

info()

Nb of users: 4258
Nb of items: 437
The user_id with the highest avg rating given: 1102
The article_id with the highest avg rating received: 50
The article name with the highest avg rating received: Graph-based machine learning
Shape of the U matrix: (4258, 5)
Shape of the V(transpose) matrix: (5, 437)


****
##### To make recommendation: given a user id we want to find similar users to this user in order to recommend items. Similarity are found by computing the dot product of characteristics of the user with its transpose, the more the result of an user-user pair is high, the more they have in common.

In [29]:
df_user_similarity = user_item_df.reset_index().replace(np.nan, 0)
def prep_get_similar_user():
    user_content = np.array(df_user_similarity.iloc[:,1:])
    user_content_transpose = np.transpose(user_content)
    dot_prod = user_content.dot(user_content_transpose)
    return dot_prod

dot_product_matrix_user = prep_get_similar_user()

##### Prepare a function to display results

In [30]:
def display_recommendations(rec_ids, rec_names, message, rec_ids_users, rec_user_articles):
    
    if type(rec_ids) == type(None):
        print(f"{message}")
    
    else:
        dict_id_name = dict(zip(rec_ids, rec_names))
        
        if type(rec_ids_users) != type(None):
            print('Matrix Factorisation SVD:')
            print(f"\t{message}")
            
            for key, val  in dict_id_name.items():
                print(f"\t- ID items: {key}")
                print(f"\tName: {val}\n")

            print('CF User Based:')
            print('\tUser that are similar to you also seen:\n')
            for i in rec_user_articles[:5]:
                print(f"\t- {i}")
        else:
            print(f"\t{message}")
            dict_id_name = dict(zip(rec_ids, rec_names))
            for key, val  in dict_id_name.items():
                print(f"\t- ID items: {key}")
                print(f"\tName: {val}\n")

***
#### Existing user

- Because it is an existing user, recommendations are made using FunkSVD (matrix factorisation), it will predict the rating it will give to all items and get back the items associate with the the top predicted rate. The dot product matrix is used here in order to find what other users similar to this user have seen.

In [31]:
rec_ids, rec_names, message, rec_ids_users, rec_user_articles = rec.make_recommendations(_id=3,
                                                                                         dot_prod_user= dot_product_matrix_user,
                                                                                         tfidf_matrix=tfidf_matrix,
                                                                                         _id_type='user',
                                                                                         rec_num=5)
display_recommendations(rec_ids, rec_names, message, rec_ids_users, rec_user_articles)

Matrix Factorisation SVD:
	Glad to see you again! recommended for you:

	- ID items: 50
	Name: Graph-based machine learning

	- ID items: 437
	Name: IBM Watson Machine Learning: Get Started

	- ID items: 232
	Name: Self-service data preparation with IBM Data Refinery

	- ID items: 534
	Name: dplyr 0.5.0

	- ID items: 221
	Name: How smart catalogs can turn the big data flood into an ocean of opportunity

CF User Based:
	User that are similar to you also seen:

	- upload files to ibm data science experience using the command line


***
#### New User

- Because it is new user, recommendations are given using ranked based method, which simply return back the most popular items according to the ratings given by users, the number of ratings, the recency of the ratings. The dot_product_matrix will not be used but is requiered.

In [32]:
rec_ids, rec_names, message, rec_ids_users, rec_user_articles = rec.make_recommendations(_id=8000,
                                                                                         dot_prod_user= dot_product_matrix_user,
                                                                                         tfidf_matrix=tfidf_matrix,
                                                                                         _id_type='user',
                                                                                         rec_num=5)

print('Ranked Based:')
display_recommendations(rec_ids, rec_names, message, rec_ids_users, rec_user_articles)

Ranked Based:
	Hey, you are new here, this is for you:

	- ID items: 221
	Name: How smart catalogs can turn the big data flood into an ocean of opportunity

	- ID items: 50
	Name: Graph-based machine learning

	- ID items: 232
	Name: Self-service data preparation with IBM Data Refinery

	- ID items: 43
	Name: Deep Learning With Tensorflow Course by Big Data University

	- ID items: 732
	Name: Rapidly build Machine Learning flows with DSX



****
#### Existing items

- Here we enter an item id and would like to find similar items using the tfidf matrice you've computed before.

In [33]:
rec_ids, rec_names, message, rec_ids_users, rec_user_articles = rec.make_recommendations(_id=593,
                                                                                         dot_prod_user= dot_product_matrix_user,
                                                                                         tfidf_matrix=tfidf_matrix,
                                                                                         _id_type='item',
                                                                                         rec_num=5)
print('Content Based:')
display_recommendations(rec_ids, rec_names, message, rec_ids_users, rec_user_articles)

Content Based:
	Similar items for id:593, corresponding to Upload Files to IBM Data Science Experience Using the Command Line:

	- ID items: 593
	Name: Upload Files to IBM Data Science Experience Using the Command Line

	- ID items: 600
	Name: Access IBM Analytics for Apache Spark from RStudio

	- ID items: 809
	Name: Use the Machine Learning Library

	- ID items: 161
	Name: Use the Machine Learning Library in Spark

	- ID items: 893
	Name: Use the Machine Learning Library in IBM Analytics for Apache Spark



#### Simulate an error by passing a non existing item id

In [34]:
rec_ids, rec_names, message, rec_ids_users, rec_user_articles = rec.make_recommendations(_id=1087630,
                                                                                         dot_prod_user= dot_product_matrix_user,
                                                                                         tfidf_matrix=tfidf_matrix,
                                                                                         _id_type='item',
                                                                                         rec_num=5)


display_recommendations(rec_ids, rec_names, message, rec_ids_users, rec_user_articles)

We can't make recommendation for this item, please makesure the data was updated with this item.



##### Imagine that we have an article that we want to promote, but we want the top 10 users who may interested by this offer, the item id is 984.

In [35]:
user_item = user_item_df
def may_interested_by(item_id, top_n=10):
    pred = {}
    # iterate over each users and predict the rate it will give to this movie
    for user in user_item.index:
        pred[user] = rec.predict_rating(user_id=user, item_id=item_id)

    top_10_pairs = sorted(pred.items(), key=lambda x: x[1], reverse=True)[:top_n]
    top_10_user_ids = []

    for i in top_10_pairs:
        top_10_user_ids.append(i[0])

    return top_10_user_ids

print('These users may interested by this item:')
for i in may_interested_by(984, 10):
    print(f"- {i}")

These users may interested by this item:
- 4165
- 4184
- 674
- 2820
- 3652
- 1186
- 4255
- 3061
- 3331
- 182
