In [1]:
import pandas as pd
import numpy as np
import re
import recommender as r
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
nlp = spacy.load('en_core_web_sm')

%matplotlib inline

df = pd.read_csv('user-item-interactions.csv')
df_content = pd.read_csv('articles_community.csv')
del df['Unnamed: 0']
del df_content['Unnamed: 0']

df = df.fillna('no_email')

# make sure all articles from df are in df_content
df = pd.merge(df, df_content[['article_id']], on='article_id', how='inner')

pre_data = dict(df.groupby(['email', 'article_id'])['article_id'].count())

list_email = []
list_article_id = []
list_nb_interactions = []

for key, val in pre_data.items():
    list_email.append(key[0])
    list_article_id.append(key[1])
    list_nb_interactions.append(val)
    
zipped_list = list(zip(list_email, list_article_id, list_nb_interactions))
interaction_user = pd.DataFrame(zipped_list, columns=['email','article_id','nb_interactions'])

df = pd.merge(df, interaction_user, on=['email', 'article_id'])

df_content = df_content.drop(labels='doc_status', axis=1)
df_content = df_content.drop_duplicates()

df = df.replace('no_email', np.nan)

def email_mapper():
    coded_dict = dict()
    cter = 1
    email_encoded = []
    
    for val in df['email']:
        if val not in coded_dict:
            coded_dict[val] = cter
            cter+=1
        
        email_encoded.append(coded_dict[val])
    return email_encoded

email_encoded = email_mapper()
del df['email']
df['user_id'] = email_encoded

#dict_views = dict(df.article_id.value_counts())
#df['popularity_article_nb_views'] = df.article_id.map(dict_views)


dict_interactions_total = dict(df.groupby('user_id')['article_id'].count())
df['nb_interactions_total'] = df.user_id.map(dict_interactions_total)

df['importance_article'] = np.where(df.nb_interactions_total > 19,
                                    df.nb_interactions/df.nb_interactions_total,
                                    np.nan)

# just for the module (later), I'll change the name doc_full_name to title
df_content['title'] = df_content['doc_full_name']
df_content = df_content.drop(labels='doc_full_name', axis=1)

# just for the module (later), I'll a column called date, set to 0
df['date'] = 0

df.article_id = df.article_id.astype('int64')
df_content.article_id = df_content.article_id.astype('int64')

In [2]:
df.head()

Unnamed: 0,article_id,title,nb_interactions,user_id,nb_interactions_total,importance_article,date
0,593,upload files to ibm data science experience us...,1,1,40,0.025,0
1,593,upload files to ibm data science experience us...,1,2,11,,0
2,593,upload files to ibm data science experience us...,2,3,3,,0
3,593,upload files to ibm data science experience us...,2,3,3,,0
4,593,upload files to ibm data science experience us...,1,4,1,,0


In [3]:
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)"
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX


#### Engineering the feature Using NLP

In [4]:
def nlp_task(data):
    sent = data
    doc=nlp(sent)
    sub_toks = [tok for tok in doc if ((tok.dep_ == "compound") or (tok.dep_ == "dobj") or (tok.dep_ == "pobj")) ]
    insight_words = [str(i).lower() for i in sub_toks]
    insight_words = ','.join(insight_words)
    return insight_words

df_content['insights'] = df_content.title.apply(nlp_task)
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title,insights
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...,"detect,malfunctioning,iot,streaming,analytics"
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...,"data,science,work"
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)","data,science"
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...,"datalayer,performance,database"
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX,"analyze,ny,restaurant,spark,dsx"


##### Create list of insights

In [5]:
insights_list = []

for i in df_content.insights:
    try:
        insights_list.extend(i.split(','))
    except AttributeError:
        pass

#normalize
insights_list = [i.lower() for i in insights_list]

#remove all non character from the list
regex = re.compile('[^a-zA-Z]')
insights_list = [regex.sub('non_charac', i) for i in insights_list]
insights_list = set(insights_list)
insights_list.remove('non_charac')

##### Fill df_content with columns of insight

In [6]:
def split_insights(val):
    try:
        if val.find(insight) > -1:
            return 1
        else:
            return 0
    except AttributeError:
        return 0
    
for insight in insights_list:
    df_content[insight] = df_content['insights'].apply(split_insights)

In [7]:
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title,insights,cloudflare,Unnamed: 7,prem,ods,zapier,...,study,envoy,flight,impressive,containers,documents,leverage,amazing,engagement,entity
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)",0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Remove some columns
df_content = df_content.drop(labels=['insights', 'doc_body', 'doc_description', ''], axis=1)
df = df.drop(labels=['nb_interactions', 'nb_interactions_total'], axis=1)

#### Recommendations

In [9]:
rec = r.Recommender(df_items=df_content,
                    df_reviews=df,
                    item_name_colname='title',
                    user_id_colname='user_id',
                    item_id_colname='article_id',
                    rating_col_name='importance_article',
                    date_col_name='date')

In [10]:
rec.fit(iters=10, latent_features=10)

Create User-Item matrix...
Train data with Funk Sigular Value Decomposition...
Iterations 		 Mean Squared Error 
	1 		 6.398577980221514 
	2 		 6.05508220795138 
	3 		 5.740668637572722 
	4 		 5.4520068695821555 
	5 		 5.186242355343206 
	6 		 4.940915639852629 
	7 		 4.713897355547736 
	8 		 4.50333551861026 
	9 		 4.307612512021063 
	10 		 4.1253097535685574 


In [11]:
print('prepare dot matrice for recommendations...')

def prep_get_similar_items():
    item_content = np.array(df_content.iloc[:,2:])
    item_content_transpose = np.transpose(item_content)
    dot_prod = item_content.dot(item_content_transpose)
    return dot_prod

dot_product_matrix = prep_get_similar_items()
print('Ok')

prepare dot matrice for recommendations...
Ok


In [12]:
def display_recommendations(rec_ids, rec_names, message):
    dict_id_name = dict(zip(rec_ids, rec_names))

    print(f"{message}")

    for key, val  in dict_id_name.items():
        print(f"ID items: {key}")
        print(f"Name: {val}\n")

***
Existing user

In [13]:
rec_ids, rec_names, message = rec.make_recommendations(_id=1,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='user',
                                                       rec_num=5)

In [14]:
display_recommendations(rec_ids, rec_names, message)

Glad to see you again! recommended for you:

ID items: 36
Name: Data visualization playbook: The right level of detail

ID items: 195
Name: Advancements in the Spark Community

ID items: 399
Name: Artificial Intelligence, Ethically Speaking – Inside Machine learning – Medium

ID items: 58
Name: Predicting The 2016 US Presidential Election

ID items: 727
Name: From Python Nested Lists to Multidimensional numpy Arrays



***
New User

In [15]:
rec_ids, rec_names, message = rec.make_recommendations(_id=8000,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='user',
                                                       rec_num=5)

In [16]:
display_recommendations(rec_ids, rec_names, message)

Hey, you are new here, this is for you:

ID items: 251
Name: Data science expert interview: Dez Blanchfield, Craig Brown, David Mathison, Jennifer Shin and Mike Tamir part 2

ID items: 1018
Name: Clustering: A Guide for the Perplexed

ID items: 977
Name: Rapidly build Machine Learning flows with DSX

ID items: 366
Name: Apache Spark as the New Engine of Genomics

ID items: 732
Name: 7 types of job profiles that makes you a Data Scientist



****
Existing items (find similar)

In [34]:
rec_ids, rec_names, message = rec.make_recommendations(_id=100,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='item',
                                                       rec_num=5, window=1)

In [35]:
display_recommendations(rec_ids, rec_names, message)

Similar items for id:100, corresponding to Use data assets in a project using IBM Data Catalog:

ID items: 0
Name: Detect Malfunctioning IoT Sensors with Streaming Analytics

ID items: 5
Name: Browsing PostgreSQL Data with Compose

ID items: 6
Name: Upgrading your PostgreSQL to 9.5

ID items: 11
Name: Warehousing GeoJSON documents

ID items: 15
Name: Apache Spark™ 2.0: Extend Structured Streaming for Spark ML

