In [92]:
import pandas as pd
import numpy as np
import re
import recommender as r
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
nlp = spacy.load('en_core_web_sm')

%matplotlib inline

df = pd.read_csv('user-item-interactions.csv')
df_content = pd.read_csv('articles_community.csv')
del df['Unnamed: 0']
del df_content['Unnamed: 0']

df = df.fillna('no_email')

pre_data = dict(df.groupby(['email', 'article_id'])['article_id'].count())

list_email = []
list_article_id = []
list_nb_interactions = []

for key, val in pre_data.items():
    list_email.append(key[0])
    list_article_id.append(key[1])
    list_nb_interactions.append(val)
    
zipped_list = list(zip(list_email, list_article_id, list_nb_interactions))
interaction_user = pd.DataFrame(zipped_list, columns=['email','article_id','nb_interactions'])

df = pd.merge(df, interaction_user, on=['email', 'article_id'])

df_content = df_content.drop(labels='doc_status', axis=1)
df_content = df_content.drop_duplicates()

df = df.replace('no_email', np.nan)

def email_mapper():
    coded_dict = dict()
    cter = 1
    email_encoded = []
    
    for val in df['email']:
        if val not in coded_dict:
            coded_dict[val] = cter
            cter+=1
        
        email_encoded.append(coded_dict[val])
    return email_encoded

email_encoded = email_mapper()
del df['email']
df['user_id'] = email_encoded

#dict_views = dict(df.article_id.value_counts())
#df['popularity_article_nb_views'] = df.article_id.map(dict_views)


dict_interactions_total = dict(df.groupby('user_id')['article_id'].count())
df['nb_interactions_total'] = df.user_id.map(dict_interactions_total)

df['importance_article'] = np.where(df.nb_interactions_total > 19,
                                    df.nb_interactions/df.nb_interactions_total,
                                    np.nan)

# just for the module (later), I'll change the name doc_full_name to title
df_content['title'] = df_content['doc_full_name']
df_content = df_content.drop(labels='doc_full_name', axis=1)

# just for the module (later), I'll a column called date, set to 0
df['date'] = 0

In [93]:
df.head()

Unnamed: 0,article_id,title,nb_interactions,user_id,nb_interactions_total,importance_article,date
0,1430.0,"using pixiedust for fast, flexible, and easier...",2,1,47,0.042553,0
1,1430.0,"using pixiedust for fast, flexible, and easier...",2,1,47,0.042553,0
2,1314.0,healthcare python streaming application demo,1,2,6,,0
3,1429.0,use deep learning for image classification,25,3,82,0.304878,0
4,1429.0,use deep learning for image classification,25,3,82,0.304878,0


In [94]:
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)"
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX


#### Engineering the feature Using NLP

In [95]:
def nlp_task(data):
    sent = data
    doc=nlp(sent)
    sub_toks = [tok for tok in doc if ((tok.dep_ == "compound") or (tok.dep_ == "dobj") or (tok.dep_ == "pobj")) ]
    insight_words = [str(i).lower() for i in sub_toks]
    return insight_words

df_content['insights'] = df_content.title.apply(nlp_task)
df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title,insights
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...,"[detect, malfunctioning, iot, streaming, analy..."
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...,"[data, science, work]"
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)","[data, science]"
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...,"[datalayer, performance, database]"
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX,"[analyze, ny, restaurant, spark, dsx]"


##### Fill df_content with columns of insight

In [96]:
#concat all words
insight_sum = np.sum(df_content['insights'])

#normalize
insight_sum = [i.lower() for i in insight_sum]

#remove dupplicates
list_unique_insight = list(set(insight_sum))

#remove all non character from the list
regex = re.compile('[^a-zA-Z]')
list_unique_insight_clean = [regex.sub('non_charac', i) for i in list_unique_insight]
list_unique_insight_clean = set(list_unique_insight_clean)
list_unique_insight_clean.remove('non_charac')

#create a dataframe with the list as columns
df_temporary = pd.DataFrame(columns=list_unique_insight_clean)

df_content = pd.merge(df_content, df_temporary, how='outer', left_index=True, right_index=True)
del df_temporary

df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title,insights_x,insights_y,support,rogue,restaurant,alternative,...,shiny,use,rate,graph,know,projects,starcraft,palette,hybrid,workers
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...,"[detect, malfunctioning, iot, streaming, analy...",,,,,,...,,,,,,,,,,
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...,"[data, science, work]",,,,,,...,,,,,,,,,,
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)","[data, science]",,,,,,...,,,,,,,,,,
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...,"[datalayer, performance, database]",,,,,,...,,,,,,,,,,
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX,"[analyze, ny, restaurant, spark, dsx]",,,,,,...,,,,,,,,,,


##### fill the columns added with 1 or 0 if the insight is present in the row

In [97]:
# iterate over the columns added and fill value
for i in list_unique_insight_clean:
    df_content[i] = df_content.insights_x.apply(lambda x: 1 if i in x else 0)

df_content.head(2)

Unnamed: 0,doc_body,doc_description,article_id,title,insights_x,insights_y,support,rogue,restaurant,alternative,...,use,rate,graph,know,projects,starcraft,palette,hybrid,workers,insights
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...,"[detect, malfunctioning, iot, streaming, analy...",,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...,"[data, science, work]",,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
# Remove some columns
df_content = df_content.drop(labels=['insights_x', 'insights_y', 'doc_body', 'doc_description'], axis=1)
df = df.drop(labels=['nb_interactions', 'nb_interactions_total'], axis=1)

In [99]:
df.shape

(45993, 5)

In [100]:
df = pd.merge(df, df_content[['article_id']], on='article_id', how='inner')

df.head()

Unnamed: 0,article_id,title,user_id,importance_article,date
0,593.0,upload files to ibm data science experience us...,8,0.012195,0
1,593.0,upload files to ibm data science experience us...,22,0.027027,0
2,593.0,upload files to ibm data science experience us...,78,,0
3,593.0,upload files to ibm data science experience us...,78,,0
4,593.0,upload files to ibm data science experience us...,101,,0


#### Recommendations

In [105]:
rec = r.Recommender(df_items=df_content,
                    df_reviews=df,
                    item_name_colname='title',
                    user_id_colname='user_id',
                    item_id_colname='article_id',
                    rating_col_name='importance_article',
                    date_col_name='date')

In [106]:
rec.fit(iters=20)

Create User-Item matrix...
Train data with Funk Sigular Value Decomposition...
Iterations 		 Mean Squared Error 
	1 		 8.783801824742838 
	2 		 8.16268888860911 
	3 		 7.611825229176933 
	4 		 7.120540704123255 
	5 		 6.680167953995747 
	6 		 6.283600166161919 
	7 		 5.9249600418088555 
	8 		 5.599348844016479 
	9 		 5.302653936243963 
	10 		 5.031399598706146 
	11 		 4.7826302504701514 
	12 		 4.553818204635548 
	13 		 4.3427901865404905 
	14 		 4.14766833828146 
	15 		 3.9668225064782905 
	16 		 3.7988313909181017 
	17 		 3.642450705435542 
	18 		 3.496586928191558 
	19 		 3.3602755374806033 
	20 		 3.232662870215372 


In [107]:
print('prepare dot matrice for recommendations...')

def prep_get_similar_items():
    item_content = np.array(df_content.iloc[:,2:])
    item_content_transpose = np.transpose(item_content)
    dot_prod = item_content.dot(item_content_transpose)
    return dot_prod

dot_product_matrix = prep_get_similar_items()
print('Ok')

prepare dot matrice for recommendations...
Ok


In [108]:
def display_recommendations(rec_ids, rec_names, message):
    dict_id_name = dict(zip(rec_ids, rec_names))

    print(f"{message}")

    for key, val  in dict_id_name.items():
        print(f"ID items: {key}")
        print(f"Name: {val}\n")

***
Existing user

In [109]:
rec_ids, rec_names, message = rec.make_recommendations(_id=1,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='user',
                                                       rec_num=5)

In [110]:
display_recommendations(rec_ids, rec_names, message)

Glad to see you again! recommended for you:

ID items: 233.0
Name: Bayesian Nonparametric Models – Stats and Bots

ID items: 244.0
Name: Notebooks: A power tool for data scientists

ID items: 1004.0
Name: This Week in Data Science

ID items: 679.0
Name: Interactive time series with dygraphs

ID items: 785.0
Name: How to Get a Job In Deep Learning



***
New User

In [111]:
rec_ids, rec_names, message = rec.make_recommendations(_id=8000,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='user',
                                                       rec_num=5)

In [112]:
display_recommendations(rec_ids, rec_names, message)

Hey, you are new here, this is for you:

ID items: 251.0
Name: Using Notebooks with PixieDust for Fast, Flexible, and Easier Data Analysis and Experimentation

ID items: 686.0
Name: Data science expert interview: Dez Blanchfield, Craig Brown, David Mathison, Jennifer Shin and Mike Tamir part 2

ID items: 1018.0
Name: Score a Predictive Model Built with IBM SPSS Modeler, WML & DSX

ID items: 108.0
Name: How to use version control (GitHub) in RStudio within DSX?

ID items: 930.0
Name: 7 types of job profiles that makes you a Data Scientist



****
Existing items (find similar)

In [113]:
rec_ids, rec_names, message = rec.make_recommendations(_id=100,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='item',
                                                       rec_num=5)

In [114]:
display_recommendations(rec_ids, rec_names, message)

Similar items for id:100, corresponding to Use data assets in a project using IBM Data Catalog:

ID items: 100
Name: Use data assets in a project using IBM Data Catalog



In [118]:
def find_similar_items(item_id, user_item=rec.user_item_df.T):
    '''
    INPUT:
    user_id - (int) a user_id
    user_item - (pandas dataframe) matrix of users by articles: 
                1's when a user has interacted with an article, 0 otherwise
    
    OUTPUT:
    similar_users - (list) an ordered list where the closest users (largest dot product users)
                    are listed first
    
    Description:
    Computes the similarity of every pair of users based on the dot product
    Returns an ordered
    
    '''
    similarity = dict()
    
    # compute similarity of each user to the provided user
    n_item = user_item.shape[0]
    for i in range(1, n_item):
        similarity[i] = np.dot(user_item.loc[item_id], user_item.loc[i])

    # sort by similarity
    similarity = sorted(similarity.items(), key=lambda x: x[1], reverse=True)

    # create list of just the ids
    most_similar_items = [i[0] for i in similarity]
   
    # remove the own user's id
    most_similar_items = [i for i in most_similar_items if i != item_id]
       
    return most_similar_items # return a list of the users in order from most to least similar
        