In [1]:
import pandas as pd
import numpy as np
import re
import recommender as r
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
nlp = spacy.load('en_core_web_sm')

%matplotlib inline

df = pd.read_csv('user-item-interactions.csv')
df_content = pd.read_csv('articles_community.csv')
del df['Unnamed: 0']
del df_content['Unnamed: 0']

df = df.fillna('no_email')

pre_data = dict(df.groupby(['email', 'article_id'])['article_id'].count())

list_email = []
list_article_id = []
list_nb_interactions = []

for key, val in pre_data.items():
    list_email.append(key[0])
    list_article_id.append(key[1])
    list_nb_interactions.append(val)
    
zipped_list = list(zip(list_email, list_article_id, list_nb_interactions))
interaction_user = pd.DataFrame(zipped_list, columns=['email','article_id','nb_interactions'])

df = pd.merge(df, interaction_user, on=['email', 'article_id'])

df_content = df_content.drop(labels='doc_status', axis=1)
df_content = df_content.drop_duplicates()

df = df.replace('no_email', np.nan)

def email_mapper():
    coded_dict = dict()
    cter = 1
    email_encoded = []
    
    for val in df['email']:
        if val not in coded_dict:
            coded_dict[val] = cter
            cter+=1
        
        email_encoded.append(coded_dict[val])
    return email_encoded

email_encoded = email_mapper()
del df['email']
df['user_id'] = email_encoded

dict_views = dict(df.article_id.value_counts())
df['popularity_article_nb_views'] = df.article_id.map(dict_views)


dict_interactions_total = dict(df.groupby('user_id')['article_id'].count())
df['nb_interactions_total'] = df.user_id.map(dict_interactions_total)

df['importance_article'] = df.nb_interactions / df.nb_interactions_total

# just for the module (later), I'll change the name doc_full_name to title
df_content['title'] = df_content['doc_full_name']
df_content = df_content.drop(labels='doc_full_name', axis=1)

# just for the module (later), I'll a column called date, set to 0
df['date'] = 0

IndentationError: unindent does not match any outer indentation level (recommender.py, line 96)

In [None]:
df.head()

In [None]:
df_content.head()

#### Engineering the feature Using NLP

In [2]:
def nlp_task(data):
    sent = data
    doc=nlp(sent)
    sub_toks = [tok for tok in doc if ((tok.dep_ == "compound") or (tok.dep_ == "dobj") or (tok.dep_ == "pobj")) ]
    insight_words = [str(i).lower() for i in sub_toks]
    return insight_words

df_content['insights'] = df_content.title.apply(nlp_task)
df_content.head()

NameError: name 'df_content' is not defined

##### Fill df_content with columns of insight

In [5]:
#concat all words
insight_sum = np.sum(df_content['insights'])

#normalize
insight_sum = [i.lower() for i in insight_sum]

#remove dupplicates
list_unique_insight = list(set(insight_sum))

#remove all non character from the list
regex = re.compile('[^a-zA-Z]')
list_unique_insight_clean = [regex.sub('non_charac', i) for i in list_unique_insight]
list_unique_insight_clean = set(list_unique_insight_clean)
list_unique_insight_clean.remove('non_charac')

#create a dataframe with the list as columns
df_temporary = pd.DataFrame(columns=list_unique_insight_clean)

df_content = pd.merge(df_content, df_temporary, how='outer', left_index=True, right_index=True)
del df_temporary

df_content.head()

Unnamed: 0,doc_body,doc_description,article_id,title,insights_x,long,services,loops,cachemachine,short,...,graph,hbnon_charac,dimensionality,dynamodb,alexa,pipe,key,skills,plug,video
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...,"[detect, malfunctioning, iot, streaming, analy...",,,,,,...,,,,,,,,,,
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...,"[data, science, work]",,,,,,...,,,,,,,,,,
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,2,"This Week in Data Science (April 18, 2017)","[data, science]",,,,,,...,,,,,,,,,,
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,3,DataLayer Conference: Boost the performance of...,"[datalayer, performance, database]",,,,,,...,,,,,,,,,,
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,4,Analyze NY Restaurant data using Spark in DSX,"[analyze, ny, restaurant, spark, dsx]",,,,,,...,,,,,,,,,,


##### fill the columns added with 1 or 0 if the insight is present in the row

In [6]:
# iterate over the columns added and fill value
for i in list_unique_insight_clean:
    df_content[i] = df_content.insights_x.apply(lambda x: 1 if i in x else 0)

In [7]:
df_content.head(2)

Unnamed: 0,doc_body,doc_description,article_id,title,insights_x,long,services,loops,cachemachine,short,...,hbnon_charac,dimensionality,dynamodb,alexa,pipe,key,skills,plug,video,insights
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,0,Detect Malfunctioning IoT Sensors with Streami...,"[detect, malfunctioning, iot, streaming, analy...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",1,Communicating data science: A guide to present...,"[data, science, work]",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Remove some columns
df_content = df_content.drop(labels='insights_x', axis=1)
df = df.drop(labels=['nb_interactions', 'popularity_article_nb_views', 'nb_interactions_total'], axis=1)

#### Recommendations

In [9]:
rec = r.Recommender(df_items=df_content,
                    df_reviews=df,
                    item_name_colname='title',
                    user_id_colname='user_id',
                    item_id_colname='article_id',
                    rating_col_name='importance_article',
                    date_col_name='date')

In [10]:
import pickle
user_item_matrix = pd.read_pickle('user_item_matrix.p')

In [11]:
rec.fit(iters=10, ui_given=True, user_item=user_item_matrix)

AttributeError: 'Recommender' object has no attribute 'user_item_df'

In [156]:
print('prepare dot matrice for recommendations...')
def prep_get_similar_items():
    item_content = np.array(df_content.iloc[:,4:])
    item_content_transpose = np.transpose(item_content)
    dot_prod = item_content.dot(item_content_transpose)
    return dot_prod

dot_product_matrix = prep_get_similar_items()
print('Ok')

prepare dot matrice for recommendations...
Ok


In [157]:
def display_recommendations(rec_ids, rec_names, message):
    dict_id_name = dict(zip(rec_ids, rec_names))

    print(f"{message}")

    for key, val  in dict_id_name.items():
        print(f"ID items: {key}")
        print(f"Name: {val}\n")

***
Existing user

In [158]:
rec_ids, rec_names, message = rec.make_recommendations(_id=1,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='user',
                                                       rec_num=5)

In [159]:
display_recommendations(rec_ids, rec_names, message)

Glad to see you again! recommended for you:

ID items: 420.0
Name: CACHE Table in Apache Spark SQL

ID items: 1080.0
Name: Create a project for Watson Machine Learning in DSX

ID items: 655.0
Name: Machine Learning for everyone



***
New User

In [160]:
rec_ids, rec_names, message = rec.make_recommendations(_id=8000,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='user',
                                                       rec_num=5)

In [161]:
display_recommendations(rec_ids, rec_names, message)

Hey, you are new here, this is for you:

ID items: 256.0
Name: Apache Spark @Scale: A 60 TB+ production use case

ID items: 233.0
Name: Improving quality of life with Spark-empowered machine learning

ID items: 277.0
Name: Bayesian Nonparametric Models – Stats and Bots

ID items: 96.0
Name: xml2 1.0.0

ID items: 77.0
Name: Work with Data Connections in DSX



****
Existing movies (find similar)

In [162]:
rec_ids, rec_names, message = rec.make_recommendations(_id=12,
                                                       dot_prod=dot_product_matrix,
                                                       _id_type='item',
                                                       rec_num=5)

In [163]:
display_recommendations(rec_ids, rec_names, message)

Similar items for id:12, corresponding to Timeseries Data Analysis of IoT events by using Jupyter Notebook:



In [164]:
rec.user_item_df.shape

(5149, 714)