In [1]:
import pandas as pd

In [2]:
import numpy as np
import scipy
import math
import random

In [3]:
import importlib

In [4]:
import utils as u



In [5]:
import sklearn
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

In [6]:
from gensim import models

In [7]:
import Recommenders as Rd

In [8]:
importlib.reload(Rd)

<module 'Recommenders' from 'C:\\Users\\HP\\Anaconda3\\Scripts\\Natural-Language-Processing\\Recommender-systems\\cleaned-version\\Recommenders.py'>

import matplotlib.pyplot as plt

## Explore Interaction Events Data

### -----------------------------------------------------------------------------------------------------------------------------------
Contains logs of user interactions on shared articles. It can be joined to articles_shared.csv by contentId column.

The eventType values are:

    VIEW: The user has opened the article.
    LIKE: The user has liked the article.
    COMMENT CREATED: The user created a comment in the article.
    FOLLOW: The user chose to be notified on any new comment in the article.
    BOOKMARK: The user has bookmarked the article for easy return in the future.
### -----------------------------------------------------------------------------------------------------------------------------------

In [9]:
interactions_df = pd.read_csv('users_interactions.csv')

In [10]:
articles_df = pd.read_csv('shared_articles.csv')

### Data Muning

### -----------------------------------------------------------------------------------------------------------------------------------
    Convert Event from categories to event strength :
    we associate them with a weight or strength, assuming that, for example, a comment in an article indicates a higher interest
    of the user on the item than a like, or than a simple view
### -----------------------------------------------------------------------------------------------------------------------------------

In [15]:
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}
# 

In [16]:
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']

In [17]:

interactions_df['eventStrength'] = interactions_df['eventType'].apply(lambda x: event_type_strength[x])

In [18]:
df_inter = interactions_df[['contentId','personId','eventStrength']]

In [19]:
df_acrticles = articles_df[['contentId','title','text']]

In [20]:
merged = pd.merge(df_inter, df_acrticles,how='inner', on=['contentId'])

### Score Items popularity over user interactions

###### Just a test to show how to compute the popularity of an item based on the user interactions

In [21]:
articles_grouped = merged.groupby(['contentId']).agg({'eventStrength': 'count'}).reset_index()

In [22]:
grouped_sum = articles_grouped['eventStrength'].sum() # Get the sum of all counts
articles_grouped['percentage']  = articles_grouped['eventStrength'].div(grouped_sum)*100 # Score each articles
articles_grouped.sort_values(['eventStrength', 'contentId'], ascending = [0,1]) ; articles_grouped.head()

Unnamed: 0,contentId,eventStrength,percentage
0,-9222795471790223670,26,0.035977
1,-9216926795620865886,21,0.029058
2,-9194572880052200111,29,0.040128
3,-9192549002213406534,56,0.077488
4,-9190737901804729417,9,0.012453


### Number of Users and items 

In [23]:
users = merged['personId'].unique() ; print("# of users" , len(users))

# of users 1895


In [24]:
items = merged['contentId'].unique() ; print("# of items" , len(items))

# of items 2979


In [25]:
train_data, test_data = train_test_split(merged, test_size = 0.20, random_state=0)

# ------------------------------ The recommendation starts here ---------------------------------

### Recommend by Popularity Concept 

Recommend the items having more users-intercation

In [26]:
pm = Rd.popularity_recommender_py()
pm.create(train_data, 'personId', 'contentId')

In [27]:
user_id = users[5]
rec_list = pm.recommend(user_id) 
described = pd.merge(rec_list, df_acrticles,how='inner', on=['contentId']) ; described.head(10)

Unnamed: 0,personId,contentId,score,Rank,title,text
0,-3596626804281480007,-4029704725707465084,340,1.0,Former Google career coach shares a visual tri...,"If you want 2017 to be an exciting year, desig..."
1,-3596626804281480007,-133139342397538859,239,2.0,"Novo workaholic trabalha, pratica esportes e t...",Novo workaholic não abre mão do esporte e da f...
2,-3596626804281480007,8657408509986329668,237,3.0,Pull request first - Practical Blend,Pull request first After two years of working ...
3,-3596626804281480007,-6843047699859121724,230,4.0,"Ganhe 6 meses de acesso ao Pluralsight, maior ...","Ganhe 6 meses de acesso ao Pluralsight, maior ..."
4,-3596626804281480007,-6783772548752091658,229,5.0,Livro: Retrospectivas Divertidas,"Neste livro, nós fornecemos um conjunto de fer..."
5,-3596626804281480007,2857117417189640073,209,6.0,Running GV sprints inside corporates - learn f...,Running GV sprints inside corporates - learn f...
6,-3596626804281480007,-8208801367848627943,208,7.0,Ray Kurzweil: The world isn't getting worse - ...,"Ray Kurzweil, the author, inventor, computer s..."
7,-3596626804281480007,-2358756719610361882,208,8.0,Custo do Erro - Cinco motivos para investir em...,"Atualmente, o custo de manutenção de software ..."
8,-3596626804281480007,-1297580205670251233,206,9.0,A minha viagem à Maternidade #tetodomundo,"Já fazia uma semana, desde o dia 26 de dezembr..."
9,-3596626804281480007,3367026768872537336,202,10.0,Seja esperto no trabalho: Melhore a comunicaçã...,Seja Esperto no Trabalho: Melhore a Comunicaçã...


### Similarity -  Based Recommender (co-occurence matrix approach )

### Basic Steps of the recommender building : ----------------------------------------------------------------------------------------
    we compute the co-occurence matrix , number of times item x and item y are interacted by the same user : 
    - then we construct a matrix of size (n_of_user_items)x(all_items) 
    - we take the average of each column to compute relatedness scores of all items with user items matrix will be of size  (1,all_items)
    - Then we take the top 10 Best scores
### ------------------------------------------------------------------------------------------------------------------------------------------------------

In [28]:
is_model = Rd.item_similarity_recommender_py()
is_model.create(train_data, 'personId', 'contentId')

In [29]:
#Print the songs for the user in training data
user_id = users[5]
user_items = is_model.get_user_items(user_id)

#Recommend songs for the user using personalized model
rec_1 = is_model.recommend(user_id)

No. of unique songs for the user: 359
no. of unique songs in the training set: 2940
Non zero values in cooccurence_matrix :515958


In [30]:
rec_1

Unnamed: 0,personId,contentId,score,rank
0,-3.596627e+18,7.395436e+18,0.11336,1.0
1,-3.596627e+18,-5.778989e+18,0.10292,2.0
2,-3.596627e+18,-9.033212e+18,0.101333,3.0
3,-3.596627e+18,2.625884e+18,0.101162,4.0
4,-3.596627e+18,-2.118981e+18,0.098984,5.0
5,-3.596627e+18,-2.423593e+18,0.097792,6.0
6,-3.596627e+18,4.843782e+18,0.096652,7.0
7,-3.596627e+18,-1.55617e+18,0.094999,8.0
8,-3.596627e+18,5.237574e+18,0.09492,9.0
9,-3.596627e+18,5.314107e+18,0.094442,10.0


## Content Based Recommender-system

##### Cosine measure based similarity Extraction 

In [31]:
stopwords_list = stopwords.words('english') + stopwords.words('portuguese')

In [32]:
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

In [33]:
tfidf_matrix = vectorizer.fit_transform(train_data['title'] + "" + train_data['text'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [34]:
tfidf_matrix.shape

(57815, 5000)

#### Test the model Content_based_Recommendation with TF-IDF

### Basic Steps of the recommender building : ----------------------------------------------------------------------------------------
    1 - we compute the tfidf_matrix from articles contents ( 'title' and 'text' ) : the frequency of a word in a doc we get a matrix with size (n_vocab , n_features = 5000 ) 
    2 - we compute the features of each article in the data-set we got a matrix (  n_articles , n_features )
    3 - Similarly , we compute the features of each article in the user items ( n_user_articles , n_features )
    4 - Compute the cosine similarity betwen user_features and all_articles_features we got a matrix of (n_user_articles , n_artilces)
    5 - Sum over user_articles ( we could weight this summation by the eventStregnth ( compute the interest of user to the item by summing its associated eventStrength to a given item ) this value may be used to rise score of interesting items but it still optional
    6 - we got score of all_items (1,n_articles) we got the then the best 20 items with best scores
### ------------------------------------------------------------------------------------------------------------------------------------------------------

In [35]:
importlib.reload(Rd)

<module 'Recommenders' from 'C:\\Users\\HP\\Anaconda3\\Scripts\\Natural-Language-Processing\\Recommender-systems\\cleaned-version\\Recommenders.py'>

In [36]:
cont_rec = Rd.content_relatedness_recommender_py()
cont_rec.create(train_data, 'personId', 'contentId')

In [37]:
cont_rec.set_matrix(tfidf_matrix)

In [38]:
df = cont_rec.recommend(user_id,20)

Getting Similar items ...
Done !


In [135]:
df

Unnamed: 0,title,contentId,score
49326,12 JavaScript Hacks,-1022885988494278200,0.46642
18143,Johnson & Johnson compra líder global em dermo...,2817960273225808857,0.46642
10192,Google lança teclado virtual para iPhone com r...,-5941854046477982982,0.46642
29299,Unknown NEO GEO MVS Fighting Game Discovered &...,-9007594455502730692,0.461462
69625,Hedron: Or my dissatisfaction with local devel...,8905656922810620012,0.461462
62827,Drupal 8 CI/CD with Docker via Jenkins. Part 1...,-91139000311163410,0.461462
48160,Empresas de tecnologia preparam mais demissões...,6873454157736399518,0.454523
19186,System Code Geeks are giving away a FREE Subli...,-986193709330758766,0.413921
71783,Concrete things you can do about your technica...,-5605799891597699962,0.409897
13407,Scalable Microservices with Kubernetes | Udacity,8963770574956550187,0.392571


### Test the model Content_based_Recommendation with Doc2vec

##### ----- Build the recommender 

### Basic Steps of the recommender building : ----------------------------------------------------------------------------------------
    It has the same principle as tfidf_features recommender here we just use an other feature given by doc2vec
   
    1 - we train the doc2vec model from articles contents ( 'title' and 'text' ) we got a matrix of (n_articles , n_features = 1000)
    2 - Compute the cosine similarity betwen user_features and all_articles_features we got a matrix of (n_user_articles , n_artilces)
    3 - Sum over user_articles ( we could weight this summation by the eventStregnth ( compute the interest of user to the item by summing its associated eventStrength to a given item ) this value may be used to rise score of interesting items but it still optional
    4 - we got score of all_items (1,n_articles) we got the then the best 20 items with best scores
### ------------------------------------------------------------------------------------------------------------------------------------------------------

In [81]:
importlib.reload(Rd)

<module 'Recommenders' from 'C:\\Users\\HP\\Anaconda3\\Scripts\\Natural-Language-Processing\\Recommender-systems\\Recommenders.py'>

In [82]:
doc_model = Rd.content_relatedness_doc2vec_py()

In [83]:
doc_model.create(train_data, 'personId', 'contentId')

###### ------Load the pre-trained Doc2Vec model

In [78]:
model = models.Doc2Vec.load("doc2vec_article_1000")

In [79]:
mat = doc_model.construct_matrix(model,user_id)

Start Building the similarity matrix
Done !


In [84]:
   # Run this code only if .construct_matrix() is not runned
doc_model.set_cosine_proximity(mat)
unique_train_data = train_data.drop_duplicates('contentId')
doc_model.doc_tag = unique_train_data['contentId'].tolist()

In [85]:
rec = doc_model.recommend(user_id,20) ; rec

Getting Similar items ...
Done !


Unnamed: 0,title,contentId,score
68367,"Docker - Build, Ship, and Run Any App, Anywhere",-1999468346928419252,0.54436
37815,Design Better Forms - uxdesign.cc - User Exper...,9079880752026843473,0.531616
49354,Gartner Reprint,-3959242148361340089,0.530856
66812,LPIC-OT DevOps Engineer | Linux Professional I...,-1572252285162838958,0.525188
20384,Google isn't abandoning Hangouts for its new c...,6045455576279706291,0.524121
33541,Bose's New Speaker Only Costs $150 But You Hav...,4485571547865864859,0.523987
18143,Johnson & Johnson compra líder global em dermo...,2817960273225808857,0.521496
21995,Senado lança enquete para saber se você é cont...,-4613284400780388067,0.520205
29058,Why It's Time to Take Google's PC Operating Sy...,8962537427807366481,0.517117
26476,Bolt teria recebido uns R$ 12 milhões por camp...,5462702203586546682,0.516342
