## Recommender Systems

Students: Irene Cantero (U151206) / Jian Chen (U150279)

This notebook contains the 4 algorithms requested in the project statement, plus 1 algorithm chosen by us.

Content:

- Alternating Least Squares (ALS)
- Adamic-Adar
- Personalized PageRank
- Node2Vec
- Doc2Vec (chosen by us)

In [1]:
from search_engine.search_engine import SearchEngine
import networkx as nx
from networkx import Graph
from sklearn.model_selection import train_test_split
import implicit
import scipy.sparse as sparse
from scipy.sparse import csr_matrix
from fast_pagerank import pagerank
from fast_pagerank import pagerank_power
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import math
import igraph
import warnings
import csv
import argparse
import numpy as np
import networkx as nx
import node2vec
from gensim.models import Word2Vec
warnings.filterwarnings('ignore')

In [None]:
# Instantiate the project's SearchEngine; its `tweets` table is read by the cells below.
search_engine = SearchEngine()

Collection time: 0.0


In [None]:
#Create a graph where the vertices are the users that retweet (users u) and the retweeted users (users v)
#Each retweet adds one edge connecting user u to user v
g = igraph.Graph()
# Names already present in the graph. add_vertices() appends a new vertex on
# every call, so without this check a user who appears in several retweets
# would be duplicated as extra, unconnected vertices.
known_users = set()
for _, tweet in search_engine.tweets.iterrows():
    # Rows whose 'retweeted_status' is missing (NaN) are not retweets
    if str(tweet['retweeted_status']) == 'nan':
        continue
    u = tweet['user']['screen_name']
    v = tweet['retweeted_status']['user']['screen_name']
    for name in (u, v):
        if name not in known_users:
            known_users.add(name)
            g.add_vertices(name)
    # Vertices are referenced by name; repeated retweets add parallel edges
    g.add_edges([(u, v)])

In [None]:
#SELECT USER:
# If you change the user id, the recommendation of all 4 algorithms will try to satisfy that user.
user_id=0
# Name stored on that vertex; used in the headers printed by the cells below
user_name=g.vs[user_id]['name']

 ----------------------------------------------------------------------------------------------------------------------------

## ALS (Alternating Least Squares)

In [None]:
#Converts (user id, score) pairs into (user name, score) pairs
def number_to_username(alg_list: list, g) -> list:
    """Replace the vertex id of every entry with the vertex name stored in ``g``.

    Args:
        alg_list: sequence of entries whose first item is a vertex id
            (anything ``int()`` accepts) and whose second item is a score.
        g: graph whose ``vs[id]['name']`` gives the name of a vertex.

    Returns:
        A new list of ``(name, score)`` tuples in the same order as ``alg_list``.
    """
    return [(g.vs[int(entry[0])]['name'], entry[1]) for entry in alg_list]

#Returns the ALS recommendations for a given user_id
def recommend_users(user_id: int, G:csr_matrix, g: igraph.Graph, top: int = 10) -> list:
    """Ask the module-level ALS ``model`` for recommendations.

    Args:
        user_id: id of the user to recommend for.
        G: sparse matrix forwarded to ``model.recommend``.
        g: accepted but not used by this function.
        top: forwarded to ``model.recommend`` as its third positional argument.

    Returns:
        Whatever ``model.recommend(user_id, G, top)`` returns, unchanged.
    """
    return model.recommend(user_id, G, top)

In [None]:
#Build the adjacency matrix of the graph and wrap it in a csr_matrix,
#which is the variable type needed for doing ALS
G = csr_matrix(g.get_adjacency().data)

In [None]:
#Initialize the ALS model (10 latent factors, 5 iterations, training loss calculation enabled)
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=5, calculate_training_loss=True)
#Train the ALS model on the sparse adjacency matrix G
model.fit(G)

In [None]:
#Get the recommended user ids for the selected user
ALS_recomdetaion_ids=recommend_users(user_id, G, g)
#Transform the user ids to their respective names
ALS_recomdetaion_names=number_to_username(ALS_recomdetaion_ids, g)
print(f"Recomendations for user {user_name} with id {user_id} using ALS (Alternate Least Squares):")
#Bare expression: shown as the output of the notebook cell
ALS_recomdetaion_names

 ----------------------------------------------------------------------------------------------------------------------------

## Adamic-Adar

In [None]:
#Implementation of Adamic-Adar algorithm
def get_recommendation_AA(username: int, g: "igraph.Graph") -> pd.DataFrame:
    """Rank the users at distance exactly 2 from ``username`` by Adamic-Adar score.

    AA(x, y) is the sum of ``1 / log10(degree(n))`` over every neighbor ``n``
    shared by ``x`` and ``y``.

    Args:
        username: vertex id of the user to recommend for (an integer id,
            despite the parameter name).
        g: graph providing ``neighborhood(vertex, order=...)`` and
            ``neighbors(vertex)``.

    Returns:
        A one-column DataFrame whose column label is ``username``, indexed by
        candidate vertex id, holding float scores sorted in descending order
        and truncated to the 10 best candidates. It is empty when there is no
        vertex at distance 2.
    """
    #We only need to consider those vertices at distance 2:
    #everything within 2 hops minus everything within 1 hop (which includes the user)
    within_one = set(g.neighborhood(username, order=1))
    within_two = set(g.neighborhood(username, order=2))
    #Sorted so that the result does not depend on set iteration order
    candidates = sorted(within_two - within_one)

    #The neighbors of the main user do not change between candidates
    x_neighbors = set(g.neighbors(username))

    scores = {}
    for user_y in candidates:
        #Get only those nodes that are neighbors of both nodes
        same_neighbors = x_neighbors & set(g.neighbors(user_y))
        aa_val = 0.0
        for n in same_neighbors:
            num_neighbors = len(g.neighbors(n))
            #log10(1) == 0 would divide by zero; such a node contributes nothing
            if num_neighbors > 1:
                aa_val += 1 / math.log10(num_neighbors)
        scores[user_y] = aa_val

    #Build the result in one step instead of assigning through df[col][row],
    #which pandas does not guarantee to write back into the frame
    ranking = pd.Series(scores, name=username, dtype=float)
    ranking = ranking.sort_values(ascending=False, kind="mergesort")
    return ranking.to_frame(name=username).head(10)

#Transform user ids to user names
def AA_num_to_name(dataset: pd.DataFrame, g: "igraph.Graph") -> pd.DataFrame:
    """Relabel an Adamic-Adar result from vertex ids to vertex names.

    Args:
        dataset: DataFrame whose first column label is the main user's vertex
            id and whose index holds the recommended vertex ids.
        g: graph whose ``vs[id]['name']`` gives the name of a vertex.

    Returns:
        A new DataFrame with the same values, a single column labelled with
        the main user's name, and the index replaced by the recommended
        users' names (row order preserved).
    """
    main_user_name = g.vs[dataset.columns[0]]['name']
    #Distinct loop name so the notebook-level `user_id` is not shadowed
    recommended_names = [g.vs[vertex_id]['name'] for vertex_id in dataset.index]
    return pd.DataFrame(dataset.values, columns=[main_user_name], index=recommended_names)

In [None]:
#Get the recommended user ids for the requested user
AA_recommendation_ids=get_recommendation_AA(user_id, g)
#Transform the ids to usernames
AA_recommendation_names=AA_num_to_name(AA_recommendation_ids, g)
print(f"Recomendations for user {user_name} with id {user_id} using Adamic-Adar:")
#Bare expression: shown as the output of the notebook cell
AA_recommendation_names

----------------------------------------------------------------------------------------------------------------------------

## PageRank

In [None]:
#Transform user ids to user names
def pagerank_clearer(pagerank_values: list, g: "igraph.Graph") -> list:
    """Convert ``[vertex id, score]`` entries into ``(vertex name, score)`` tuples.

    Args:
        pagerank_values: sequence of entries whose first item is a vertex id
            and whose second item is a score convertible with ``float()``.
        g: graph whose ``vs[id]['name']`` gives the name of a vertex.

    Returns:
        A list of ``(name, float(score))`` tuples in the input order.
    """
    #A comprehension avoids the index loop and the local name `pagerank`,
    #which hid the imported fast_pagerank.pagerank inside this function
    return [(g.vs[entry[0]]['name'], float(entry[1])) for entry in pagerank_values]

#From the score obtained from PageRank algorithm, get top user ids with higher score and that are 2-distance neighbors
def top_10_ids(pagerank_result: list, user_id: int, graph=None) -> list:
    """Return the 10 best-scored vertices at distance exactly 2 from ``user_id``.

    Args:
        pagerank_result: scores indexed by vertex id.
        user_id: vertex id of the user to recommend for.
        graph: graph providing ``neighborhood(vertex, order=...)``. When
            omitted, the notebook-level graph ``g`` is used, which is what the
            function always did before this parameter existed.

    Returns:
        A list of at most 10 ``[vertex id, score]`` pairs sorted by descending
        score; ties keep ascending vertex-id order.
    """
    if graph is None:
        graph = g

    #We only need to consider those vertices at distance 2:
    #everything within 2 hops minus everything within 1 hop (which includes the user)
    within_one = set(graph.neighborhood(user_id, order=1))
    within_two = set(graph.neighborhood(user_id, order=2))
    candidates = within_two - within_one

    #Visit only the candidates instead of testing every score against a list;
    #ids without a score are skipped, as the index loop over the scores did
    total = len(pagerank_result)
    pagerank_with_ids = [
        [vertex_id, pagerank_result[vertex_id]]
        for vertex_id in sorted(candidates)
        if 0 <= vertex_id < total
    ]
    pagerank_with_ids.sort(key=lambda pair: pair[1], reverse=True)
    return pagerank_with_ids[:10]

In [None]:
#Get PageRank scores for each user given the initial node/user
#NOTE(review): g is created with igraph.Graph() and no `directed` argument, so directed=True presumably has no effect here - confirm
PageRank_recommendations=g.personalized_pagerank(directed=True, reset_vertices=user_id)
#Get top users ids with higher score and that are 2-distance neighbors
PageRank_recommendation_ids=top_10_ids(PageRank_recommendations, user_id)
#Transform the ids to usernames
PageRank_recommendation_names=pagerank_clearer(PageRank_recommendation_ids, g)
print(f"Recomendations for user {user_name} with id {user_id} using Personalized PageRank:")
#Bare expression: shown as the output of the notebook cell
PageRank_recommendation_names

----------------------------------------------------------------------------------------------------------------------------

## Node2vec

In [None]:
#In order to apply Node2vec, we need to convert the igraph graph to a networkx graph
#Only the edge list is transferred, so the networkx nodes are the igraph vertex ids
A = g.get_edgelist()
graph = nx.Graph(A) # nx.Graph is undirected; use nx.DiGraph(A) in case your graph is directed

In [None]:
#Initialize Node2vec model
#The walker gets its own name: assigning it to `node2vec` would replace the
#imported module and make this cell fail when it is run a second time
node2vec_walker = node2vec.Node2Vec(graph, dimensions=64, walk_length=2, num_walks=200, workers=4)
#Train Node2vec model
#NOTE: this rebinds the global `model`, which until now held the ALS model
model = node2vec_walker.fit(window=10, min_count=1, batch_words=4)

In [None]:
#Get a list of the recommended users with their respective scores of the initial/main node
#The lookup key is the vertex id as a string (presumably because the trained vocabulary stores node ids as strings - confirm)
user_id_str=str(user_id)
node2vec_recommendation_ids=model.wv.most_similar(user_id_str)

In [None]:
#Transform user ids to user names (number_to_username converts each id with int() before the lookup)
node2vec_recommendation_names=number_to_username(node2vec_recommendation_ids, g)

In [None]:
#Print the header and show the (user name, score) pairs as the cell output
print(f"Recomendations for user {user_name} with id {user_id} using Node2vec:")
node2vec_recommendation_names

----------------------------------------------------------------------------------------------------------------------------

## DOC2VEC

We wanted to exploit the text feature using Doc2Vec to recommend tweets based on their content. This was an opportunity for us to compare Word2Vec and Doc2Vec and see the pros and cons of both. So in this case, the nodes are not users but tweets, and based on the content of a tweet, we propose 10 tweets (Doc2Vec uses cosine similarity to do the recommendation, as does Word2Vec).

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

# Initialization of Doc2Vec
def initialize_doc2vec(tweets: pd.DataFrame):
    """Train a Doc2Vec model on the ``original_text`` column of ``tweets``.

    Every tweet is tokenized with ``simple_preprocess`` and tagged with its
    position (0, 1, 2, ...) in the column, so a tag returned by the model is a
    positional row number.

    Args:
        tweets: DataFrame with an ``original_text`` column of strings.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    # Preparing the data to be put in Doc2Vec
    tagged_tweets = [
        TaggedDocument(simple_preprocess(line), [position])
        for position, line in enumerate(tweets["original_text"])
    ]
    # negative=0 switches negative sampling off. Hierarchical softmax (hs=1)
    # must then be enabled, otherwise neither objective is active and the
    # vectors are never updated from their initial values.
    d2v_model = Doc2Vec(
        documents=tagged_tweets,
        vector_size=100,
        window=2,
        min_count=1,
        negative=0,
        hs=1,
        workers=4,
    )
    return d2v_model

In [None]:
# Train the Doc2Vec model on all collected tweets; tweet2vec_top10 reads this global
d2v_model = initialize_doc2vec(search_engine.tweets)

In [None]:
# This function returns the top 10 similar tweets related to a given tweet
def tweet2vec_top10(input_: str, tweets: pd.DataFrame):
    """Return the input tweet followed by the tweets most similar to it.

    Uses the module-level ``d2v_model``.

    Args:
        input_: text of the tweet to find similar tweets for.
        tweets: DataFrame whose ``original_text`` column is the one the model
            was trained on, in the same row order.

    Returns:
        A DataFrame with columns ``Tweet`` and ``Similarity``. The first row
        is ``input_`` with similarity 1.0; the following rows are the
        ``most_similar`` results in the order the model returns them, with
        their similarities as floats.
    """
    # Tokenize like the training data (simple_preprocess), so the inferred
    # vector is built from the same vocabulary as the trained ones
    embedded_input = d2v_model.infer_vector(simple_preprocess(input_))
    # gensim >= 4 exposes the document vectors as `dv`; `docvecs` is the older name
    doc_vectors = d2v_model.dv if hasattr(d2v_model, "dv") else d2v_model.docvecs
    #Perform cosine similarity and return the top 10
    recommendations = doc_vectors.most_similar([embedded_input])

    texts = tweets["original_text"]
    # The first row is the input itself
    recommended_tweets = [input_]
    cosine_similarities = [1.0]
    for position, cos_similarity in recommendations:
        # Tags are positional row numbers, so look up by position, not by index label
        recommended_tweets.append(texts.iloc[position])
        cosine_similarities.append(float(cos_similarity))
    return pd.DataFrame({"Tweet": recommended_tweets, "Similarity": cosine_similarities})

In [None]:
# Show the full tweet text: None disables column truncation
# (a negative width is deprecated since pandas 1.0 in favour of None)
pd.set_option('display.max_colwidth', None)
# Recommend tweets similar to the tweet labelled 0 and display them as the cell output
test = tweet2vec_top10(search_engine.tweets["original_text"][0], search_engine.tweets)
test