In [369]:
from search_engine.search_engine import SearchEngine
import networkx as nx
from networkx import Graph
from sklearn.model_selection import train_test_split
import implicit
import scipy.sparse as sparse
from scipy.sparse import csr_matrix
from fast_pagerank import pagerank
from fast_pagerank import pagerank_power
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import math
import igraph
import warnings
import csv
import argparse
import numpy as np
import networkx as nx
import node2vec
from gensim.models import Word2Vec
warnings.filterwarnings('ignore')

In [None]:
search_engine = SearchEngine()

Collection time: 0.0


In [None]:
#Create a graph where the vertices are formed by the users that retweet (users u) and the retweeted users (users v)
#And the edge is the connection of users u to users v
g=igraph.Graph()
for tweet in search_engine.tweets.iterrows():
    if str(tweet[1]['retweeted_status'])!='nan':
        u=tweet[1]['user']['screen_name']
        v=tweet[1]['retweeted_status']['user']['screen_name']
        g.add_vertices(u)
        g.add_vertices(v)
        g.add_edges([(u,v)])

## *Recomender Systems*

In [None]:
#SELCET USER ID:
user_id=0
user_name=g.vs[user_id]['name']

In [None]:
#These two definitions are taken from the lab sessions of the subject
def find_nodes_at_distance_2(graph):

    all_potential_recommendations = set()
    
    for n1 in graph.vs:
        
        # all the nodes at distance 1
        nodes_at_most_distant_1 = set(graph.neighborhood(n1, order=1))

        # all the nodes at distance 1 and distance 2

        nodes_at_most_distant_2 = set(graph.neighborhood(n1, order=2))

        # only the nodes at distance 2
        only_nodes_at_distance_2 = nodes_at_most_distant_2 - nodes_at_most_distant_1
        
        
        # check if empty set
        if len(nodes_at_most_distant_2) > 0:
            for n2 in nodes_at_most_distant_2:
                
                # since n1 is an igraph vertex object, we need to extract the id
                n1_index = n1.index
                if n1_index!=n2:
                    all_potential_recommendations.add((n1_index, n2))
            
    return all_potential_recommendations

def predict_ALS(testset, model):
    # initialize the empty list
    all_predictions = []

    # scroll the obs
    for n1,n2, w in testset:
        
        # take here the low-dimensional vectors returned by the matrix factorization
        print(n1)
        array_n1 = model.user_factors[n1,:]
        array_n2 = model.item_factors[n2,:]

        # multiplying these vectors we generate an approximation for the edge score
        one_p = np.dot(array_n1, array_n2)

        all_predictions.append(one_p)
        
    return all_predictions

In [None]:
#Code taken from the lab sessions of the subject
edges_max_distance_2=find_nodes_at_distance_2(g)

# fraction of edges to select as test-set
p = 0.2

# graphsize
N = len(g.es)

# idxs of all the edges
all_idxs = range(N)

# sample idxs of edges through the function "choice"
test_idxs = np.random.choice(a=all_idxs, size=int(p*N),replace=False)

## ALS (Alternate Least Squares)

In [None]:
#Definition that converts the users id to user names
def number_to_username(alg_list: list, g) -> list:
    alg_list_transformed=[]
    for i in range(len(alg_list)):
        name=g.vs[int(alg_list[i][0])]['name']
        new_tuple=(name, alg_list[i][1])
        alg_list_transformed.append(new_tuple)
    return alg_list_transformed

#It returns the user recommendations given a user_id
def recommend_users(user_id: int, G:csr_matrix, g: igraph.Graph, top: int = 10) -> list:
    ALS_recommended_users=model.recommend(user_id, G, top)
    #ALS_recommended_users_with_names=number_to_username(ALS_recommended_users, g)
    return ALS_recommended_users

In [None]:
#Code taken from the lab sessions of the subject
ground_truth = set()
trainset = set()
for idx, one_edge in enumerate(g.es):
    # take n1 and n2 idx from one_edge, that is an igraph edge *object*
    n1 = one_edge.source
    n2 = one_edge.target
    if idx in test_idxs:
        ground_truth.add((n1, n2, 1))
    else:
        trainset.add((n1, n2, 1))
        
for rec in edges_max_distance_2:
    
    # add to ground truth also the potential nodes
    n1 = rec[0]
    n2 = rec[1]
    
    ground_truth.add((n1,n2,0))

In [None]:
#Create an adjency matrix from the graph
G = g.get_adjacency().data
#Convert the adjency matrix to csr_matrix, which is the variable type needed for doing ALS 
G = csr_matrix(G)

In [None]:
#Initialize ALS model
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=5, calculate_training_loss=True)
#Train ALS model
model.fit(G)

In [None]:
#Get user ids recommendations for a user
ALS_recomdetaion_ids=recommend_users(user_id, G, g)
#Transform the user ids to their repective name
ALS_recomdetaion_names=number_to_username(ALS_recomdetaion_ids, g)
print(f"Recomendations for user {user_name} with id {user_id} using ALS (Alternate Least Squares):")
ALS_recomdetaion_names

 ----------------------------------------------------------------------------------------------------------------------------

## Adamic-Adar

In [55]:
#Implementation of Adamic-Adar algorithm
def get_recommendation_AA(username: int, g:igraph.Graph) -> pd.DataFrame:
    #We only need to consider those verices at distance 2
    #We take those users at distance 1
    neighbors_1=set(g.neighborhood(username, order=1))
    #We take those users at distance 1 & 2
    neighbors_2=set(g.neighborhood(username, order=2))
    #We take only those nodes that are at distance 2
    neighbors_only_order_2=list(neighbors_2 - neighbors_1)
    
    #Initialize dataframe with the user we want to recommend to as the column, and their 2-distance neighbors as indexes
    adamic_adar_data=pd.DataFrame(columns=[username], index=neighbors_only_order_2)
    #For every 2-distance users compute AA(x,y)
    for user_y in neighbors_only_order_2:
        if username!=user_y:
            #Get neighbors for the two nodes
            x_neighbors=set(g.neighbors(username))
            y_neighbors=set(g.neighborhood(user_y))
        #Get only those nodes that are neighbors of both nodes
        same_neighbors=x_neighbors&y_neighbors
        aa_val=0
        #Compute the Adamic-Avar value and add it to the dataframe
        for n in same_neighbors:
            num_neighbors=len(g.neighbors(n))
            aa_val+=(1/math.log(num_neighbors,10))
        adamic_adar_data[username][user_y]=aa_val
    #Sort values and return the top 10 recommendations
    top_n_recommendations_aa=adamic_adar_data[username].sort_values(ascending=False)
    aa_final_recommendation=pd.DataFrame(top_n_recommendations_aa)
    return aa_final_recommendation.head(10)

#Transform user ids to user names
def AA_num_to_name(dataset: pd.DataFrame, g: igraph.Graph)->pd.DataFrame:
    old_indices=list(dataset.index)
    new_indices=[]
    main_user_id=dataset.columns[0]
    main_user_name=g.vs[main_user_id]['name']
    for user_id in old_indices:
        name=g.vs[user_id]['name']
        new_indices.append(name)
    new_dataset=pd.DataFrame(dataset.values, columns=[main_user_name], index=new_indices)
    return new_dataset

In [56]:
#Get recommended users ids for the requested user 
AA_recommendation_ids=get_recommendation_AA(user_id, g)
#Transform the ids to usernames
AA_recommendation_names=AA_num_to_name(AA_recommendation_ids, g)
print(f"Recomendations for user {user_name} with id {user_id} using Adamic-Adar:")
AA_recommendation_names

Unnamed: 0,Liensevi
N30Foll0w,1.43068
FloydStad,1.43068
kasmouse,1.43068
dennis91842840,1.43068


----------------------------------------------------------------------------------------------------------------------------

## PageRank

In [57]:
#Transform user ids to user names
def pagerank_clearer(pagerank_values: list, g: igraph.Graph) -> list:
    pagerank=[]
    for i in range(len(pagerank_values)):
            user=pagerank_values[i][0]
            name=g.vs[user]['name']
            val=float(pagerank_values[i][1])
            pagerank.append((name, val))
    return pagerank

#From the score obtained from PageRank algorithm, get top user ids with higher score and that are 2-distance neighbors
def top_10_ids(pagerank_result: list, user_id: int)->list:
    pagerank_with_ids=[]
    
    #We only need to consider those verices at distance 2
    #We take those users at distance 1
    neighbors_1=set(g.neighborhood(user_id, order=1))
    #We take those users at distance 1 & 2
    neighbors_2=set(g.neighborhood(user_id, order=2))
    #We take only those nodes that are at distance 2
    neighbors_only_order_2=list(neighbors_2 - neighbors_1)
    
    for i in range(len(pagerank_result)):
        if i!=user_id and i in neighbors_only_order_2:
            pagerank_with_ids.append([i, pagerank_result[i]])
    pagerank_with_ids.sort(key = lambda x: x[1], reverse=True)
    pagerank_top_10_ids=pagerank_with_ids[0:10]
    return pagerank_top_10_ids

In [58]:
#Get PageRank scores for each user given the initial node/user
PageRank_recommendations=g.personalized_pagerank(directed=True, reset_vertices=user_id)
#Get top users ids with higher score and that are 2-distance neighbor 
PageRank_recommendation_ids=top_10_ids(PageRank_recommendations, user_id)
#Transform the ids to usernames
PageRank_recommendation_names=pagerank_clearer(PageRank_recommendation_ids, g)
print(f"Recomendations for user {user_name} with id {user_id} using Personalized PageRank:")
PageRank_recommendation_names

[('kasmouse', 0.0807136386394075),
 ('dennis91842840', 0.06518142272913913),
 ('N30Foll0w', 0.06518142272913913),
 ('FloydStad', 0.06518142272913913)]

----------------------------------------------------------------------------------------------------------------------------

## Node2vec

In [78]:
#In order to apply Node2vec, we need to convert the igraph to a networkx graph
A = g.get_edgelist()
graph = nx.Graph(A) # In case your graph is directed

In [80]:
#Initialize Node2vec model
node2vec = node2vec.Node2Vec(graph, dimensions=64, walk_length=2, num_walks=200, workers=4) 
#Train Node2vec model
model = node2vec.fit(window=10, min_count=1, batch_words=4)

HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=7377.0, style=Pr…




In [63]:
#Get a list of the recommended users with their respective scores of the initial/main node
user_id_str=str(user_id)
node2vec_recommendation_ids=model.wv.most_similar(user_id_str)

In [64]:
#Transform user ids to user names
node2vec_recommendation_names=number_to_username(node2vec_recommendation_ids, g)

In [65]:
print(f"Recomendations for user {user_name} with id {user_id} using Node2vec:")
node2vec_recommendation_names

[('FloydStad', 0.9992849826812744),
 ('N30Foll0w', 0.9992270469665527),
 ('dennis91842840', 0.9991434216499329),
 ('kasmouse', 0.9295623302459717),
 ('Alexbobby2262C', 0.6820331811904907),
 ('ZeitlowGal', 0.6716615557670593),
 ('BJohonson', 0.6669320464134216),
 ('raslady1', 0.6645399928092957),
 ('MAGA777999', 0.664025068283081),
 ('StevenS82419733', 0.6636382341384888)]

----------------------------------------------------------------------------------------------------------------------------

## DOC2VEC

We wanted to exploit the text feature using Doc2Vec to recommend tweets based on the content of it. This was an opportunity for us to compare Word2Vec and Doc2Vec and see the pros and cons of both of them. So in this case, the nodes are not users but tweets, and based in the content a a tweet, we propose 10 tweets (Doc2Vec uses cosine similarity to do the recommendation, as well as Word2Vec).

In [190]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

# Initialization of Doc2Vec
def initialize_doc2vec(tweets: pd.DataFrame):
    tweets_ = []
    i = 0
    # Preparing the data to be put in Doc2Vec
    for line in tweets["original_text"]:
        tokens = simple_preprocess(line)
        tweets_.append(TaggedDocument(tokens, [i]))
        i += 1
    #Train the data and return
    d2v_model = Doc2Vec(documents=tweets_, vector_size=100, window=2, min_count=1, negative = 0, workers=4)
    return d2v_model

In [191]:
d2v_model = initialize_doc2vec(search_engine.tweets)

In [201]:
# This function returns the top 10 similar tweets related to a given tweet
def tweet2vec_top10(input_: str, tweets: pd.DataFrame):
    #Infer the vector to be able to let Doc2Vec do Cosine similarity
    embedded_input= d2v_model.infer_vector(input_.split())
    #Perform cosine similarity and return the top 10
    recommendations = d2v_model.docvecs.most_similar([embedded_input])
    #Preparing a DataFrame to return the top 10, by putting in the first row the input
    recommended_tweets = [input_]
    cosine_similarities = [1]
    results = pd.DataFrame(columns = ["Tweet", "Similarity"])
    # Preparing the list of the top 10
    for position, cos_similarity in recommendations:
        recommended_tweets.append(tweets["original_text"][position])
        cosine_similarities.append(str(cos_similarity))
    # Putting the lists in the DataFrame and returning
    results["Tweet"] = recommended_tweets
    results["Similarity"] = cosine_similarities
    return results

In [202]:
pd.set_option('display.max_colwidth', -1)
test = tweet2vec_top10(search_engine.tweets["original_text"][0], search_engine.tweets)
test

Unnamed: 0,Tweet,Similarity
0,"RT @DGPurser: December 7, 1941, a day which will live in infamy. Japan woke a sleeping tiger. So has the left - cheaters, the deep state,…",1.0
1,RT @Boyd_2650: 🔴🔵This election is NOT OVER &amp; Joe Biden HAS NOT WON! And we patriots CANNOT STAND BY &amp; LET EVIL LIBS &amp; FOREIGN COUNTRIES TRY…,0.3682688176631927
2,RT @davidfrum: Newt Gingrich's former seat in US House of Representatives is now held by a black woman who advocates stricter gun laws. Gin…,0.3492737412452698
3,RT @AdamParkhomenko: It really is fucking incredible the way trump can get caught committing a crime on Saturday and the whole damn city of…,0.3406864404678345
4,This is what Trump meant when he said Build the Wall,0.3288267254829407
5,@CNN TRUMP WON TRUMP WON TRUMP WON TRUMP WON FOUR MORE YEARS FOUR MORE YEARS ❤️😎❤️😎❤️😎❤️😎❤️😎❤️😎❤️,0.3229094445705414
6,RT @SidneyPowell1: And BIG TECH &amp; #Facebook #Google #Twitter are all into suppressing our freedom of speech to challenge this outrageous #E…,0.3153973221778869
7,"RT @TomFitton: ELECTION CRISIS: Are @sendavidperdue, @KLoeffler and other Senators prepared to object to Electoral College electors from st…",0.3143534064292907
8,"RT @KelemenCari: Judge asks Powell if Trump wins Georgia, can he win the election. Powell answers ""Yes, he can."" Fraud cannot be allowed to…",0.3082136809825897
9,"RT @thehill: Over 1,500 attorneys sign letter slamming Trump legal team's efforts to overturn election results http…",0.30532968044281
