# TFID and PageRank Application on MIND

In [1]:
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, lil_matrix, csr_matrix
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

In [3]:
news = pd.read_csv('MIND_Dataset/MINDprocessed/news_encoded.csv')
news.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities,TitleClust
0,50588,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],3
1,10711,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",7
2,57598,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",15
3,48364,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",14
4,31473,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",4


In [4]:
def cos_pdist(X, Y):
    """
    help from https://stackoverflow.com/a/43493487
    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: pairwise cosine distance between X and Y, shape (m1, m2)
    """
    sumyy = np.asarray((Y.power(2)).sum(1)).flatten()
    sumxx = np.asarray((X.power(2)).sum(1))
    sumxy = X.dot(Y.T).toarray()
    return (sumxy/np.sqrt(sumxx))/np.sqrt(sumyy)

def compute_sim_matrix(combined_vectors,pb=False):
    M = combined_vectors.shape[0]

    sim_matrix = lil_matrix((M, M))
    if pb:
        for i in tqdm(range(M), desc="Processing rows"):
            s = cos_pdist(combined_vectors[i], combined_vectors[i:])
            vect = np.nan_to_num(s[0].tolist(), nan=0)
            row_i = sim_matrix[i, :]
            filled_vect = np.concatenate((np.zeros(i, dtype=row_i.dtype), vect))
            sim_matrix[i, :] = filled_vect
    else:
        for i in range(M):
            s = cos_pdist(combined_vectors[i], combined_vectors[i:])
            vect = np.nan_to_num(s[0].tolist(), nan=0)
            row_i = sim_matrix[i, :]
            filled_vect = np.concatenate((np.zeros(i, dtype=row_i.dtype), vect))
            sim_matrix[i, :] = filled_vect
    return sim_matrix

class TfidfRecommender:
    def __init__(self):
        self.vectorizer1 = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
        self.vectorizer2 = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
        self.cosine_sim_matrix = None
        self.data = None
        self.title_vectors = None
        self.attr_vectors = None
        self.fitted = False

    def fit(self, data):
        self.data = data.copy()
        #name = data.columns[0]
        attribute1 = data.columns[1]
        attribute2 = data.columns[2]
        
        self.title_vectors = self.vectorizer1.fit_transform(data[attribute1])
        self.attr_vectors = self.vectorizer2.fit_transform(data[attribute2])
        # Combine vectors
        combined_vectors = hstack((self.title_vectors, self.attr_vectors))
        
        # Compute cosine similarity matrix
        self.cosine_sim_matrix = compute_sim_matrix(combined_vectors)

        self.fitted = True

    def recommend(self, attr1=[], attr2=[], top_n=5):
        if not self.fitted:
            raise("Error, recommender not fitted")
        else:
            name = self.data.columns[0]
            attribute1 = self.data.columns[1]
            attribute2 = self.data.columns[2]
            
            # Filter movies based on specified attr1
            if len(attr1)>0:
                attr1_items = self.data[self.data[attribute1].apply(lambda x: any(item in attr1 for item in x) if isinstance(x, list) else x in attr1)]
            else:
                attr1_items = self.data

            # Filter movies based on specified attr2
            if len(attr2)>0:
                attr2_items = self.data[self.data[attribute2].apply(lambda x: any(item in attr2 for item in x) if isinstance(x, list) else x in attr2)]
            else:
                attr2_items = self.data

            # Get intersection of movies based on genres and actors
            selected_movies = attr1_items[attr1_items.index.isin(attr2_items.index)]
            #size = sys.getsizeof(selected_movies)
            
            if selected_movies.empty:
                return "No items found for the given attributes."

            # Get the index of the selected movies in the movie data
            selected_indices = selected_movies.index

            # Compute the average cosine similarity for each selected movie
            avg_similarity = self.cosine_sim_matrix[selected_indices].mean(axis=0)

            # Sort movies based on average similarity in descending order
            sorted_indices = np.argsort(avg_similarity.tolist()[0])[::-1].tolist()

            # Get the top N recommended movie indices
            top_indices = sorted_indices[:top_n]

            # Get the top N recommended movie titles
            recommended_movies = self.data.iloc[top_indices,:][name]

            return recommended_movies


In [5]:
# Sample movie data
movie_data = pd.DataFrame({
    'Title': ['Movie A', 'Movie B', 'Movie C', 'Movie D'],
    'Genres': [['Action', 'Thriller'], ['Action','Drama','Romance'], ['Comedy'], ['Action', 'Comedy']],
    'Actors': [['Actor1', 'Actor2'], ['Actor3', 'Actor4'], ['Actor2'], ['Actor1', 'Actor4']]
})

# Create an instance of the TfidfRecommender class
recommender = TfidfRecommender()

# Fit the recommender with the movie data
recommender.fit(movie_data)

In [6]:
# Make recommendations for a specific genre
recommended_movies = recommender.recommend(attr2=['Actor4'], top_n=3)
print(recommended_movies)

print("##########################")
# Make recommendations for a specific genre
recommended_movies = recommender.recommend(attr1=['Action','Drama'], top_n=3)
print(recommended_movies)

3    Movie D
1    Movie B
2    Movie C
Name: Title, dtype: object
##########################
3    Movie D
1    Movie B
0    Movie A
Name: Title, dtype: object


In [7]:
news_reduced = news.head(40000)
news_reduced.Category.value_counts()

Category
news             12324
sports           10715
finance           2548
foodanddrink      2137
lifestyle         2068
travel            1794
health            1623
video             1511
weather           1496
autos             1369
tv                 766
music              608
movies             524
entertainment      503
kids                12
middleeast           2
Name: count, dtype: int64

In [8]:
# Create an instance of the TfidfRecommender class
recommender = TfidfRecommender()

# Fit the recommender with the movie data
recommender.fit(news_reduced[['Title','Category','SubCategory']])

# Make recommendations for a specific genre
recommended_movies = recommender.recommend(attr1=['sports'], top_n=10)
print(recommended_movies)

print("##########################")
# Make recommendations for a specific genre
recommended_movies = recommender.recommend(attr1=['kids'], top_n=10)
print(recommended_movies)

In [3]:
# Create an empty graph
G = nx.Graph()

# Add nodes and edges to the graph
for _, row in news.iterrows():
    source = row['NewsID']
    target1 = row['Category']
    target2 = row['SubCategory']
    relation1 = 'Has category'
    relation2 = 'Has subcategory'
    G.add_node(source)
    G.add_node(target1)
    G.add_node(target2)
    G.add_edge(source, target1, relation=relation1)
    G.add_edge(source, target2, relation=relation2)


In [4]:
# Visualize the graph
#pos = nx.spring_layout(G)
#nx.draw_networkx_nodes(G, pos, node_color='lightblue')
#nx.draw_networkx_edges(G, pos, edge_color='gray')
#nx.draw_networkx_labels(G, pos)
#nx.draw_networkx_edge_labels(G, pos, edge_labels=nx.get_edge_attributes(graph, 'relation'))
#plt.axis('off')
#plt.show()

In [5]:
# Calculate PageRank scores with edge weights
pagerank_scores = nx.pagerank(G, weight='weight')

# Sort the nodes by PageRank score in descending order
sorted_nodes = sorted(pagerank_scores, key=pagerank_scores.get, reverse=True)

for id in news.NewsID:
    sorted_nodes.remove(id)

In [6]:
# Print the ranking
for node in sorted_nodes[:10]:
    print(node, pagerank_scores[node])

news 0.07889004511242838
sports 0.07025342294647274
newsus 0.03204629339752906
football_nfl 0.027057158176892776
finance 0.013806644622683455
newspolitics 0.01261043291687287
lifestyle 0.011490493211613526
foodanddrink 0.011365739885240325
travel 0.010952166838212757
video 0.01024599373031153


# TF-IDF

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Sample movie data
movie_data = pd.DataFrame({
    'Title': ['Movie A', 'Movie B', 'Movie C', 'Movie D'],
    'Genres': [['Action', 'Thriller'], ['Drama', 'Romance'], ['Comedy'], ['Action', 'Comedy']]
})

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(lowercase=False)

# Fit and transform the movie titles
title_vectors = vectorizer.fit_transform(movie_data['Title'])

# Compute cosine similarity matrix
cosine_sim_matrix = cosine_similarity(title_vectors)

def recommend_movies_by_genre(genre, movie_data, cosine_sim_matrix, top_n=5):
    # Get indices of movies with the specified genre
    genre_movies = movie_data[movie_data['Genres'].apply(lambda x: genre in x)]

    if genre_movies.empty:
        return "No movies found for the given genre."

    # Get the index of the genre movie in the movie data
    genre_movie_indices = genre_movies.index

    # Compute the average cosine similarity for each genre movie
    avg_similarity = cosine_sim_matrix[genre_movie_indices].mean(axis=0)

    # Sort movies based on average similarity in descending order
    sorted_indices = avg_similarity.argsort()[::-1]

    # Get the top N recommended movie indices
    top_indices = sorted_indices[:top_n]

    # Get the top N recommended movie titles
    recommended_movies = movie_data.iloc[top_indices]['Title']

    return recommended_movies

# Example usage
genre = 'Action'
recommendations = recommend_movies_by_genre(genre, movie_data, cosine_sim_matrix)
print(f"Recommended movies in the {genre} genre:")
print(recommendations)


Recommended movies in the Action genre:
3    Movie D
2    Movie C
1    Movie B
0    Movie A
Name: Title, dtype: object


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from scipy.sparse import hstack

# Sample movie data
movie_data = pd.DataFrame({
    'Title': ['Movie A', 'Movie B', 'Movie C', 'Movie D'],
    'Genres': [['Action', 'Thriller'], ['Drama', 'Romance'], ['Comedy'], ['Action', 'Comedy']],
    'Actors': [['Actor1', 'Actor2'], ['Actor3', 'Actor4'], ['Actor2'], ['Actor1', 'Actor4']]
})

# Create TF-IDF vectorizer for movie titles
title_vectorizer = TfidfVectorizer(lowercase=False)
title_vectors = title_vectorizer.fit_transform(movie_data['Title'])

# Create TF-IDF vectorizer for movie actors
actor_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
actor_vectors = actor_vectorizer.fit_transform(movie_data['Actors'])

# Combine title and actor vectors
combined_vectors = hstack((title_vectors, actor_vectors))



In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from scipy.sparse import hstack

# Sample movie data
movie_data = pd.DataFrame({
    'Title': ['Movie A', 'Movie B', 'Movie C', 'Movie D'],
    'Genres': [['Action', 'Thriller'], ['Drama', 'Romance'], ['Comedy'], ['Action', 'Comedy']],
    'Actors': [['Actor1', 'Actor2'], ['Actor3', 'Actor4'], ['Actor2'], ['Actor1', 'Actor4']]
})

# Create TF-IDF vectorizer for movie titles
title_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
title_vectors = title_vectorizer.fit_transform(movie_data['Title'])

# Create TF-IDF vectorizer for movie actors
actor_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
actor_vectors = actor_vectorizer.fit_transform(movie_data['Actors'])

# Combine title and actor vectors
combined_vectors = hstack((title_vectors, actor_vectors))

# Compute cosine similarity matrix
cosine_sim_matrix = cosine_similarity(combined_vectors)

def recommend_movies(genres, actors, movie_data, cosine_sim_matrix, top_n=5):
    # Filter movies based on specified genres
    if genres:
        genre_movies = movie_data[movie_data['Genres'].apply(lambda x: any(genre in x for genre in genres))]
    else:
        genre_movies = movie_data

    # Filter movies based on specified actors
    if actors:
        actor_movies = movie_data[movie_data['Actors'].apply(lambda x: any(actor in x for actor in actors))]
    else:
        actor_movies = movie_data

    # Get intersection of movies based on genres and actors
    selected_movies = genre_movies[genre_movies.index.isin(actor_movies.index)]

    if selected_movies.empty:
        return "No movies found for the given genres and actors."

    # Get the index of the selected movies in the movie data
    selected_indices = selected_movies.index

    # Compute the average cosine similarity for each selected movie
    avg_similarity = cosine_sim_matrix[selected_indices].mean(axis=0)

    # Sort movies based on average similarity in descending order
    sorted_indices = avg_similarity.argsort()[::-1]

    # Get the top N recommended movie indices
    top_indices = sorted_indices[:top_n]

    # Get the top N recommended movie titles
    recommended_movies = movie_data.iloc[top_indices]['Title']

    return recommended_movies


In [4]:
# Example usage
genres = ['Action', 'Thriller']
actors = ['Actor1', 'Actor2']
recommendations = recommend_movies(genres, actors, movie_data, cosine_sim_matrix)
print("Recommended movies based on genres and actors:")
print(recommendations)

Recommended movies based on genres and actors:
3    Movie D
0    Movie A
2    Movie C
1    Movie B
Name: Title, dtype: object


In [5]:
# Example usage
genres = ['Drama']
recommendations = recommend_movies(genres, actors=None, movie_data=movie_data, cosine_sim_matrix=cosine_sim_matrix)
print("Recommended movies based on genres:")
print(recommendations)

Recommended movies based on genres:
1    Movie B
3    Movie D
2    Movie C
0    Movie A
Name: Title, dtype: object


In [52]:
movie_data

Unnamed: 0,Title,Genres,Actors
0,Movie A,"[Action, Thriller]","[Actor1, Actor2]"
1,Movie B,"[Drama, Romance]","[Actor3, Actor4]"
2,Movie C,[Comedy],[Actor2]
3,Movie D,"[Action, Comedy]","[Actor1, Actor4]"


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

class TfidfRecommender:
    def __init__(self):
        self.vectorizer1 = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
        self.vectorizer2 = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
        self.cosine_sim_matrix = None
        self.data = None
        self.title_vectors = None
        self.attr_vectors = None
        self.fitted = False

    def fit(self, data):
        self.data = data.copy()
        name = data.columns[0]
        #attribute1 = data.columns[1]
        attribute2 = data.columns[2]
        
        self.title_vectors = self.vectorizer1.fit_transform(data[name])
        self.attr_vectors = self.vectorizer2.fit_transform(data[attribute2])
        # Combine vectors
        combined_vectors = hstack((self.title_vectors, self.attr_vectors))
        # Compute cosine similarity matrix
        self.cosine_sim_matrix = cosine_similarity(combined_vectors)
        
        self.fitted = True

    def recommend(self, attr1=None, attr2=None, top_n=5):
        if not self.fitted:
            raise("Error, recommender not fitted")
        else:
            name = self.data.columns[0]
            attribute1 = self.data.columns[1]
            attribute2 = self.data.columns[2]
            
            # Filter movies based on specified attr1
            if attr1:
                attr1_items = movie_data[movie_data[attribute1].apply(lambda x: attr1 in x)]
            else:
                attr1_items = self.data

            # Filter movies based on specified attr2
            if attr2:
                attr2_items = movie_data[movie_data[attribute2].apply(lambda x: attr2 in x)]
            else:
                attr2_items = self.data

            # Get intersection of movies based on genres and actors
            selected_movies = attr1_items[attr1_items.index.isin(attr2_items.index)]

            if selected_movies.empty:
                return "No items found for the given attributes."

            # Get the index of the selected movies in the movie data
            selected_indices = selected_movies.index

            # Compute the average cosine similarity for each selected movie
            avg_similarity = self.cosine_sim_matrix[selected_indices].mean(axis=0)

            # Sort movies based on average similarity in descending order
            sorted_indices = avg_similarity.argsort()[::-1]

            # Get the top N recommended movie indices
            top_indices = sorted_indices[:top_n]

            # Get the top N recommended movie titles
            recommended_movies = self.data.iloc[top_indices][name]

            return recommended_movies


In [25]:
# Sample movie data
movie_data = pd.DataFrame({
    'Title': ['Movie A', 'Movie B', 'Movie C', 'Movie D'],
    'Genres': [['Action', 'Thriller'], ['Drama', 'Romance'], ['Comedy'], ['Action', 'Comedy']],
    'Actors': [['Actor1', 'Actor2'], ['Actor3', 'Actor4'], ['Actor2'], ['Actor1', 'Actor4']]
})

# Create an instance of the TfidfRecommender class
recommender = TfidfRecommender()

# Fit the recommender with the movie data
recommender.fit(movie_data)

# Make recommendations for a specific genre
recommended_movies = recommender.recommend(attr2='Actor4', top_n=3)
print(recommended_movies)

print("##########################")
# Make recommendations for a specific genre
recommended_movies = recommender.recommend(attr1='Drama', top_n=3)
print(recommended_movies)


3    Movie D
1    Movie B
0    Movie A
Name: Title, dtype: object
##########################
1    Movie B
3    Movie D
2    Movie C
Name: Title, dtype: object




In [25]:
import pandas as pd

# Sample movie data
movie_data = pd.DataFrame({
    'Title': ['Movie A', 'Movie B', 'Movie C', 'Movie D'],
    'Genres': [['Action', 'Thriller'], ['Drama', 'Romance'], ['Comedy'], ['Action', 'Comedy']],
    'Actors': [['Actor1', 'Actor2'], ['Actor3', 'Actor4'], ['Actor2'], ['Actor1', 'Actor4']]
})

attribute1 = movie_data.columns[1]
attr1 = 'Drama'
attr1_items = movie_data[movie_data[attribute1].apply(lambda x: attr1 in x)]

print(attr1_items)


     Title            Genres            Actors
1  Movie B  [Drama, Romance]  [Actor3, Actor4]


In [22]:
attribute1 = 'genres'
movie_data[attribute1].apply(lambda x: any(a in x for a in attr1))

0    False
1    False
2    False
3    False
Name: Genres, dtype: bool

# PageRank

In [33]:
import networkx as nx

# Create a directed graph
graph = nx.DiGraph()

# Add nodes to the graph
graph.add_node('Movie A')
graph.add_node('Genre 1')
graph.add_node('Actor 1')

# Add labeled edges to the graph
graph.add_edge('Movie A', 'Genre 1', label='has_genre')
graph.add_edge('Movie A', 'Actor 1', label='has_actor')

# Calculate PageRank scores with edge weights
pagerank_scores = nx.pagerank(graph, weight='weight')

# Sort the nodes by PageRank score in descending order
sorted_nodes = sorted(pagerank_scores, key=pagerank_scores.get, reverse=True)

# Print the ranking
for node in sorted_nodes:
    print(node, pagerank_scores[node])


Genre 1 0.37012974744707666
Actor 1 0.37012974744707666
Movie A 0.25974050510584634


In [41]:
import networkx as nx

# Create a directed graph
graph = nx.DiGraph()

# Add nodes to the graph
graph.add_node('Movie A')
graph.add_node('Movie B')
graph.add_node('Movie C')
graph.add_node('Movie D')
graph.add_node('Movie E')
graph.add_node('Genre 1')
graph.add_node('Genre 2')
graph.add_node('Actor 1')
graph.add_node('Actor 2')

# Add labeled edges to the graph
graph.add_edge('Movie A', 'Genre 1', label='has_genre')
graph.add_edge('Movie A', 'Actor 1', label='has_actor')
graph.add_edge('Movie B', 'Genre 1', label='has_genre')
graph.add_edge('Movie B', 'Actor 2', label='has_actor')
graph.add_edge('Movie C', 'Genre 1', label='has_genre')
graph.add_edge('Movie C', 'Actor 1', label='has_actor')
graph.add_edge('Movie D', 'Genre 1', label='has_genre')
graph.add_edge('Movie D', 'Actor 2', label='has_actor')
graph.add_edge('Movie E', 'Genre 1', label='has_genre')
graph.add_edge('Movie E', 'Actor 1', label='has_actor')

# Calculate PageRank scores with edge weights
pagerank_scores = nx.pagerank(graph, weight='weight')

# Sort the nodes by PageRank score in descending order
sorted_nodes = sorted(pagerank_scores, key=pagerank_scores.get, reverse=True)

# Print the ranking
for node in sorted_nodes:
    print(node, pagerank_scores[node])


Genre 1 0.23584829392560236
Actor 1 0.17169774276386401
Actor 2 0.1396224671829948
Movie A 0.07547191602125641
Movie B 0.07547191602125641
Movie C 0.07547191602125641
Movie D 0.07547191602125641
Movie E 0.07547191602125641
Genre 2 0.07547191602125641


In [42]:
import networkx as nx

# Create a directed graph
graph = nx.DiGraph()

# Add nodes to the graph
graph.add_node('Movie A')
graph.add_node('Movie B')
graph.add_node('Movie C')
graph.add_node('Movie D')
graph.add_node('Movie E')
graph.add_node('Genre 1')
graph.add_node('Genre 2')
graph.add_node('Actor 1')
graph.add_node('Actor 2')

# Add labeled edges to the graph
graph.add_edge('Movie A', 'Genre 2', label='has_genre')
graph.add_edge('Movie A', 'Actor 1', label='has_actor')
graph.add_edge('Movie B', 'Genre 1', label='has_genre')
graph.add_edge('Movie B', 'Actor 2', label='has_actor')
graph.add_edge('Movie C', 'Genre 1', label='has_genre')
graph.add_edge('Movie C', 'Actor 1', label='has_actor')
graph.add_edge('Movie D', 'Genre 1', label='has_genre')
graph.add_edge('Movie D', 'Actor 2', label='has_actor')
graph.add_edge('Movie E', 'Genre 1', label='has_genre')
graph.add_edge('Movie E', 'Actor 1', label='has_actor')

# Calculate PageRank scores with edge weights
pagerank_scores = nx.pagerank(graph, weight='weight')

# Sort the nodes by PageRank score in descending order
sorted_nodes = sorted(pagerank_scores, key=pagerank_scores.get, reverse=True)

# Print the ranking
for node in sorted_nodes:
    print(node, pagerank_scores[node])


Genre 1 0.20377301834473324
Actor 1 0.17169774276386401
Actor 2 0.1396224671829948
Genre 2 0.10754719160212561
Movie A 0.07547191602125641
Movie B 0.07547191602125641
Movie C 0.07547191602125641
Movie D 0.07547191602125641
Movie E 0.07547191602125641


# Cose che potranno diventare utili

In [None]:
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix, lil_matrix
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import dask.array as da
import dask.dataframe as dd
from scipy.sparse.linalg import norm
from tqdm import tqdm
import multiprocessing as mp

# News data import
news = pd.read_csv('Mind_Dataset/MINDprocessed/news_encoded.csv')
news = news.head(10000)

vectorizer1 = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
vectorizer2 = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)

attribute1 = news.columns[1]
attribute2 = news.columns[2]

title_vectors = vectorizer1.fit_transform(news[attribute1])
attr_vectors = vectorizer2.fit_transform(news[attribute2])

# Combine vectors
combined_vectors = hstack((title_vectors, attr_vectors))

M = combined_vectors.shape[0]

#pool = mp.Pool(mp.cpu_count())
pool = mp.Pool(4)

sim_matrix = lil_matrix((M, M))

# Step 1: Redefine, to accept `i`, the iteration number
def cos_pdist(i, X, Y):
    """
    help from https://stackoverflow.com/a/43493487
    X: scipy.sparse CSR matrix, shape (m1, n)
    Y: scipy.sparse CSR matrix, shape (m2, n)
    returns: pairwise cosine distance between X and Y, shape (m1, m2)
    """
    sumyy = np.asarray((Y.power(2)).sum(1)).flatten()
    sumxx = np.asarray((X.power(2)).sum(1))
    sumxy = X.dot(Y.T).toarray()
    result = (sumxy/np.sqrt(sumxx))/np.sqrt(sumyy)
    return (i, result)


# Step 2: Define callback function to collect the output in `results`
def collect_result(i,result):
    global sim_matrix
    sim_matrix[i,:] = result

data = combined_vectors

with tqdm(total=data.shape[0]) as t:

    # Step 3: Use loop to parallelize
    for i, row in enumerate(data):
        #print("Prontoooooooooooooooooooooo??")
        pool.apply_async(cos_pdist, args=(i, data[i], data[i:]), callback=collect_result)
        t.update()
    # Step 4: Close Pool and let all the processes complete  
    print("Prontoooooooooooooooooooooo??")  
    pool.close()
    print("Prontoooooooooooooooooooooo??")
    pool.join()  # postpones the execution of next line of code until all processes in the queue are done.
    print("Prontoooooooooooooooooooooo??")