## PageRank

#### Implement PageRank Algorithms by Numpy

![title](./pagerank.PNG)

In [1]:
import numpy as np

edge_mat = np.array([[0, 0.5, 0, 0.5, 0],
                     [0, 0, 0.5, 0, 0.5],
                     [0, 0, 0, 1, 0], 
                     [0, 0, 0, 0, 1], 
                     [1./3, 1./3, 1./3, 0, 0]])
edge_mat

array([[0.        , 0.5       , 0.        , 0.5       , 0.        ],
       [0.        , 0.        , 0.5       , 0.        , 0.5       ],
       [0.        , 0.        , 0.        , 1.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.        ],
       [0.33333333, 0.33333333, 0.33333333, 0.        , 0.        ]])

In [6]:
state = np.array([0.2, 0.2, 0.2, 0.2, 0.2]).reshape(1, 5) # initial weight of each node

def next_state(state, edge_mat):
    return state.dot(edge_mat)

# convergence
for _ in range(50):
    print(state[0])
    state = next_state(state, edge_mat)

[0.2 0.2 0.2 0.2 0.2]
[0.06666667 0.16666667 0.16666667 0.3        0.3       ]
[0.1        0.13333333 0.18333333 0.2        0.38333333]
[0.12777778 0.17777778 0.19444444 0.23333333 0.26666667]
[0.08888889 0.15277778 0.17777778 0.25833333 0.32222222]
[0.10740741 0.15185185 0.1837963  0.22222222 0.33472222]
[0.11157407 0.16527778 0.1875     0.2375     0.29814815]
[0.09938272 0.15516975 0.1820216  0.24328704 0.32013889]
[0.10671296 0.15640432 0.18429784 0.23171296 0.32087191]
[0.1069573  0.16031379 0.18515947 0.23765432 0.30991512]
[0.10330504 0.15678369 0.18346193 0.23863812 0.31781121]
[0.10593707 0.15758959 0.18432892 0.23511445 0.31702996]
[0.10567665 0.15864519 0.18447145 0.23729745 0.31390925]
[0.10463642 0.15747474 0.18395901 0.23730978 0.31662005]
[0.10554002 0.15785822 0.18427739 0.23627722 0.31604715]
[0.10534905 0.15811906 0.18427816 0.2370474  0.31520633]
[0.10506878 0.1577433  0.18412831 0.23695269 0.31610693]
[0.10536898 0.15790336 0.18424063 0.2366627  0.31582434]
[0.105274

## Text Summary by TextRank

#### Cited from https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/

In [1]:
import numpy as np
import pandas as pd
import nltk
import re

df = pd.read_csv('tennis_articles_v4.csv')
df.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [2]:
from nltk.tokenize import sent_tokenize

sents = []

for text in df['article_text']:
    sents.append( sent_tokenize(text) )
    
sents[0]

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.",
 "I'm a pretty competitive girl.",
 "I say my hellos, but I'm not sending any players flowers as well.",
 "Uhm, I'm not really friendly or close to many players.",
 "I have not a lot of friends away from the courts.'",
 'When she said she is not really close to a lot of players, is that something strategic that she is doing?',
 "Is it different on the men's tour than the women's tour?",
 "'No, not at all.

In [3]:
# flatten list
print(len(sents))
sents = [sent for text in sents for sent in text]
print(len(sents))

8
119


In [11]:
clean_sents = pd.Series(sents).str.replace("[^A-Za-z]", " ").map(lambda s : s.lower())
clean_sents[:5]

0    maria sharapova has basically no friends as te...
1    the russian player has no problems in openly s...
2          i think everyone knows this is my job here 
3    when i m on the courts or when i m on the cour...
4                       i m a pretty competitive girl 
dtype: object

In [12]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

new_clean_sents = [" ".join([word for word in sent.split() if word not in stop_words]) 
                   for sent in clean_sents]
new_clean_sents[:5]

['maria sharapova basically friends tennis players wta tour',
 'russian player problems openly speaking recent interview said really hide feelings much',
 'think everyone knows job',
 'courts court playing competitor want beat every single person whether locker room across net one strike conversation weather know next minutes go try win tennis match',
 'pretty competitive girl']

In [13]:
## extract word vectors
import pickle

with open("/data/charley/wiki.en.pkl", 'rb') as f:
    w2v = pickle.load(f)

In [33]:
# cal sentences' vectors
def cal_sent_vec(sent):
    words_vec = []
    for word in sent.split():
        try:
            vec = w2v[word]
        except:
            vec = None
        if vec is not None:
            words_vec.append(vec)
    if words_vec:
        words_vec = np.mean(np.stack(words_vec, axis=0), axis=0)
    else:
        words_vec = np.zeros(300)
    return words_vec

sent_vectors = np.zeros((len(new_clean_sents), 300))
for i, sent in enumerate(new_clean_sents):
    sent_vectors[i] = cal_sent_vec(sent)

In [34]:
sent_vectors.shape

(119, 300)

In [45]:
# Similarity Matrix Preparation : cosine similarity approach
from sklearn.metrics.pairwise import cosine_similarity

n_sents = len(sent_vectors)
sim_mat = np.zeros((n_sents, n_sents))
print("Similarity Matrix shape : ", sim_mat.shape)

flag = 0 # should be 7140 = (119 * 119 + 119) / 2
for i in range(n_sents):
    for j in range(i, n_sents):
        flag += 1
        sim_mat[i, j] = cosine_similarity(sent_vectors[i].reshape(1,300), sent_vectors[j].reshape(1,300))[0, 0]
        sim_mat[j, i] = sim_mat[i, j]
print("Calculate times : ", flag)

Similarity Matrix shape :  (119, 119)
Calculate times :  7140


In [53]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)
print(scores)

{0: 0.008257385300905444, 1: 0.008552984173964814, 2: 0.007377628949334913, 3: 0.009500026954462613, 4: 0.007460904161930153, 5: 0.007950160271358296, 6: 0.008270276913897449, 7: 0.0077376501288717965, 8: 0.008382238679931207, 9: 0.007144537857265566, 10: 0.0012695725958021694, 11: 0.008794953892181242, 12: 0.00765210154388084, 13: 0.007838706343377553, 14: 0.008578805509609978, 15: 0.007731192394003882, 16: 0.00714876275318397, 17: 0.00885263832287492, 18: 0.008936251703270441, 19: 0.009237010737111515, 20: 0.009191967318032599, 21: 0.008185553727876263, 22: 0.008913683543301123, 23: 0.008879650290175928, 24: 0.008069469453047439, 25: 0.007355214444275351, 26: 0.009235649742231908, 27: 0.009162692421209984, 28: 0.009245687330435279, 29: 0.009392227773811499, 30: 0.009565760547092982, 31: 0.008833157794485645, 32: 0.0065201382758553784, 33: 0.007838593094264854, 34: 0.009278410185449353, 35: 0.00926975193811599, 36: 0.008693585838734436, 37: 0.009211432148630802, 38: 0.0092494919457673

In [54]:
# all scores == 1
sumv = 0
for k, v in scores.items():
    sumv += v
sumv

0.9999999999999998

In [58]:
# Summary Extraction
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sents)), reverse=True)

# Extract top 10 sentences as the summary
for i in range(10):
    print(i, '--', ranked_sentences[i][1])
    print()

0 -- Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.

1 -- When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.

2 -- But first the 20-time Grand Slam winner wants to train on the Paris Masters court this afternoon before deciding whether to appear for his opening match against either Milos Raonic or Jo-Wilfried Tsonga.

3 -- Currently in ninth place, Nishikori with a win could move to within 125 points of the cut for the eight-man event in London next month.

4 -- Roger Federer has revealed that organisers of the re-launched and condensed D