In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
import re

In [2]:
# Load the tennis articles dataset from a CSV file in the working directory.
df = pd.read_csv("tennis_articles_v4.csv")

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(8, 3)

In [4]:
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [5]:
# Length (number of characters) of each article's text.
df["article_text"].apply(len)

0    1561
1    1331
2    2063
3    1341
4    2076
5    1545
6    1079
7    1833
Name: article_text, dtype: int64

In [24]:
! pip install iteration-utilities

Collecting iteration-utilities
[?25l  Downloading https://files.pythonhosted.org/packages/7a/be/cd57f7f8d337a30290187b39bceff717895b7963ba0816b3af3a01f17323/iteration_utilities-0.9.0-cp36-cp36m-macosx_10_14_x86_64.whl (91kB)
[K    100% |████████████████████████████████| 92kB 1.4MB/s ta 0:00:01
[?25hInstalling collected packages: iteration-utilities
Successfully installed iteration-utilities-0.9.0
[33mYou are using pip version 18.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


__Break the articles into sentences__

In [28]:
# Split every article into sentences, then flatten the per-article lists
# into a single list of sentences
from nltk.tokenize import sent_tokenize
from iteration_utilities import deepflatten
sentences_per_article = df["article_text"].apply(sent_tokenize)
sentences = list(deepflatten(sentences_per_article, depth=1))
sentences[0]

'Maria Sharapova has basically no friends as tennis players on the WTA Tour.'

__Let's get the GloVe word embeddings__

In [29]:
! wget http://nlp.stanford.edu/data/glove.6B.zip
! unzip glove*.zip

--2019-11-07 15:10:07--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-11-07 15:10:07--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2019-11-07 15:10:07--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-1

__Extracting word vectors__

In [32]:
# Parse the GloVe text file into a {word: vector} dictionary.
# Every line holds a token followed by its coefficients, separated by whitespace.
word_embeddings = {}
# The context manager closes the file even if parsing a line raises.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        # Everything after the token is the embedding, stored as float32.
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

__Apply NLP preprocessing steps (remove punctuation and stopwords, and convert all letters to lowercase)__

In [96]:
# Replace every character that is not an ASCII letter with a space, then
# lowercase the text and return a plain list of strings.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave the text
# unchanged because "[^a-zA-Z]" never occurs literally.
sents = (
    pd.Series(sentences)
    .str.replace("[^a-zA-Z]", " ", regex=True)
    .str.lower()
    .tolist()
)

In [97]:
from nltk.corpus import stopwords
# English stop-word list; remove_stopwords() filters tokens against it.
# NOTE(review): this needs the NLTK "stopwords" corpus to be available locally;
# confirm it is installed when running on a fresh environment.
stop_words = stopwords.words('english')

In [98]:
# helper that filters stop words out of a tokenised sentence
def remove_stopwords(sent):
    """Drop stop words from a sequence of tokens and join the rest.

    Parameters
    ----------
    sent : iterable of str
        Tokens of one sentence (the caller passes the result of ``str.split``).

    Returns
    -------
    str
        The tokens that are not in the module-level ``stop_words``, in their
        original order, joined by single spaces.
    """
    kept_tokens = []
    for token in sent:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [99]:
# split each cleaned sentence on whitespace and filter out the stop words
sents = [remove_stopwords(cleaned_sentence.split()) for cleaned_sentence in sents]

In [100]:
# Inspect the sentences after cleaning and stop-word removal.
sents

['maria sharapova basically friends tennis players wta tour',
 'russian player problems openly speaking recent interview said really hide feelings much',
 'think everyone knows job',
 'courts court playing competitor want beat every single person whether locker room across net one strike conversation weather know next minutes go try win tennis match',
 'pretty competitive girl',
 'say hellos sending players flowers well',
 'uhm really friendly close many players',
 'lot friends away courts',
 'said really close lot players something strategic',
 'different men tour women tour',
 '',
 'think sport mean friends everyone categorized tennis player going get along tennis players',
 'think every person different interests',
 'friends completely different jobs interests met different parts life',
 'think everyone thinks tennis players greatest friends',
 'ultimately tennis small part',
 'many things interested',
 'basel switzerland ap roger federer advanced th swiss indoors final career beati

In [101]:
# Build one vector per cleaned sentence: the sum of its word vectors divided
# by (number of words + 0.001). Words without an embedding count as a zero
# vector, and an empty sentence is represented by a zero vector.
sent_vecs = []
for cleaned_sentence in sents:
    if cleaned_sentence:
        tokens = cleaned_sentence.split()
        token_vectors = [word_embeddings.get(token, np.zeros((100,))) for token in tokens]
        sentence_vector = sum(token_vectors) / (len(tokens) + 0.001)
    else:
        sentence_vector = np.zeros((100,))
    sent_vecs.append(sentence_vector)

__Similarity Matrix created by cosine similarity__

In [104]:
# square matrix of zeros with one row and one column per sentence
n_sentences = len(sentences)
similarity_matrix = np.zeros((n_sentences, n_sentences))

In [105]:
# Cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# One vectorised call on the stacked sentence vectors replaces the nested
# loop, which called cosine_similarity once per ordered pair of sentences.
# The vectors are cast to float64 so the matrix keeps the dtype it had when
# it was filled element by element into an np.zeros array.
similarity_matrix = cosine_similarity(np.vstack(sent_vecs).astype(np.float64))
# A sentence is not compared with itself: the diagonal stays at zero.
np.fill_diagonal(similarity_matrix, 0.0)

__Apply PageRank__

In [106]:
# Create a graph from the similarity matrix
import networkx as nx

# Nodes are the row/column indices of similarity_matrix, i.e. sentence positions.
nx_graph = nx.from_numpy_array(similarity_matrix)
# PageRank yields a dict that maps each node (sentence index) to its score.
scores = nx.pagerank(nx_graph)

In [107]:
# Inspect the PageRank score assigned to every sentence index.
scores

{0: 0.008072651812837249,
 1: 0.008501993305467095,
 2: 0.007811931832998158,
 3: 0.009293791287246101,
 4: 0.007500319341702238,
 5: 0.008146814822101289,
 6: 0.008477413410840564,
 7: 0.008251000778010125,
 8: 0.008596957831776629,
 9: 0.00825714424529151,
 10: 0.0012695751770095795,
 11: 0.008860552382933833,
 12: 0.008083543253002173,
 13: 0.008156804670239782,
 14: 0.00844331689249351,
 15: 0.00855689304327121,
 16: 0.007812826598034563,
 17: 0.008071958126597788,
 18: 0.008406020981041979,
 19: 0.008847892197876472,
 20: 0.008860865272110666,
 21: 0.0074219170502842395,
 22: 0.00822343401476627,
 23: 0.008991766362928408,
 24: 0.008463970416187735,
 25: 0.0067018981235010605,
 26: 0.008232471583451702,
 27: 0.008913135597756963,
 28: 0.009061683003650822,
 29: 0.009093905759714915,
 30: 0.009244521571573308,
 31: 0.008994323928725814,
 32: 0.007236869161912383,
 33: 0.00870909302347104,
 34: 0.008919130492987544,
 35: 0.009097421441929054,
 36: 0.007715970772210733,
 37: 0.008883

__Extract the top 10 sentences as the summary__

In [108]:
# Rank the sentences and extract top 10 sentences as the summary
# Pair every original sentence with its PageRank score and sort from the
# highest score to the lowest.
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

# Slicing (instead of indexing with range(10)) avoids an IndexError when
# there are fewer than 10 sentences.
for score, sentence in ranked_sentences[:10]:
    print(sentence)

When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.
"I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments.
Currently in ninth place, Nishikori with a win could move to within 125 points of the cut for the eight-man event in London 