In [18]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

load_from_drive = True

#load the data from Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount='true')
csv_path = '/content/drive/My Drive/call_of_duty_articles.csv' 

nltk.download('stopwords')
stop_words = stopwords.words('english')


#need to add this encoding because different types
df = pd.read_csv(csv_path, encoding='latin1')

#now we need to take sentences from the text and store them
#then need to loop through each to remove them from different articles
sentences = []
sentences = [sent_tokenize(x) for x in df['article_text']]
sentences = [y for x in sentences for y in x]

#function to remove stopwords
def clean_stopwords(sent):
      sen_new = " ".join([i for i in sent if i not in stop_words])
      return sen_new

#this is the word embedding
txt_path = '/content/drive/My Drive/glove.6B.100d.txt' 
word_embeddings = {}
f = open(txt_path, encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

#"clean" the words to remove special characters, numbers
cleaned = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

#make it lower case
cleaned = [x.lower() for x in cleaned]

removed_stopwords = [clean_stopwords(r.split()) for r in cleaned]

#now we need to create vectors for sentences

sentence_vector = []

for sent in removed_stopwords:
  if len(sent) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,)))
                           for w in sent.split()])/(len(sent.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vector.append(v)

#similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])
for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vector[i].reshape(1,100), 
                                        sentence_vector[j].reshape(1,100))[0,0]

#to make this a graph
#nodes are sentences and edges are similarity scores
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), 
                          reverse=True)

#Extracts the top sentences as the summary
summary = 10
for i in range(summary):
  print(ranked_sentences[i][1])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Mounted at /content/drive
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
âI think it speaks to the strength of what weâre building and really gives us that confidence going into Season 2.â But when it came to the seasonâs eventual uptick, reaching a pre-playoffs peak of 115k viewers with the New York Home Series in July, Faries said it turned around when they were able to shift their focus from pivoting to remote play to the content itself.
âWhen we poured our focus and our creative muscle behind not only operating in a seamless way in light of COVID-19, but then really being able to optimize the fan experience with a content-first lens, you saw immediately the growth that we started to enjoy.â And now that CDL has crossed the finish line on its inaugural season, itâs already announced one bi