In [1]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

def _clean_document(text):
  """Strip non-ASCII characters, @mentions, punctuation and digits from `text`,
  lowercase the rest and collapse runs of whitespace into a single space."""
  text = re.sub(r'[^\x00-\x7F]+', ' ', text)
  text = re.sub(r'@\w+', '', text)
  text = text.lower()
  text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
  text = re.sub(r'[0-9]', '', text)
  text = re.sub(r'\s{2,}', ' ', text)
  return text


def retrieve_docs_and_clean(timeout=10):
  """Scrape the articles linked from the popular-news block of bola.kompas.com.

  The front page is fetched, every link inside the ``most__wrap`` block is
  followed, and the ``<p>`` texts found in each article's ``read__content``
  block are joined into one string per article and then cleaned.

  Args:
    timeout: Seconds to wait for each HTTP request before giving up. Without
      a timeout, ``requests`` can block indefinitely on a stalled connection.

  Returns:
    A list of cleaned article texts, one string per article that exposed a
    ``read__content`` block.

  Raises:
    requests.HTTPError: If the front page answers with an error status.
    RuntimeError: If the front page has no ``most__wrap`` block.
  """
  # Get the links of the popular news articles.
  r = requests.get('https://bola.kompas.com/', timeout=timeout)
  r.raise_for_status()
  soup = BeautifulSoup(r.content, 'html.parser')

  popular = soup.find('div', {'class': 'most__wrap'})
  if popular is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on NoneType.
    raise RuntimeError("no 'most__wrap' block found on https://bola.kompas.com/")

  links = []
  # href=True skips anchors that carry no href attribute at all.
  for anchor in popular.find_all('a', href=True):
    href = anchor['href']
    # `page=all` presumably requests the whole article on one page (TODO confirm);
    # use '&' when the link already carries a query string.
    separator = '&' if '?' in href else '?'
    links.append(href + separator + 'page=all')

  # Retrieve the paragraphs of every article.
  documents = []
  for url in links:
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html.parser')

    content = soup.find('div', {'class': 'read__content'})
    if content is None:
      # The page has no article body block: skip it rather than crash.
      continue
    documents.append(' '.join(p.text for p in content.find_all('p')))

  # Clean the paragraphs.
  return [_clean_document(d) for d in documents]

In [2]:
docs = retrieve_docs_and_clean()

# Create the document-term matrix with TF-IDF weighting.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
  feature_names = vectorizer.get_feature_names_out()
else:
  feature_names = vectorizer.get_feature_names()

# Create a DataFrame; X is transposed so rows are terms and columns are documents.
df = pd.DataFrame(X.T.toarray(), index=feature_names)
print(df.head())
print(df.shape)

               0         1         2         3         4         5         6  \
ac      0.000000  0.000000  0.000000  0.000000  0.309839  0.000000  0.000000   
acara   0.000000  0.000000  0.000000  0.000000  0.000000  0.028916  0.000000   
ad      0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.026721   
ada     0.054509  0.023772  0.070122  0.000000  0.026088  0.015523  0.000000   
adalah  0.021804  0.000000  0.023374  0.030743  0.000000  0.000000  0.000000   

               7        8         9  
ac      0.000000  0.00000  0.124364  
acara   0.000000  0.00000  0.000000  
ad      0.000000  0.00000  0.000000  
ada     0.000000  0.00000  0.019634  
adalah  0.027141  0.04256  0.058901  
(1443, 10)


In [3]:
# `docs`, `vectorizer`, `X` and `df` are already built by the previous cell.
# Scraping the site and refitting the vectorizer again here only repeated the
# HTTP requests, so this cell just renders the existing frame as a rich table.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
ac,0.0,0.0,0.0,0.0,0.309839,0.0,0.0,0.0,0.0,0.124364
acara,0.0,0.0,0.0,0.0,0.0,0.028916,0.0,0.0,0.0,0.0
ad,0.0,0.0,0.0,0.0,0.0,0.0,0.026721,0.0,0.0,0.0
ada,0.054509,0.023772,0.070122,0.0,0.026088,0.015523,0.0,0.0,0.0,0.019634
adalah,0.021804,0.0,0.023374,0.030743,0.0,0.0,0.0,0.027141,0.04256,0.058901


In [4]:
def get_similar_articles(q, df):
  """Print the documents most similar to a query, ranked by cosine similarity.

  The query is vectorised with the module-level ``vectorizer`` and compared
  with every column of ``df``. Documents with a non-zero score are printed in
  descending order of similarity; their text is read from the module-level
  ``docs`` list.

  Args:
    q: Query string.
    df: Term-document matrix. Rows are vocabulary terms, and the column at
      position ``i`` is the vector of ``docs[i]``.

  Returns:
    None. Results are written to stdout.
  """
  print("query:", q)
  print("Berikut artikel dengan nilai cosine similarity tertinggi: ")
  q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
  q_norm = np.linalg.norm(q_vec)

  sim = {}
  # Walk over however many documents the matrix holds instead of assuming 10.
  for i in range(df.shape[1]):
    doc_vec = df.iloc[:, i].values
    # cos(d, q) = d.q / (|d| * |q|). The product must be parenthesised;
    # `a / b * c` divides by |d| and then multiplies by |q|.
    denom = np.linalg.norm(doc_vec) * q_norm
    # A query with no known term (or an empty document) has zero norm:
    # score it 0 instead of dividing by zero and producing NaN.
    sim[i] = float(np.dot(doc_vec, q_vec) / denom) if denom > 0 else 0.0

  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)

  for k, v in sim_sorted:
    if v != 0.0:
      print("Nilai Similaritas:", v)
      print(docs[k])
      print()


# Sample queries to run against the scraped articles.
q1, q2, q3 = 'barcelona', 'gareth bale', 'shin tae yong'

for position, query in enumerate((q1, q2, q3)):
  # The separator goes between result sets only, never before the first one.
  if position > 0:
    print('-'*100)
  get_similar_articles(query, df)

query: barcelona
Berikut artikel dengan nilai cosine similarity tertinggi: 
----------------------------------------------------------------------------------------------------
query: gareth bale
Berikut artikel dengan nilai cosine similarity tertinggi: 
----------------------------------------------------------------------------------------------------
query: shin tae yong
Berikut artikel dengan nilai cosine similarity tertinggi: 
