In [1]:
import re
from urllib.parse import urldefrag, urljoin

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_content_from_url(url:str, timeout:float = 30):
    """Download a web page and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The BeautifulSoup tree built from the response body with the
        'html.parser' backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without a timeout, requests can block forever on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page as page content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

    



In [2]:
def get_start_end(a_tags,
                  start_marker : str = "Boolean retrieval",
                  end_marker : str = "Link analysis"):
    """Locate the first items whose string form contains the two markers.

    Args:
        a_tags: Iterable of objects (e.g. anchor tags) that are compared
            through their ``str()`` representation.
        start_marker: Text identifying the first item of interest.
        end_marker: Text identifying the last item of interest.

    Returns:
        A ``(start_idx, end_idx)`` tuple holding the index of the first item
        containing each marker; an entry is ``None`` when its marker never
        appears.
    """
    start_idx = None
    end_idx = None

    for idx, tag in enumerate(a_tags):
        markup = str(tag)

        # Use `in` rather than `find(...) > 0`: find() returns 0 for a match
        # at the very beginning of the string, which `> 0` would reject.
        if start_idx is None and start_marker in markup:
            start_idx = idx

        if end_idx is None and end_marker in markup:
            end_idx = idx

        # Only first occurrences matter, so stop once both are known.
        if start_idx is not None and end_idx is not None:
            break
    return (start_idx, end_idx)

In [3]:
def get_context_url(tag : "bs4.element.Tag",
                    base_url : str
                    )-> tuple:
    """Return the link text of an anchor and the absolute URL it points to.

    Args:
        tag: Anchor element; its ``text`` and ``href`` attribute are read.
        base_url: URL of the page the anchor was found on, used to resolve
            relative links.

    Returns:
        A ``(contents, url)`` tuple: the anchor's text and its target URL
        resolved against ``base_url`` with any ``#fragment`` removed.

    Raises:
        ValueError: If the anchor has no (or an empty) ``href`` attribute.
    """
    contents = tag.text

    # Read the attribute directly instead of slicing the tag's string form:
    # searching for the first "html" substring truncates links such as
    # "xhtml-notes.html" and yields garbage when there is no href at all.
    href = tag.get("href")
    if not href:
        raise ValueError(f"anchor {contents!r} has no href attribute")

    # urljoin resolves relative links against the directory of base_url and
    # leaves already-absolute links untouched.
    absolute_url = urljoin(base_url, href)
    # Drop the fragment so the result always names the page itself.
    url = urldefrag(absolute_url)[0]

    return (contents, url)
    

In [4]:
def clear_context(content:str):
    """Reduce text to ASCII letters and digits separated by single spaces.

    Args:
        content: Raw text, e.g. the text extracted from an HTML page.

    Returns:
        The text with every character other than ``0-9``, ``a-z``, ``A-Z``
        and whitespace removed, whitespace runs collapsed to one space and
        leading/trailing whitespace stripped.
    """
    # Strip punctuation first: removing a free-standing symbol ("a - b")
    # leaves two adjacent spaces, so whitespace must be collapsed afterwards.
    content = re.sub(r"[^0-9a-zA-Z\s]", "", content)
    content = " ".join(content.split())
    return content
    

In [5]:
# base: download the table-of-contents page and collect every anchor on it
base_url = "https://nlp.stanford.edu/IR-book/html/htmledition/irbook.html"
soup = get_content_from_url(base_url)
a_tags = soup.find_all("a")


# target: keep only the anchors from the first to the last chapter marker
start_idx, end_idx = get_start_end(a_tags)
# get_start_end returns None for a marker it cannot find; fail with a clear
# message here instead of an opaque TypeError from `end_idx + 1`.
if start_idx is None or end_idx is None or end_idx < start_idx:
    raise ValueError(
        f"could not locate the chapter range in {base_url} "
        f"(start_idx={start_idx}, end_idx={end_idx})"
    )
target_tags = a_tags[start_idx : end_idx + 1]
target_context_url = [get_context_url(tag, base_url) for tag in target_tags]

print(target_context_url[0])

('Boolean retrieval', 'https://nlp.stanford.edu/IR-book/html/htmledition/boolean-retrieval-1.html')


In [28]:
# keyword
from keybert import KeyBERT

kw_model = KeyBERT()

# Maps each chapter title to the list of keywords extracted from its page.
results = {}
for chapter, url in target_context_url:
    # Fetch the chapter page and normalise its text before extraction.
    content = clear_context(get_content_from_url(url).text)

    # get keywords: keep only the first element of every returned item
    keywords = [item[0] for item in kw_model.extract_keywords(content, top_n = 10)]

    # save
    results[chapter] = keywords

In [43]:
# Show the chapter titles next to the keyword lists collected for them.
(list(results), list(results.values()))

(['Boolean retrieval',
  'The term vocabulary and postings lists',
  'Dictionaries and tolerant retrieval',
  'Index construction',
  'Index compression',
  'Scoring, term weighting and the vector space model',
  'Computing scores in a complete search system',
  'Evaluation in information retrieval',
  'Relevance feedback and query expansion',
  'XML retrieval',
  'Probabilistic information retrieval',
  'Language models for information retrieval',
  'Text classification and Naive Bayes',
  'Vector space classification',
  'Support vector machines and machine learning on documents',
  'Flat clustering',
  'Hierarchical clustering',
  'Matrix decompositions and latent semantic indexing',
  'Web search basics',
  'Web crawling and indexes',
  'Link analysis'],
 [['retrieval',
   'information',
   'documents',
   'searching',
   'queries',
   'searchers',
   'search',
   'data',
   'indexing',
   'database'],
  ['tokenization',
   'indexing',
   'indexes',
   'linguistic',
   'stemming',


In [47]:
# One row per chapter: its title and the keyword list extracted for it.
df = pd.DataFrame(
    {
        "csv_file": list(results),
        "keywords": list(results.values()),
    }
)
df.head()

Unnamed: 0,csv_file,keywords
0,Boolean retrieval,"[retrieval, information, documents, searching,..."
1,The term vocabulary and postings lists,"[tokenization, indexing, indexes, linguistic, ..."
2,Dictionaries and tolerant retrieval,"[retrieval, dictionaries, searching, search, q..."
3,Index construction,"[indexing, indexers, indexes, indexer, index, ..."
4,Index compression,"[compression, compressed, retrieval, decompres..."


In [50]:
# Persist the table without the row index column.
df.to_csv("Stanford_NLP_df.csv", index=False)

In [49]:
# IPython shell escape listing the working directory — presumably a manual
# check that Stanford_NLP_df.csv was written; safe to drop from a final notebook.
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 Bert.ipynb
 Concept-Extraction
 content.csv
 correlations.csv
 csv2wiki.pickle
 csv_dict.pickle
 csv_keywords_df.csv
 csv_wiki_graph
 csv_wiki_graph.pickle
 data.pickle
 Deepwalk.ipynb
 Deepwalk_practice.ipynb
 Embedding
 embedding.pickle
'(Final)Linkprediction.ipynb'
 gyuseok
 Kaggle.ipynb
 learning-equality-curriculum-recommendations.zip
 Link_Prediction2.ipynb
 LinkPrediction3.ipynb
 Link_Prediction.ipynb
 MetaPath2Vec.ipynb
 Preprocess.ipynb
 python
 sample_submission.csv
 Spider
 Stanford_NLP_df.csv
 topics.csv
 Untitled1.ipynb
 Untitled2.ipynb
 Untitled.ipynb
 video2graph.ipynb
 WebScraper.ipynb
 weighted_link_prediction.ipynb
 wiki2csv.pickle
 wiki_dict.pickle
 Wikipedia.ipynb
 Wikipedia.py
