Given an URL, by the user
1) Fetch the textual contents from the webpage
2) Propose all possible cleaning and prepare a text file
3) Prepare a list of paragraphs, sentences, words, unique words, links

In [20]:
import requests
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [21]:
r = requests.get('https://en.wikipedia.org/wiki/Artemis_program')
soup = BeautifulSoup(r.content, 'html.parser')
s = soup.find('div', class_='mw-body-content') # finding out the main div and paragraphs
paras = s.find_all('p')

<Response [200]>


In [25]:
paragraphs = []
# re = re.compile(r'\s+', re.UNICODE)
for line in paras:
    text = re.sub(r'\[.*?\]', '', line.text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'[\\]+','', text)
    text = re.sub('\n','',text)
    if len(text) != 0:
        paragraphs.append(text)

In [26]:
paragraphs

["The Artemis program is a Moon exploration program that is led by the United States' NASA and was formally established in 2017 via Space Policy Directive 1. The Artemis program is intended to reestablish a human presence on the Moon for the first time since Apollo 17 in 1972. The program's stated long-term goal is to establish a permanent base on the Moon to facilitate human missions to Mars.",
 'Two principal elements of the Artemis program are derived from the now-cancelled Constellation program: the Orion spacecraft and the Space Launch System (as a reincarnation of Ares V). Other elements of the program, such as the Lunar Gateway space station and the Human Landing System, are in development by government space agencies and private spaceflight companies. This collaboration is bound together by the Artemis Accords and governmental contracts.',
 'The Space Launch System, Orion spacecraft and the Human Landing System form the main spaceflight infrastructure for Artemis, and the Lunar

In [27]:
with open('scrape.txt', 'w') as scrape:
    for line in paragraphs:
        scrape.write(line)

Finding links

In [34]:
soup = BeautifulSoup(r.content, 'html.parser') 
  
# find all the anchor tags with "href"
links = []
for link in soup.find_all('a'): 
    ans = re.findall(r'https?://\S+', str(link.get('href')))
    if len(ans) != 0:
        links.append(ans)

In [35]:
links

[['https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en'],
 ['https://af.wikipedia.org/wiki/Artemis-program'],
 ['https://ar.wikipedia.org/wiki/%D8%A8%D8%B1%D9%86%D8%A7%D9%85%D8%AC_%D8%A3%D8%B1%D8%AA%D9%8A%D9%85%D8%B3'],
 ['https://an.wikipedia.org/wiki/Programa_Artemis'],
 ['https://az.wikipedia.org/wiki/Artemida_proqram%C4%B1'],
 ['https://bn.wikipedia.org/wiki/%E0%A6%86%E0%A6%B0%E0%A7%8D%E0%A6%9F%E0%A7%87%E0%A6%AE%E0%A6%BF%E0%A6%B8_%E0%A6%AA%E0%A7%8D%E0%A6%B0%E0%A7%8B%E0%A6%97%E0%A7%8D%E0%A6%B0%E0%A6%BE%E0%A6%AE'],
 ['https://bg.wikipedia.org/wiki/%D0%90%D1%80%D1%82%D0%B5%D0%BC%D0%B8%D0%B4%D0%B0_(%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%B0)'],
 ['https://bs.wikipedia.org/wiki/Artemis_program'],
 ['https://br.wikipedia.org/wiki/Programm_Artemis'],
 ['https://ca.wikipedia.org/wiki/Programa_Artemis'],
 ['https://cv.wikipedia.org/wiki/%D0%90%D1%80%D1%82%D0%B5%D0%BC%D0%B8%D0%B4%D0%B0_(%D0%BA%

In [36]:
sentences = []
for lines in paragraphs:
    for line in lines.split('.'):
        if len(line) != 0:
            sentences.append(line)

sentences

["The Artemis program is a Moon exploration program that is led by the United States' NASA and was formally established in 2017 via Space Policy Directive 1",
 ' The Artemis program is intended to reestablish a human presence on the Moon for the first time since Apollo 17 in 1972',
 " The program's stated long-term goal is to establish a permanent base on the Moon to facilitate human missions to Mars",
 'Two principal elements of the Artemis program are derived from the now-cancelled Constellation program: the Orion spacecraft and the Space Launch System (as a reincarnation of Ares V)',
 ' Other elements of the program, such as the Lunar Gateway space station and the Human Landing System, are in development by government space agencies and private spaceflight companies',
 ' This collaboration is bound together by the Artemis Accords and governmental contracts',
 'The Space Launch System, Orion spacecraft and the Human Landing System form the main spaceflight infrastructure for Artemis,

In [37]:
words = []
for line in sentences:
    for word in line.split(' '):
        if len(word) != 0:
            words.append(word)
words

['The',
 'Artemis',
 'program',
 'is',
 'a',
 'Moon',
 'exploration',
 'program',
 'that',
 'is',
 'led',
 'by',
 'the',
 'United',
 "States'",
 'NASA',
 'and',
 'was',
 'formally',
 'established',
 'in',
 '2017',
 'via',
 'Space',
 'Policy',
 'Directive',
 '1',
 'The',
 'Artemis',
 'program',
 'is',
 'intended',
 'to',
 'reestablish',
 'a',
 'human',
 'presence',
 'on',
 'the',
 'Moon',
 'for',
 'the',
 'first',
 'time',
 'since',
 'Apollo',
 '17',
 'in',
 '1972',
 'The',
 "program's",
 'stated',
 'long-term',
 'goal',
 'is',
 'to',
 'establish',
 'a',
 'permanent',
 'base',
 'on',
 'the',
 'Moon',
 'to',
 'facilitate',
 'human',
 'missions',
 'to',
 'Mars',
 'Two',
 'principal',
 'elements',
 'of',
 'the',
 'Artemis',
 'program',
 'are',
 'derived',
 'from',
 'the',
 'now-cancelled',
 'Constellation',
 'program:',
 'the',
 'Orion',
 'spacecraft',
 'and',
 'the',
 'Space',
 'Launch',
 'System',
 '(as',
 'a',
 'reincarnation',
 'of',
 'Ares',
 'V)',
 'Other',
 'elements',
 'of',
 'the'

In [38]:
unique_words = set(words)
unique_words

{'target',
 'situ',
 'extravehicular',
 'Mars,',
 'shifts',
 'XL',
 'tested',
 'July',
 "doesn't",
 'chief',
 'After',
 'services',
 'up',
 'missions;',
 '20km',
 'twelve',
 '1,',
 'ground',
 'Inspector',
 'deployed',
 'vehicle,',
 'addition',
 'follow-on',
 'officials',
 'laying',
 'shot"',
 'Program',
 'The',
 'Obama-era',
 'partners',
 'an',
 'winning',
 'utilized',
 'Direct",',
 'Corps,',
 'Agency',
 'advance',
 'judge',
 'carrying',
 'Appropriations',
 'S',
 'PR,',
 'South',
 'EVA',
 'phase',
 'transport',
 'them',
 'projects,',
 'Japanese',
 'deliver',
 'graduated',
 'presented',
 'Office',
 'Outpost',
 'generating',
 'compatible',
 'permanent',
 'through',
 'agreement',
 'specified',
 'conducting',
 'Main',
 'ARM',
 'SLS',
 'System,',
 'lead',
 'delays',
 'Europe',
 'scientific',
 're-enter',
 'Redirect',
 'enough',
 'concerted,',
 'option',
 'cover',
 'depart',
 'further',
 'connection',
 'large',
 'Station',
 're-entered',
 'announcing',
 '(ARM)',
 'members',
 'degrees',
 'any

In [33]:
print(len(unique_words))

2062
