In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

import pandas as pd
import re


from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
def scraping(url):
    options = Options()
    options.headless = True
    options.add_argument("--window-size=1920,1200")
    driver = webdriver.Chrome(options=options, executable_path='/usr/local/bin')

    driver.get(url)
    paragraphs = driver.find_elements(By.TAG_NAME, 'p')
    lst = []
    for p in paragraphs:
        lst.append(p.text)
    return lst
    driver.quit()

In [7]:
url = 'https://www.gutenberg.org/cache/epub/16389/pg16389-images.html'

In [8]:
article_lst = scraping(url)

  options.headless = True
  driver = webdriver.Chrome(options=options, executable_path='/usr/local/bin')
Error getting version of chromedriver 115. Retrying with chromedriver 114 (attempt 1/5)


In [9]:
len(article_lst)

1569

In [10]:
df = pd.DataFrame(data = article_lst, columns= ['original_doc'])

In [11]:
df.head()

Unnamed: 0,original_doc
0,Title: The Enchanted April
1,Author: Elizabeth Von Arnim
2,"Release date: July 29, 2005 [eBook #16389]\nMo..."
3,Language: English
4,Credits: Manette Rothermel


In [12]:
df['doc_lower'] = df['original_doc'].apply(lambda x: x.lower())
df.head()

Unnamed: 0,original_doc,doc_lower
0,Title: The Enchanted April,title: the enchanted april
1,Author: Elizabeth Von Arnim,author: elizabeth von arnim
2,"Release date: July 29, 2005 [eBook #16389]\nMo...","release date: july 29, 2005 [ebook #16389]\nmo..."
3,Language: English,language: english
4,Credits: Manette Rothermel,credits: manette rothermel


In [13]:
def extract_words(text):
    pattern = r'[A-Za-z0-9]+'
    matches = re.findall(pattern, text)
    extracted_words = ' '.join(matches)
    
    return extracted_words

df['doc_words'] = df['doc_lower'].apply(lambda x: extract_words(x))
df.head()

Unnamed: 0,original_doc,doc_lower,doc_words
0,Title: The Enchanted April,title: the enchanted april,title the enchanted april
1,Author: Elizabeth Von Arnim,author: elizabeth von arnim,author elizabeth von arnim
2,"Release date: July 29, 2005 [eBook #16389]\nMo...","release date: july 29, 2005 [ebook #16389]\nmo...",release date july 29 2005 ebook 16389 most rec...
3,Language: English,language: english,language english
4,Credits: Manette Rothermel,credits: manette rothermel,credits manette rothermel


In [14]:
df['doc_tokenized'] = df['doc_words'].apply(lambda x: word_tokenize(x))
df.head()

Unnamed: 0,original_doc,doc_lower,doc_words,doc_tokenized
0,Title: The Enchanted April,title: the enchanted april,title the enchanted april,"[title, the, enchanted, april]"
1,Author: Elizabeth Von Arnim,author: elizabeth von arnim,author elizabeth von arnim,"[author, elizabeth, von, arnim]"
2,"Release date: July 29, 2005 [eBook #16389]\nMo...","release date: july 29, 2005 [ebook #16389]\nmo...",release date july 29 2005 ebook 16389 most rec...,"[release, date, july, 29, 2005, ebook, 16389, ..."
3,Language: English,language: english,language english,"[language, english]"
4,Credits: Manette Rothermel,credits: manette rothermel,credits manette rothermel,"[credits, manette, rothermel]"


In [15]:
def stop_words_rm(word_tokens):
    own_stop = ['a', 'href', 'https', 'www', 'youtube','com']
    stop_words = stopwords.words('english') + own_stop
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence

df['doc_stopwords'] = df['doc_tokenized'].apply(lambda x: stop_words_rm(x))
df.head()

Unnamed: 0,original_doc,doc_lower,doc_words,doc_tokenized,doc_stopwords
0,Title: The Enchanted April,title: the enchanted april,title the enchanted april,"[title, the, enchanted, april]","[title, enchanted, april]"
1,Author: Elizabeth Von Arnim,author: elizabeth von arnim,author elizabeth von arnim,"[author, elizabeth, von, arnim]","[author, elizabeth, von, arnim]"
2,"Release date: July 29, 2005 [eBook #16389]\nMo...","release date: july 29, 2005 [ebook #16389]\nmo...",release date july 29 2005 ebook 16389 most rec...,"[release, date, july, 29, 2005, ebook, 16389, ...","[release, date, july, 29, 2005, ebook, 16389, ..."
3,Language: English,language: english,language english,"[language, english]","[language, english]"
4,Credits: Manette Rothermel,credits: manette rothermel,credits manette rothermel,"[credits, manette, rothermel]","[credits, manette, rothermel]"


In [None]:
def lem(stwd_rm_lst):
    lem_sent = []
    wnl = WordNetLemmatizer()
    for word in stwd_rm_lst:
        word = wnl.lemmatize(word, pos="n")
        lem_sent.append(word) 
    return lem_sent


df['doc_lem'] = df['doc_stopwords'].apply(lambda x: lem(x))
df.head()

In [35]:
df['document_processed'] = df['doc_lem'].apply(lambda x: " ".join(x))
df = df[['document_processed']].reset_index(drop = True)
df.head()

Unnamed: 0,document_processed
0,title familiar talk science world building lif...
1,author elisha gray
2,release date august 11 2010 ebook 33405
3,language english
4,credit produced chris curnow josephine paolucc...


In [36]:
df.to_csv('the_enchanted_april.csv')