In [29]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
import pickle
from glob import glob
import datetime
import requests
import time
from bs4 import BeautifulSoup as bs
import re

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

import os
from time import sleep
from joblib import dump, load

# Project Gutenberg

In [30]:
# Content was scraped using a wget provided by the website

In [31]:
files = glob('../data/raw/Gutenberg/txt/*.txt')

book_df = pd.DataFrame(columns=['text', 'info', 'date', 'target'])

In [32]:
def collect_book_info(files):
    '''
    Input: list of files
    Output: first section of each file which contains book information
    '''
    book_info = []
    for file in files:
        open_file = open(file, 'r', encoding= "ISO-8859-1")
        read_file = open_file.read()[:300]
        read_file = read_file.replace(' ', '').split()
        book_info.append(read_file)

    return book_info


book_information = collect_book_info(files)

In [33]:
def collect_book_text(files):
    '''
    Input: list of files
    Output: portion of each file that contains excerpt
    '''
    book_text = []
    for file in files:
        open_file = open(file, 'r', encoding= "ISO-8859-1")
        read_file = open_file.read()[1500:2300].splitlines()
        book_text.append(read_file)

    return book_text

book_text = collect_book_text(files)

In [34]:
# Adding information to dataframe
book_df['info'] = book_information
book_df['text'] = book_text

In [38]:
def delete_half_words(raw_text):
    '''
    Input: column in df that has excerpts extracted from files
    Output: excerpts with first and last words dropped to elimiate half words
    '''
    words = str(raw_text).split()
    result = words[1:-2]
    return result

book_df['text'] = book_df['text'].apply(delete_half_words)

In [40]:
#Extracting Date Information
def create_date(book_info):
    date_info = []
    for info in book_information:
        info = str(info)
        info = re.findall(r'(\d{4})', info)
        date_info.append(info)

    return date_info

book_dates = create_date(book_information)
book_df['date'] = book_dates

book_df = book_df.replace('[]', np.nan)

In [42]:
book_df['date'] = book_df['date'].astype(str)

In [43]:
book_df['date'] = book_df['date'].str.strip('[]')
book_df['date'] = book_df['date'].str.strip("''")

In [44]:
mask = (book_df['date'].str.len() > 4)

len(mask)
book_df[mask] = book_df.loc[mask].date = 'NaN'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


# Stylist Magazine Article

In [49]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import re


url = 'http://stylist.co.uk/books/the-best-100-closing-lines-from-books/123681'
res = requests.get(url)
soup = bs(res.content, 'lxml')

In [50]:
books = []
for link in soup.find_all('h3', {'class': 'css-r1u8am'}):
    ta_dict = {}
    ta_dict['title'] = link.text
    books.append(ta_dict)

In [51]:
quotes = []
for link in soup.find_all('div', {'class': 'css-dbbd7o'}):
    tr_dict = {}
    tr_dict['quote'] = link.text
    quotes.append(tr_dict)

In [52]:
book_info = pd.DataFrame(books, quotes)
book_info = book_info.reset_index()

In [53]:
list_of_quotes = book_info['index'].tolist()

In [64]:
def extract_quote(list_of_quotes):
    '''
    Input: List of dictionaries
    Output: Unpacked dictionary, quotes (values) saved to list'''
    quotes = []
    num = len(list_of_quotes)
    for i in range(0, num):
        new_quote = list_of_quotes[i]['quote'].splitlines()[0]
        quotes.append(new_quote)
    
    return quotes


In [55]:
book_info['index'] = extract_quote(list_of_quotes)

In [56]:
book_info = book_info.rename(columns={'index': 'text'})

In [57]:
df = book_info[book_info['text'].map(len) > 100]

In [58]:
df = df.reset_index()

In [59]:
df = df.drop(columns='index')

In [60]:
df['target'] = 'NaN'
df['date'] = 'NaN'

In [None]:
df.to_csv('quote_scrape.csv', index=False)

# Reddit

In [None]:
# url = 'https://www.reddit.com/r/books/comments/1mqfzt/what_is_the_most_powerful_chapter_paragraph_or/.json'
# url = 'https://thoughtcatalog.com/charlie-shaw/2013/09/34-profound-excerpts-from-classic-literature-that-will-change-your-day/.json'
url = 'https://www.reddit.com/r/books/comments/35wv34/whats_the_most_beautiful_paragraph_or_sentence/.json'

In [None]:
headers = {'User-Agent': 'My User Agent 1.0'}

In [None]:
def fetch_page(url, after=''):
    params = {'after': after}
    response = requests.get(url, headers=headers, params=params)
    return response.json()[1]['data']['children']

In [None]:
def parse_post(post):
    keep = ['subreddit', 'title', 'body', 'name'] 
    return {k:v for k, v in post['data'].items() if k in keep}

In [None]:
def parse_page(page):
    after = ''
    posts = []
    for post in page:
        post = parse_post(post)
        after = post['name']
        posts.append(post)
    return posts, after

In [None]:
def fetch_subreddit(subreddit, pages=4):
    url = subreddit
    after = ''
    all_posts = []
    for i in range(pages):
        print(f'Fetching Page {i + 1}')
        page = fetch_page(url, after)
        posts, after = parse_page(page)
        all_posts.extend(posts)
        time.sleep(5)
    return all_posts

In [None]:
posts = fetch_subreddit(url, pages=50)

In [None]:
reddit_df = pd.DataFrame(posts)

In [None]:
reddit_df = reddit_df.dropna()
reddit_df = reddit_df.drop_duplicates()

In [None]:
def clean_posts(posts):
    new_posts = []
    for post in posts:
        post = post.replace('\n', '')
        new_posts.append(post)
        
    return new_posts

In [None]:
text = reddit_df['body'].tolist()

In [None]:
reddit_df['body'] = clean_posts(text)

In [66]:
def split_column(df, col):
    '''
    Input: dataframe and specified column
    Output: column split into two columns by the last existence of a dash (how a majority of redditor's split excerpt and book)'''
    new = df[col].str.rsplit('-', n=1, expand=True)
    df['text'] = new[0]
    df['info'] = new[1]
    
    return df

df = split_column(reddit_df, 'body')

In [None]:
df = df.dropna()

In [None]:
df2 = pd.read_csv('reddit_data.csv', index_col=0)

In [None]:
df = df.append(df2)

In [None]:
# df.to_csv('reddit_data.csv')

In [None]:
# df.to_csv('reddit_df.csv')

# Date Scraping (Selenium) 

In [67]:
df = pd.read_csv('../data/processed/reddit_data.csv', index_col=0)
# browser = webdriver.Firefox()

In [68]:
df = df.drop_duplicates(subset=['text'])

In [69]:
df2 = pd.read_csv('../data/processed/quote_scrape.csv')

In [70]:
df = df.drop(columns=['name', 'subreddit', 'body'])

In [71]:
df2 = df2.drop(columns=['target', 'date'])
df2 = df2.rename(columns={'title': 'info'})

In [72]:
df = df.append(df2)

In [73]:
df = df.reset_index()

In [74]:
df = df.drop(columns=['index'])

In [75]:
df = df.drop_duplicates(subset=['text'])

In [76]:
title_list = df['info'].tolist()

In [77]:
search_queries = []
for item in title_list:
    search_queries.append(f'when was {item} published?')

In [78]:
#Scrape for Google.com

dates  = []
browser = webdriver.Firefox()
for p in search_queries:
    browser.get('http://www.google.com')
    search = browser.find_element_by_name('q')
    search.send_keys(f'"{p}"')
    search.send_keys(Keys.RETURN) # hit return after you enter search text
    time.sleep(60) #sleep for 20 seconds
    
    try:
        result = browser.find_element_by_xpath('.//div[@class="Z0LcW"]') 
        result = result.get_attribute('innerHTML') #if date number exists, append to list
    except:
        result = 'NaN' #if date number does not exist, append "NaN"

    dates.append(result)
browser.quit()

# Googles selenium scraper dealt with recaptchas more often so the sleeps had to be longer, and it would need to be 
#stopped and restarted at the index spot where it left off. 

In [80]:
def extract_years(text):
    new_list_dates = []
    for t in text:
        date = re.findall(r'(\d{4})', t)
        new_list_dates.append(date)
                          
    return new_list_dates

In [81]:
df['google_dates'] = extract_years(dates)

In [85]:
df

Unnamed: 0,text,info,google_dates
0,"Really, the whole paragraph is good, but in pa...",A Farewell to Arms,[1929]
1,"""I looked at the stars, and considered how awf...","Dickens, Great Expectations",[1861]
2,"""As the days went by, the evolution of like in...",Jack London,[1916]
3,"""Where else? I belong to a lost generation and...","Umberto Eco, Foucault's Pendulum",[1988]
4,&gt;Have you ever been in love? Horrible isn't...,"The Sandman, Neil Gaiman",[]
5,"Some men are born mediocre, some men achieve m...",Catch 22,[1961]
6,Down there are people who will follow any drag...,The Patrician Vetinari,[]
7,This sentence has five words. Here are five mo...,Gary Provost,[]
8,e live in time - it holds us and molds us - bu...,Sense of an Ending by Julian Barnes,[2011]
9,"Out of the little grove, away from the baffled...","The Amber Spyglass, Phillip Paulman",[2000]


In [None]:
# Scrape for DuckDuckGo.com

In [98]:
dd_sq = []
for item in title_list:
    dd_sq.append(f'what date was {item} published?')

In [None]:
dd_date = []

browser = webdriver.Firefox()
for p in dd_sq:
    browser.get('http://www.duckduckgo.com')
    search = browser.find_element_by_xpath(".//input[@id='search_form_input_homepage']")
    search.send_keys(f'{p}')
    search.send_keys(Keys.RETURN) # hit return after you enter search text
    time.sleep(5) # sleep for 5 seconds 
    
    try:
        result = browser.find_element_by_xpath('.//span[@class="js-about-item-abstr"]')
        result = result.get_attribute('innerHTML')
    except:
        result = 'NaN'

    dd_date.append(result)

    
browser.quit()

In [None]:
dd_date
df['duck_dates'] = extract_years(dd_date)

In [None]:
#Scrape for Ask.com

ask_dates = []

browser = webdriver.Firefox()
for p in dd_sq:
    browser.get('http://www.ask.com')
    search = browser.find_element_by_name('q')
    search.send_keys(f'{p}')
    search.send_keys(Keys.RETURN) # hit return after you enter search text
    time.sleep(10) # sleep for 5 seconds so you can see the results
    
    try:
        result = browser.find_element_by_xpath('.//p[@class="PartialSearchResults-item-abstract"]')
        result = result.get_attribute('innerHTML')
    except:
        result = 'NaN'

    ask_dates.append(result)
browser.quit()


In [None]:
df['ask_dates'] = extract_years(ask_dates)

In [97]:
df.to_csv('../data/processed/reddit_df_with_scraped_dates.csv')

In [96]:
df.head()

Unnamed: 0,text,info,duck_dates,ask_dates,google_dates
0,"Really, the whole paragraph is good, but in pa...",A Farewell to Arms,['1929'],['1929'],[1929]
1,"""I looked at the stars, and considered how awf...","Dickens, Great Expectations",[],"['1860', '1861']",[1861]
2,"""As the days went by, the evolution of like in...",Jack London,[],['2019'],[1916]
3,"""Where else? I belong to a lost generation and...","Umberto Eco, Foucault's Pendulum",['1988'],"['1988', '1988', '1989']",[1988]
4,&gt;Have you ever been in love? Horrible isn't...,"The Sandman, Neil Gaiman",[],[],[]


# Online Liberty Fund (NonFiction)

In [None]:
url = 'https://oll.libertyfund.org/groups/44'
res = requests.get(url)

soup = bs(res.content, 'lxml')

In [None]:
slugs = []
links_for_books = []
list_of_sites = []
text_info = []

for link in soup.find_all('li'):
    l = link.find('a')
    slugs.append(l)

for row in slugs:
    r = str(row)
    link = re.findall(r'"(.*?)"', r)
    links_for_books.append(link)

for slug in links_for_books:
    slugname = str(slug[0])
    link = f'https://oll.libertyfund.org/titles{slugname}'
    print(f'scraping{link}')
       
    res = requests.get(link)
    soup = bs(res.content, 'lxml')
    words = soup.get_text()[1000:2000]
    text = words.strip()
    text = text.replace('\n', '')
    text_info.append(text)
    time.sleep(10)

In [None]:
olf_df = pd.DataFrame(text_info)
# olf_df.to_csv('olf.csv')