In [29]:
import requests
from bs4 import BeautifulSoup as bsoup

"""This first section collects the data that we will use to train our model. Essentially we start with the URL below as our starting point and we scrape all the URLs on the given wikipedia page.
This gives us a long list of URls each correspodning to a wikipedia page. From there we scrape all the links of each of those pages. Then we scrape the text from each page and tokenize the text down to individual sentences.
The list of sentences is the training data we will use for our model"""

def collect_urls(url):
    response = requests.get(url)
    soup = bsoup(response.text, 'lxml')
    body = soup.find(id="bodyContent") #grab the body of the wiki page
    paragraphs = body.find_all('p') #select all the paragraphs in the body

    valid_urls = [] #create empty list to store URLs
    for paragraph in paragraphs: #iterate through each paragraph
        for link in paragraph.find_all('a'): #find all links
            href = link.get('href')
            if href is not None and 'https' not in href and '#cite_note' not in href: #we only want links that go to other wiki pages, which will only cite the extension, not any new link starting with http
                url = 'https://en.m.wikipedia.org'+href #append the link to the starter URL identified above
                valid_urls.append(url) #append the url the empty list 

    return list(set(valid_urls))

url = "https://en.wikipedia.org/wiki/History_of_the_United_States"

base_page_urls = collect_urls(url) #collect urls from starting page 

all_urls = [] #create new list to store all the urls
all_urls.extend(base_page_urls) #extend the base page urls to the new list 

for url in base_page_urls: #iterate through the base page urls and pass each one to the function abovee
    new_urls = collect_urls(url)
    all_urls.extend(new_urls) #extend the urls from each page from the base page urls to the list of all urls 


all_urls = list(set(all_urls)) #get rid of duplicates

print(len(all_urls)) #this is ~105,000 urls 

104503


In [35]:
#iterate through all the URLs we scraped and grab all the text from each page, tokenizing the text into sentences
import pandas as pd
# urls = pd.read_csv('/Users/loganbarger/Documents/University of Denver/Data Science Tools 1/DSTFinalProject/url_data.csv')
# all_urls = urls['URLs'].tolist()
#THIS WILL TAKE ~10 HOURS TO RUN
# data = []
# for i in range(0,len(all_urls)):
#     print(i)
#     temp_url = all_urls[i]

#     temp_response = requests.get(temp_url)

#     temp_soup = bsoup(temp_response.text, 'lxml')
#     temp_body = temp_soup.find(id='bodyContent')
#     temp_paragraphs = temp_body.find_all('p')

#     for paragraph in temp_paragraphs:
#         sentences = sent_tokenize(paragraph.text) #tokenize the paragraph into sentences
#         for sentence in sentences:
#             data.append(sentence) #append each sentence to the data list 

In [36]:
"""In this section we will be cleaning and processing our data. We will be removing grammar that isn't needed, stopwords, lemmatizing the words in the sentences, tagging words etc."""
import pandas as pd
test_df = pd.read_csv('/Users/loganbarger/Documents/University of Denver/Data Science Tools 1/DSTFinalProject/project_data.csv', index_col='index')
# test_df = pd.DataFrame({'original_text': data}) #create df to store our data
pd.set_option('display.max_colwidth', None)
test_df.head()

Unnamed: 0_level_0,original_text
index,Unnamed: 1_level_1
0,The causes of the Great Depression in the early 20th century in the United States have been extensively discussed by economists and remain a matter of active debate.
1,[1] They are part of the larger debate about economic crises and recessions.
2,The specific economic events that took place during the Great Depression are well established.
3,"There was an initial stock market crash that triggered a ""panic sell-off"" of assets."
4,"This was followed by a deflation in asset and commodity prices, dramatic drops in demand and the total quantity of money in the economy, and disruption of trade, ultimately resulting in widespread unemployment (over 13 million people were unemployed by 1932) and impoverishment."


In [37]:
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
import spacy

nltk.download("words") 
nltk.download('punkt')
nltk.download('wordnet')

# ! python3 -m download en_core_web_sm

#remove footnote citations and \n from text
def remove_patterns(text):
    text = re.sub(r'\[\d+\]\s*', '', text)
    text = re.sub(r'\n', '', text)
    return text

#remove stopwords
def remove_stopwords(text): #this fucntion removes stopwords 
    stop_words = set(stopwords.words('english'))
    words = text.split()

    non_stopwords = [word for word in words if word.lower() not in stop_words]

    new_text = str(' '.join(non_stopwords))
    return new_text

def lemmatization(docs):
    processed_docs = list(nlp.pipe(docs)) #process the list that is passed
    
    lemmatized_sentences = [] #initialize a list to store results
    
    for doc in processed_docs: #iterate through the list that has been processed 
        lemmatized_words = [token.lemma_ for token in doc if not token.is_punct and not token.is_space] #lemmatize and make new sentence with lemmatized version of words
        lemmatized_sentence = ' '.join(lemmatized_words)
        lemmatized_sentences.append(lemmatized_sentence)
    
    return lemmatized_sentences

#First we will remove unwanted punctuation, then we will lemmatize, then we will remove stopwords

test_df['original_text'] = test_df['original_text'].apply(remove_patterns) #remove unwanted punctuation 

#removing footnotes made some of the values blank, we will get rid of those now
test_df = test_df[test_df['original_text'] != '']
test_df = test_df[test_df['original_text'] != '"'] #because some quotes are longer than a sentence, when the quote is tokenized, there is a dangling " left over
test_df = test_df.reset_index(drop=True) #reset index

#lemmatize the words
nlp = spacy.load('en_core_web_sm', disable=['ner']) #load spacy's english language model
list_of_data = test_df['original_text'].tolist() #create a list of the text to be lemmatized


lemmatized_list = lemmatization(list_of_data) #passing a list to this function is a little faster than using the apply method
test_df['lemmatized'] = lemmatized_list 

#remove stopwords
test_df['no_stopwords'] = test_df['lemmatized'].apply(remove_stopwords) #create new column that has no stopwords

test_df.head()

[nltk_data] Downloading package words to
[nltk_data]     /Users/loganbarger/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/loganbarger/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/loganbarger/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,original_text,lemmatized,no_stopwords
0,The causes of the Great Depression in the early 20th century in the United States have been extensively discussed by economists and remain a matter of active debate.,the cause of the Great Depression in the early 20th century in the United States have be extensively discuss by economist and remain a matter of active debate,cause Great Depression early 20th century United States extensively discuss economist remain matter active debate
1,They are part of the larger debate about economic crises and recessions.,they be part of the large debate about economic crisis and recession,part large debate economic crisis recession
2,The specific economic events that took place during the Great Depression are well established.,the specific economic event that take place during the great Depression be well establish,specific economic event take place great Depression well establish
3,"There was an initial stock market crash that triggered a ""panic sell-off"" of assets.",there be an initial stock market crash that trigger a panic sell off of asset,initial stock market crash trigger panic sell asset
4,"This was followed by a deflation in asset and commodity prices, dramatic drops in demand and the total quantity of money in the economy, and disruption of trade, ultimately resulting in widespread unemployment (over 13 million people were unemployed by 1932) and impoverishment.",this be follow by a deflation in asset and commodity price dramatic drop in demand and the total quantity of money in the economy and disruption of trade ultimately result in widespread unemployment over 13 million people be unemployed by 1932 and impoverishment,follow deflation asset commodity price dramatic drop demand total quantity money economy disruption trade ultimately result widespread unemployment 13 million people unemployed 1932 impoverishment
5,"However, economists and historians have not reached a consensus on the causal relationships between various events and government economic policies in causing or ameliorating the Depression.",however economist and historian have not reach a consensus on the causal relationship between various event and government economic policy in cause or ameliorate the Depression,however economist historian reach consensus causal relationship various event government economic policy cause ameliorate Depression
6,Current mainstream theories may be broadly classified into two main points of view.,current mainstream theory may be broadly classify into two main point of view,current mainstream theory may broadly classify two main point view
7,"The first are the demand-driven theories, from Keynesian and institutional economists who argue that the depression was caused by a widespread loss of confidence that led to drastically lower investment and persistent underconsumption.",the first be the demand drive theory from keynesian and institutional economist who argue that the depression be cause by a widespread loss of confidence that lead to drastically low investment and persistent underconsumption,first demand drive theory keynesian institutional economist argue depression cause widespread loss confidence lead drastically low investment persistent underconsumption
8,"The demand-driven theories argue that the financial crisis following the 1929 crash led to a sudden and persistent reduction in consumption and investment spending, causing the depression that followed.",the demand drive theory argue that the financial crisis follow the 1929 crash lead to a sudden and persistent reduction in consumption and investment spending cause the depression that follow,demand drive theory argue financial crisis follow 1929 crash lead sudden persistent reduction consumption investment spending cause depression follow
9,"Once panic and deflation set in, many people believed they could avoid further losses by keeping clear of the markets.",once panic and deflation set in many people believe they could avoid further loss by keep clear of the market,panic deflation set many people believe could avoid loss keep clear market


In [78]:
lemmatized_data = pd.read_csv('/Users/loganbarger/Documents/University of Denver/Data Science Tools 1/DSTFinalProject/lemmatized_data.csv')

print(lemmatized_data.dtypes)

original_text                  object
lemmatized                     object
no_stopwords                   object
avg_word_len_with_stopwords     int64
avg_word_len_no_stopwords       int64
dtype: object


In [79]:
"""This section is the EDA of our text data. We will look at word frequency, sentence length frequency, average sentence length, word length frequency, average word length, 
what percent of the sentences are stopwords, visualizing most common n-grams, creating a wordcloud, sentiment analysis using textblob, visualizing tagging frequencies, 
exploring sentence complexity etc"""

#Some sentences when lemmatized turn into only stop words, so when stopwords are removed, a blank value is given to the 'no_stopwords' column, lets remove those 
lemmatized_data = lemmatized_data[lemmatized_data['no_stopwords'] != '']
lemmatized_data = lemmatized_data.dropna(subset=['no_stopwords'])
lemmatized_data = lemmatized_data.reset_index(drop=True) #reset index

def count_words(text):
    text = text.split()
    return len(text)
lemmatized_data['word_count'] = lemmatized_data['original_text'].apply(count_words)

def avg_word_len(text):
    string = text.split()
    if len(string) == 0:
        return 0
    return round(sum(len(word) for word in string) / len(string))

lemmatized_data['avg_word_len_with_stopwords'] = lemmatized_data['original_text'].apply(avg_word_len)
lemmatized_data['avg_word_len_no_stopwords'] = lemmatized_data['no_stopwords'].apply(avg_word_len)

#Let's look at what percent of the sentences are stopwords
def percent_stopwords(text):

    return

#Let's visualize what we have found so far 
lemmatized_data.head(60)




Unnamed: 0,original_text,lemmatized,no_stopwords,avg_word_len_with_stopwords,avg_word_len_no_stopwords,word_count
0,The causes of the Great Depression in the early 20th century in the United States have been extensively discussed by economists and remain a matter of active debate.,the cause of the Great Depression in the early 20th century in the United States have be extensively discuss by economist and remain a matter of active debate,cause Great Depression early 20th century United States extensively discuss economist remain matter active debate,5,7,28
1,They are part of the larger debate about economic crises and recessions.,they be part of the large debate about economic crisis and recession,part large debate economic crisis recession,5,6,12
2,The specific economic events that took place during the Great Depression are well established.,the specific economic event that take place during the great Depression be well establish,specific economic event take place great Depression well establish,6,6,14
3,"There was an initial stock market crash that triggered a ""panic sell-off"" of assets.",there be an initial stock market crash that trigger a panic sell off of asset,initial stock market crash trigger panic sell asset,5,6,14
4,"This was followed by a deflation in asset and commodity prices, dramatic drops in demand and the total quantity of money in the economy, and disruption of trade, ultimately resulting in widespread unemployment (over 13 million people were unemployed by 1932) and impoverishment.",this be follow by a deflation in asset and commodity price dramatic drop in demand and the total quantity of money in the economy and disruption of trade ultimately result in widespread unemployment over 13 million people be unemployed by 1932 and impoverishment,follow deflation asset commodity price dramatic drop demand total quantity money economy disruption trade ultimately result widespread unemployment 13 million people unemployed 1932 impoverishment,5,7,43
5,"However, economists and historians have not reached a consensus on the causal relationships between various events and government economic policies in causing or ameliorating the Depression.",however economist and historian have not reach a consensus on the causal relationship between various event and government economic policy in cause or ameliorate the Depression,however economist historian reach consensus causal relationship various event government economic policy cause ameliorate Depression,6,8,26
6,Current mainstream theories may be broadly classified into two main points of view.,current mainstream theory may be broadly classify into two main point of view,current mainstream theory may broadly classify two main point view,5,6,13
7,"The first are the demand-driven theories, from Keynesian and institutional economists who argue that the depression was caused by a widespread loss of confidence that led to drastically lower investment and persistent underconsumption.",the first be the demand drive theory from keynesian and institutional economist who argue that the depression be cause by a widespread loss of confidence that lead to drastically low investment and persistent underconsumption,first demand drive theory keynesian institutional economist argue depression cause widespread loss confidence lead drastically low investment persistent underconsumption,6,8,33
8,"The demand-driven theories argue that the financial crisis following the 1929 crash led to a sudden and persistent reduction in consumption and investment spending, causing the depression that followed.",the demand drive theory argue that the financial crisis follow the 1929 crash lead to a sudden and persistent reduction in consumption and investment spending cause the depression that follow,demand drive theory argue financial crisis follow 1929 crash lead sudden persistent reduction consumption investment spending cause depression follow,6,7,29
9,"Once panic and deflation set in, many people believed they could avoid further losses by keeping clear of the markets.",once panic and deflation set in many people believe they could avoid further loss by keep clear of the market,panic deflation set many people believe could avoid loss keep clear market,5,5,20


In [26]:
"""In this section we will create our model and test to see how well it works"""


271980
271980
