# Web Scraper Regular Expression Project

In [25]:
# Import necessary libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# Define the URL of the webpage containing the speech
url = 'http://www.analytictech.com/mb021/mlk.htm'
# Seconds to wait for the server before giving up; requests applies no timeout by
# default, so without this the cell could hang indefinitely on an unresponsive host
REQUEST_TIMEOUT_SECONDS = 30
# Make an HTTP GET request to fetch the content of the webpage
page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
# Stop here with an HTTPError on a 4xx/5xx response instead of silently parsing
# (and later word-counting) an error page
page.raise_for_status()
# Parse the fetched webpage content into a BeautifulSoup object for easy HTML parsing.
# NOTE(review): 'html' names a markup type rather than a specific parser, so
# BeautifulSoup picks whichever HTML parser is installed; pin a specific parser if
# results must be identical across environments.
soup = BeautifulSoup(page.text, 'html')


# Find all paragraph tags in the HTML; on this page they hold the speech text
mlk_speech = soup.find_all('p')
# Extract the text from each paragraph tag and create a list of paragraphs
speech_combined = [p.text for p in mlk_speech]
# Combine the list of paragraphs into a single string with spaces in between
string_speech = ' '.join(speech_combined)
# Replace the Windows line endings (\r\n) that wrap lines inside paragraphs with spaces
string_speech_cleaned = string_speech.replace('\r\n',' ')
# Remove punctuation: drop every character that is neither a word character nor whitespace.
# Apostrophes and hyphens are removed too, so the two halves of such words are fused.
speech_no_punt = re.sub(r'[^\w\s]','',string_speech_cleaned)
# Convert the speech text to lowercase to ensure consistent case for word counting
speech_lower = speech_no_punt.lower()
# Break the speech into words by collecting every run of non-whitespace characters.
# Unlike re.split(r'\s+', ...), this never produces an empty-string token when the
# text starts or ends with whitespace, so no phantom '' "word" reaches the counts.
speech_broken_out = re.findall(r'\S+', speech_lower)


# Load the word list into a single-column DataFrame so pandas can tally it
word_frame = pd.DataFrame(speech_broken_out)
# Count the occurrences of each unique word. value_counts() returns a Series
# (word -> count, most frequent first), so `df` is a Series despite its name.
df = word_frame.value_counts()
# NOTE(review): absolute, machine-specific output location; the directory must
# already exist on the machine running this notebook.
csv_path = r'/Users/joshuacaldwell/Documents/python_tut/crypto_web_puller/mlkj_speech_counts.csv'
# Write the counts to CSV with 'Word' labelling the index column and 'Counts' the value column
df.to_csv(csv_path, index_label='Word', header=['Counts'])


# Text Transformation Visuals

In [21]:
# Inspect the parsed page: the bare BeautifulSoup object, as the cell's last
# expression, is displayed as the document's HTML markup.
soup

<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="Microsoft FrontPage 4.0" name="GENERATOR"/>
<title>Martin Luther King Jr.'s 1962 Speech</title>
</head>
<body alink="#FF0000" bgcolor="#FFFFFF" link="#0000FF" text="#000000" vlink="#551A8B">
<h1><font size="5">Transcript of speech by </font><br/>
Dr. Martin Luther King Jr. <br/>
August 28, 1963. Lincoln Memorial in Washington D.C. </h1>
<hr color="#008080" noshade="" size="5"/>
<p>I am happy to join with you today in what will go down in
history as the greatest demonstration for freedom in the history
of our nation. </p>
<p>Five score years ago a great American in whose symbolic shadow
we stand today signed the Emancipation Proclamation. This
momentous decree came as a great beckoning light of hope to
millions of Negro slaves who had been seared in the flames of
withering injustice. It came as a joyous daybreak to end the long
night of their c

In [22]:
# Inspect the speech text after punctuation removal (original letter case is still intact here).
speech_no_punt

'I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation  Five score years ago a great American in whose symbolic shadow we stand today signed the Emancipation Proclamation This momentous decree came as a great beckoning light of hope to millions of Negro slaves who had been seared in the flames of withering injustice It came as a joyous daybreak to end the long night of their captivity  But one hundred years later the Negro is still not free One hundred years later the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination  One hundred years later the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity  One hundred years later the Negro is still languishing in the comers of American society and finds himself in exile in his own land  We all have come to this hallowed spot to remind America of the fierce urgency of

In [23]:
# Preview the lowercased speech broken into a list of words on runs of whitespace.
# Note: re.split can return '' as the first/last item when the text starts/ends with whitespace.
re.split(r'\s+',speech_lower)

['i',
 'am',
 'happy',
 'to',
 'join',
 'with',
 'you',
 'today',
 'in',
 'what',
 'will',
 'go',
 'down',
 'in',
 'history',
 'as',
 'the',
 'greatest',
 'demonstration',
 'for',
 'freedom',
 'in',
 'the',
 'history',
 'of',
 'our',
 'nation',
 'five',
 'score',
 'years',
 'ago',
 'a',
 'great',
 'american',
 'in',
 'whose',
 'symbolic',
 'shadow',
 'we',
 'stand',
 'today',
 'signed',
 'the',
 'emancipation',
 'proclamation',
 'this',
 'momentous',
 'decree',
 'came',
 'as',
 'a',
 'great',
 'beckoning',
 'light',
 'of',
 'hope',
 'to',
 'millions',
 'of',
 'negro',
 'slaves',
 'who',
 'had',
 'been',
 'seared',
 'in',
 'the',
 'flames',
 'of',
 'withering',
 'injustice',
 'it',
 'came',
 'as',
 'a',
 'joyous',
 'daybreak',
 'to',
 'end',
 'the',
 'long',
 'night',
 'of',
 'their',
 'captivity',
 'but',
 'one',
 'hundred',
 'years',
 'later',
 'the',
 'negro',
 'is',
 'still',
 'not',
 'free',
 'one',
 'hundred',
 'years',
 'later',
 'the',
 'life',
 'of',
 'the',
 'negro',
 'is',
 '

In [24]:
# Display the word-frequency result: each word with its count, most frequent first.
df

the        54
of         49
to         29
and        27
a          20
           ..
jews        1
joyous      1
judged      1
land        1
lookout     1
Name: count, Length: 323, dtype: int64