In [None]:
#*
# If you are NOT using Google Colab you'll need to run this cell to install spacy and its model
import sys
!{sys.executable} -m pip install spacy
!{sys.executable} -m spacy download en_core_web_sm

# Text Analysis 1
## SC290 - Week 23


In [5]:
#*
import requests
from bs4 import BeautifulSoup

# New library!
import re

In this section we are going to scrape chapter 1 of The Communist Manifesto by Karl Marx and Friedrich Engels, and clean it so that it is ready for text analysis later.

The source: [https://www.marxists.org/archive/marx/works/1848/communist-manifesto/ch01.htm](https://www.marxists.org/archive/marx/works/1848/communist-manifesto/ch01.htm)

In [6]:
#*
url = "https://www.marxists.org/archive/marx/works/1848/communist-manifesto/ch01.htm"


In [7]:
# Pull the HTML from the URL.
# The timeout stops this cell hanging forever if the site does not respond, and
# raise_for_status() turns an HTTP error (e.g. 404 or 500) into an exception here
# instead of letting us silently parse an error page in the cells below.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Parse the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(r.content, 'html.parser')


In [8]:
# First 2000 characters of our downloaded web text
r.text[:2000]

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\r\n        "http://www.w3.org/TR/2000/REC-xhtml1-20000126/DTD/xhtml1-transitional.dtd">\r\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\r\n<head>\r\n      <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />\r\n      <meta name="Author" content="Karl Marx and Frederick Engels" />\r\n      <meta name="Description" content="History of the Bourgeois and Proletarian class" />\r\n      <meta name="Classification" content="History, Politics" />\r\n      <meta name="Keywords" content="Communist Manifesto, Communism, Manifesto" />\r\n      <link rel="stylesheet" type="text/css" href="../../../../../css/border-red.css" /> \r\n<title>Communist Manifesto (Chapter 1)</title>\r\n<meta name="viewport" content="width=device-width"/></head>\r\n<body> \r\n\r\n<p class="title">\r\n<a href="../../../../../index.htm" class="title">MIA</a>: <a href="../../../../index.htm" class="title">Marxists</a>:

In [9]:
# HTML marks paragraphs up with the <p> element, so we ask BeautifulSoup for every <p>paragraph</p>.
# On this specific page a <p> only has a class when it is something other than the body of the text
# (the page title shown above is one example), so class_=None leaves those unusual paragraphs out.
p_all = soup.find_all("p", class_=None)

# Keep only the text held inside each paragraph element
paragraphs = [tag.get_text() for tag in p_all]

# First 10 paragraphs
paragraphs[0:10]

['\r\nA spectre is haunting Europe — the spectre of\r\ncommunism. All the powers of old Europe have entered into a holy alliance\r\nto exorcise this spectre: Pope and Tsar, Metternich and Guizot, French\r\nRadicals and German police-spies.\r\n ',
 '\r\nWhere is the party in opposition that has not been decried as communistic\r\nby its opponents in power? Where is the opposition that has not hurled\r\nback the branding reproach of communism, against the more advanced opposition\r\nparties, as well as against its reactionary adversaries?\r\n ',
 '\r\nTwo things result from this fact:\r\n ',
 '\r\nTo this end, Communists of various nationalities have assembled in London\r\nand sketched the following manifesto, to be published in the English, French,\r\nGerman, Italian, Flemish and Danish languages.\r\n ',
 ' \r\nThe history of all hitherto existing society(2) is the history of class struggles.\r\n ',
 '\r\nFreeman and slave, patrician and plebeian, lord\r\nand serf, guild-master(3) and\r\

In [10]:
#*
cleaned = []
for para in paragraphs:
    # Remove all \r (carriage returns)
    clean1 = re.sub(pattern=r"\r", repl="", string=para)

    # Replace \n with a space (\n indicates a new line and not providing a space joins words together)
    clean2 = re.sub(pattern=r"\n", repl=" ", string=clean1)

    # Remove digits
    # The r in front of the quotes makes this a "raw" string: Python hands the backslash straight
    # to the regex engine instead of trying (and failing) to read \d as a string escape itself.
    clean3 = re.sub(pattern=r"\d", repl="", string=clean2)

    # Remove the empty parentheses left behind once the digits are gone, e.g. "(2)" -> "()" -> ""
    # In regex parentheses mean something special, so to actually tell it to look for
    # parentheses we have to "escape" the character, which we do by putting a \ in front of it.
    clean4 = re.sub(pattern=r"\(\)", repl="", string=clean3)

    # Clean out any white space at the start and end of the paragraph
    clean5 = clean4.strip()

    cleaned.append(clean5)

# When it comes to scraping your own text and cleaning it, your needs might be different.
# Giacomo's slides have additional Regex techniques that can be very powerful.
cleaned


['A spectre is haunting Europe — the spectre of communism. All the powers of old Europe have entered into a holy alliance to exorcise this spectre: Pope and Tsar, Metternich and Guizot, French Radicals and German police-spies.',
 'Where is the party in opposition that has not been decried as communistic by its opponents in power? Where is the opposition that has not hurled back the branding reproach of communism, against the more advanced opposition parties, as well as against its reactionary adversaries?',
 'Two things result from this fact:',
 'To this end, Communists of various nationalities have assembled in London and sketched the following manifesto, to be published in the English, French, German, Italian, Flemish and Danish languages.',
 'The history of all hitherto existing society is the history of class struggles.',
 'Freeman and slave, patrician and plebeian, lord and serf, guild-master and journeyman, in a word, oppressor and oppressed, stood in constant opposition to one a

In [11]:
# Join the cleaned texts together, one paragraph per line.
#
# Wait - why are we adding \n back in!? - because other packages like spacy use the presence of
# new lines as important indicators of text structure.
# The original text had them all over to lay it out on the page. We are introducing them to
# indicate paragraph breaks. Compare the printed result with the original web text.
# If they need to be removed later they can always be stripped out again with re.sub
text = "\n".join(cleaned)
print(text)

A spectre is haunting Europe — the spectre of communism. All the powers of old Europe have entered into a holy alliance to exorcise this spectre: Pope and Tsar, Metternich and Guizot, French Radicals and German police-spies.
Where is the party in opposition that has not been decried as communistic by its opponents in power? Where is the opposition that has not hurled back the branding reproach of communism, against the more advanced opposition parties, as well as against its reactionary adversaries?
Two things result from this fact:
To this end, Communists of various nationalities have assembled in London and sketched the following manifesto, to be published in the English, French, German, Italian, Flemish and Danish languages.
The history of all hitherto existing society is the history of class struggles.
Freeman and slave, patrician and plebeian, lord and serf, guild-master and journeyman, in a word, oppressor and oppressed, stood in constant opposition to one another, carried on an 

In [12]:
#*
import spacy
# Load the en_core_web_sm model (the one downloaded by the install cell at the top of the notebook)
nlp = spacy.load('en_core_web_sm')
# Run the whole joined text through spacy; the cells below all work from this `doc`
doc = nlp(text)

KeyboardInterrupt: 

In [None]:
# Spacy can tell us how many 'tokens' are in the document - i.e. how many words (but also other things)
len(doc)

In [None]:
# How many sentences in the document?
len(list(doc.sents))

In [None]:
#*
# Tokens are units of text in natural language processing. Exactly how a text is 'tokenised' varies depending on the tool
# and many debates are had about the best way to do it.

# The goal is to render a text down into individual units of information that can be processed by different analysis techniques

# This is how spacy breaks up the document
[w.text for w in doc]

In [None]:

#*
# Spacy uses the context of the surrounding words and grammar to work out if the word is a noun, verb, adjective etc.
# They call this the 'part-of-speech' or POS
[(w.text, w.pos_) for w in doc]

In [None]:
#*
# Spacy tokens have helpful attributes...
# Is it alphabetical (i.e not numerical or punctuation)
[(w.text, w.is_alpha) for w in doc]

In [None]:
#*
# Is it punctuation? 
[(w.text, w.is_punct) for w in doc]

In [None]:
#*
# Is it a stop word?
[(w.text, w.is_stop) for w in doc]

### Stop Words?
Stop words are typically defined as the most common words in a language. Because they appear so often, they can make it harder to find patterns in text. For example, the most common words in a piece of text might be 'the', 'a', 'and', etc. That result is correct, but it doesn't tell us much about the text.

In [None]:
#*
# These are the stop words for this model
print(nlp.Defaults.stop_words)


In [None]:
# We can use these token attributes to filter our text based on what type of token it is
[w.text for w in doc if not w.is_stop and not w.is_punct]

### Lemmatization

A word's lemma is the simpler 'root' word that best represents the word's meaning. Replacing words with their lemmas reduces the possible range of words whilst still ensuring the words that are left convey the appropriate meaning.

To make this clearer we can use some examples:

In [None]:
#*
# Here we have essentially the same sentences, just a variation in that one uses a contraction "don't" rather than "do not".
rabbit_1 = nlp("I don't like rabbits in space")
rabbit_2 = nlp("I do not like rabbits in space")
print( [token.lemma_ for token in rabbit_1])
print( [token.lemma_ for token in rabbit_2])


In [None]:
#*
# Even differing text can be brought at least closer in similarity using lemmas, reducing loving to love
rabbit_1 = nlp("I'm loving these rabbits")
rabbit_2 = nlp("I love this rabbit!")

print( [token.lemma_ for token in rabbit_1])
print( [token.lemma_ for token in rabbit_2])

If you are doing any text analysis that counts the frequency of words, relies on word similarity, etc., it is usually a good idea to reduce the range of words being used, so long as the remaining words retain the same underlying semantic meaning.

In [None]:
# Keep the lower-cased lemma of every token that is not a stop word, punctuation or white space
filtered_tokens = [
    token.lemma_.lower()
    for token in doc
    if not (token.is_stop or token.is_punct or token.is_space)
]
filtered_tokens

In [None]:
from collections import Counter
counts = Counter(filtered_tokens)
counts.most_common(10)

In [None]:
# If you want to convert your filtered tokens to text you simply join them together again
# (the .is_space check used when building filtered_tokens has already removed the "space" tokens,
#  which include newline (\n) symbols, so none of them end up in the joined text)

filtered_text = " ".join(filtered_tokens)
filtered_text

# Exercise A
Can you scrape, clean and use spacy to process "What are the Origins of May Day?" by Rosa Luxemburg?

[https://www.marxists.org/archive/luxemburg/1894/02/may-day.htm](https://www.marxists.org/archive/luxemburg/1894/02/may-day.htm)

# Exercise B
If you have your dataset from SC207 you could examine the body (i.e. the actual text) of the articles you used. The Guardian API provides the body as its original HTML so you'll still need to clean it using Beautiful Soup. This was covered in SC207's advanced analysis notebooks. The basics are below...

In [None]:
#*
import pandas as pd

my_articles = pd.read_parquet('filtered_cleaned_articles.parquet')
my_articles['body'].head()

In [None]:
#*
def html_cleaner(text):
    """Strip the HTML markup from `text` and return only the text it contains.

    Parameters
    ----------
    text : str
        A string of HTML, e.g. the body of one article.

    Returns
    -------
    str
        The text found inside the HTML, with the tags removed.
    """
    # Name the parser explicitly. 'html' only describes the type of markup, so BeautifulSoup
    # would pick whichever HTML parser happens to be installed and the result could differ
    # from one machine to another. 'html.parser' is the same parser used for the scrape above.
    soup = BeautifulSoup(text, 'html.parser')
    return soup.text

# Strip the HTML out of every article body and collect the results in a plain Python list
texts = list(my_articles['body'].apply(html_cleaner))

# How many documents in my list of texts?
print(len(texts))

# What does the first one look like now?
print(texts[0])

# A note
Keep in mind that in this session we focussed on retrieving, cleaning and analysing a single piece of text. In your Guardian data you have lots of texts so to practice you can either choose one document from your `texts` list, or `join` them all together to make one massive document (or if you're feeling clever, filter the documents first using what you learned in SC207 - i.e. all articles in the X section, all articles in a certain date range etc.).

In [None]:
#*
single_document = texts[0]

# I'll just take the first 10
joined_documents = "\n".join(texts[:10])

#... then carry on with your cleaning and processing