In [1]:
## Imports and global settings
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from spacy.lang.en import English
from spacytextblob.spacytextblob import SpacyTextBlob
from collections import Counter
import en_core_web_sm
import de_core_news_sm
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

# Notebook-wide pandas display setting: show up to 100 rows before output is truncated.
pd.set_option('display.max_rows', 100)


In [20]:
## Scrape website List of place names of German origin in the United States
url = 'https://en.wikipedia.org/wiki/List_of_place_names_of_German_origin_in_the_United_States'
# The context manager closes the HTTP response once the page has been parsed.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table')

# Collect one (place, state, origin) record per table row. Keeping the three
# values together means the columns can never drift out of alignment, which
# separate per-column lists with swallowed exceptions could not guarantee.
place_records = []
for table in tables:
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Rows without <td> cells (header rows) and rows that lack an origin
        # cell cannot form a complete record, so they are skipped explicitly.
        if len(cells) < 3:
            continue
        place_records.append(tuple(cell.text.strip() for cell in cells[:3]))

# Per-column lists are kept because the extraction step reads `origin_notes`.
place_names = [record[0] for record in place_records]
states = [record[1] for record in place_records]
origin_notes = [record[2] for record in place_records]

## Re-create the list of German places as it is in Wikipedia
df_german_places = pd.DataFrame(place_records, columns=['Place Name', 'State', 'Origin'])

#df_german_places




In [28]:
# NOTE(review): `doc_origin_notes` is first assigned in the entity-extraction cell
# further down, so on a top-to-bottom run this line raises NameError. The saved
# output below looks like lists of lemmatised tokens rather than a spaCy Doc, so
# the name presumably held a different object when this cell was run — confirm.
print(doc_origin_notes)

['Named', 'after', 'Altdorf', ',', 'Switzerland', '.', '[', '1', ']'] ['Named', 'after', 'Altdorf', ',', 'Switzerland', '.', '[', '2', ']'] ['Named', 'after', 'Saxe-Altenburg', '.', '[', '3', ']'] ['a', 'blend', 'of', '``', 'Ana', "''", ',', 'after', 'the', 'nearby', 'Santa', 'Ana', 'River', ',', 'and', 'heim', ',', 'a', 'common', 'Germanic', 'place', 'name', 'compound', 'originally', 'meaning', '``', 'home', "''", '.', '[', '4', ']'] ['Named', 'after', 'the', 'Principality', 'of', 'Anhalt', '.', '[', '5', ']'] ['Named', 'after', 'Princess', 'Augusta', 'of', 'Saxe-Gotha', '.', '[', '6', ']'] ['Named', 'after', 'Augsburg', ',', 'Germany', '.'] ['Named', 'after', 'formerly', 'German-settled', 'Slavkov', 'u', 'Brna', '.'] ['Named', 'after', 'Baden', '.'] ['Named', 'after', 'the', 'Baden', 'region', '.'] ['Named', 'after', 'the', 'Baden', 'region', '.'] ['Named', 'after', 'William', 'Seaborn', 'Bamberg', ',', 'whose', 'grandfather', 'wa', 'an', 'immigrant', 'from', 'Germany', '.', '[', '7'

In [35]:
## Load extracted data into pandas df
## Extract person names and geographic place names from list of place names of German origin in the United States
## Persons of interest can be counted with value_counts() for further analysis

## Extract natural persons and geographic entities
# All origin notes are joined into one string and analysed as a single document.
origin_notes_str = ' '.join(str(note) for note in origin_notes)
nlp = spacy.load("en_core_web_sm")
doc_origin_notes = nlp(origin_notes_str)

# Persons keep their duplicates so that mentions can be counted later.
list_extracted_persons = [
    ent.text for ent in doc_origin_notes.ents if ent.label_ == "PERSON"
]

## Make places unique
# Sorted so the rows (and any exported CSV) come out in the same order on every
# run; plain set iteration order for strings differs between kernel sessions.
list_extracted_original_places = sorted(
    {ent.text for ent in doc_origin_notes.ents if ent.label_ == "GPE"}
)

df_extracted_original_places = pd.DataFrame({'Original Places': list_extracted_original_places})
df_extracted_persons = pd.DataFrame({'Persons of Interest': list_extracted_persons})
#df_extracted_persons['Persons of Interest'].value_counts()

## Export place names to csv (uncomment only if new list needed) but keep!
#df_extracted_original_places.to_csv(r'c:\temp\extracted_original_places.csv', index = False, header = False)

In [None]:
## Functions English

 ##Load available trained pipelines for English


### Another function for stop words removal third attempt
def spacy_stop_word_remover(soup):
    """Remove English stop words from a text and report the size change.

    The text is split on whitespace and a word is dropped when its lower-cased
    form is in spaCy's English stop-word set. Punctuation attached to a word
    stays part of that word, so a token such as "the," is not removed.

    Args:
        soup: The text to filter, as a single string.

    Returns:
        The remaining words joined by single spaces.

    Side effects:
        Prints the character length of the text before and after filtering.
    """
    # The stop-word set is a class-level default of the language, so it can be
    # read from the language class without loading a trained pipeline per call.
    all_stopwords = spacy.util.get_lang_class('en').Defaults.stop_words
    kept_words = [word for word in soup.split() if word.lower() not in all_stopwords]
    new_text = " ".join(kept_words)
    print("Old length: ", len(soup))
    print("New length: ", len(new_text))
    return new_text


## spacy lemmatizer function
def spacy_lemmatize(list_to_lemmatize):
    """Lemmatise a text with the English spaCy pipeline.

    Args:
        list_to_lemmatize: The text to process; it is passed to the pipeline
            as-is.

    Returns:
        The lemma of every token, each preceded by a single space (so a
        non-empty result starts with a space).
    """
    pipeline = spacy.load('en_core_web_sm')
    return "".join(" " + token.lemma_ for token in pipeline(list_to_lemmatize))

## Simple NLTK sentiment intensity analyser function
def nltk_sentiment_analyser(text):
    """Return the NLTK SentimentIntensityAnalyzer polarity scores for a text.

    Args:
        text: The text to score.

    Returns:
        Whatever ``SentimentIntensityAnalyzer.polarity_scores`` returns for
        the text.
    """
    return SentimentIntensityAnalyzer().polarity_scores(text)

## Spacy sentiment analyser English

def spacy_sentiment_analyser_en(text_to_analyse):
    """Print the spacytextblob polarity and subjectivity of an English text.

    Loads the ``en_core_web_sm`` pipeline, appends the ``spacytextblob``
    component, runs the text through it and prints both document-level
    scores. Nothing is returned.

    Args:
        text_to_analyse: The text to score.
    """
    pipeline = spacy.load('en_core_web_sm')
    pipeline.add_pipe('spacytextblob')
    scores = pipeline(text_to_analyse)._
    print("The polarity of the English text is: " + str(scores.polarity))
    print("The subjectivity of the English text is: " + str(scores.subjectivity))

In [None]:
## Functions German

### Stop words removal German
def spacy_stop_word_remover_german(soup):
    """Remove German stop words from a text and report the size change.

    The text is split on whitespace and a word is dropped when its lower-cased
    form is in spaCy's German stop-word set. Punctuation attached to a word
    stays part of that word, so a token such as "und," is not removed.

    Args:
        soup: The text to filter, as a single string.

    Returns:
        The remaining words joined by single spaces.

    Side effects:
        Prints the character length of the text before and after filtering.
    """
    # The stop-word set is a class-level default of the language, so it can be
    # read from the language class without loading a trained pipeline per call.
    all_stopwords = spacy.util.get_lang_class('de').Defaults.stop_words
    kept_words = [word for word in soup.split() if word.lower() not in all_stopwords]
    new_text = " ".join(kept_words)
    print("Old length: ", len(soup))
    print("New length: ", len(new_text))
    return new_text


## Spacy lemmatizer function German
def spacy_lemmatize_german(list_to_lemmatize):
    """Lemmatise a text with the German spaCy pipeline.

    Args:
        list_to_lemmatize: The text to process; it is passed to the pipeline
            as-is.

    Returns:
        The lemma of every token, each preceded by a single space (so a
        non-empty result starts with a space).
    """
    german_pipeline = spacy.load('de_core_news_sm')
    lemmas = [token.lemma_ for token in german_pipeline(list_to_lemmatize)]
    return "".join(" " + lemma for lemma in lemmas)

## Spacy sentiment analyser German
def spacy_sentiment_analyser_de(text_to_analyse):
    """Print the spacytextblob polarity and subjectivity of a German text.

    Loads the ``de_core_news_sm`` pipeline, appends the ``spacytextblob``
    component, runs the text through it and prints both document-level
    scores. Nothing is returned.

    NOTE(review): the component is added with its default configuration,
    which presumably scores against an English lexicon — confirm that the
    numbers are meaningful for German input.

    Args:
        text_to_analyse: The text to score.
    """
    german_pipeline = spacy.load('de_core_news_sm')
    german_pipeline.add_pipe('spacytextblob')
    scores = german_pipeline(text_to_analyse)._
    print("The polarity of the German text is: " + str(scores.polarity))
    print("The subjectivity of the German text is: " + str(scores.subjectivity))

In [None]:
## GIT Connect
# Set remote origin
# `git init` creates (or re-initialises) a repository in the notebook's working directory.
!git init  
#!git remote add origin https://github.com/HGC243/Assignment_3.git
# NOTE(review): `set-url` only edits an existing remote; on a brand-new repository
# `origin` presumably has to be added first with the commented-out line above — confirm.
!git remote set-url origin https://github.com/HGC243/Assignment_3.git

In [None]:
## Check GIT Status
print('Status:')
!git status

# Add files to GIT
# `git add .` stages everything under the current directory.
# NOTE(review): it normally prints nothing, so the heading printed here may have
# no listing under it even when files were staged — confirm.
print('Files to add (This will be empty if there are no new files):')
!git add .

In [None]:
## Git Commit
# NOTE(review): the commit message still contains the literal placeholder
# "dd/mm/2021" — replace it with the actual date before running.
#!git commit -m "End of day dd/mm/2021 commit"
!git commit -m "End of day dd/mm/2021 commit"

In [None]:
## Git push all
# `--all` pushes every local branch to the configured remote.
!git push --all

In [None]:
## Scrape website Martin Luther (English)
url_ml = 'https://en.wikipedia.org/wiki/Martin_Luther'
# The context manager closes the HTTP response once the page text is extracted.
with urlopen(url_ml) as html_ml:
    soup_ml = BeautifulSoup(html_ml, 'html.parser').get_text()

In [None]:
## Scrape website Martin Luther (German)
url_ml_de = 'https://de.wikipedia.org/wiki/Martin_Luther'
# The context manager closes the HTTP response once the page text is extracted.
with urlopen(url_ml_de) as html_ml_de:
    soup_ml_de = BeautifulSoup(html_ml_de, 'html.parser').get_text()


In [None]:
## Steps to cleanse scraped Wikipedia site Martin Luther (English)
## 1. Call stop word remover
no_stop_words_ml=spacy_stop_word_remover(soup_ml)

## 2. Call lemmatizer
ml_lemmatized = spacy_lemmatize(no_stop_words_ml)

## 3. Run sentiment analysis Spacy (prints its scores, returns nothing)
spacy_sentiment_analyser_en(ml_lemmatized)  

## 4. Run sentiment analysis NLTK (English only)
ml_sentiment=nltk_sentiment_analyser(ml_lemmatized)
print("The polarity scores are:\n")
print(ml_sentiment)

In [None]:
## Steps to cleanse scraped Wikipedia site Martin Luther (German)
## 1. Remove German stop words
no_stop_words_ml_de =spacy_stop_word_remover_german(soup_ml_de)
## 2. Lemmatize with the German pipeline
ml_lemmatized_de = spacy_lemmatize_german(no_stop_words_ml_de)
## 3. Run sentiment analysis Spacy (prints its scores, returns nothing)
spacy_sentiment_analyser_de(ml_lemmatized_de)

In [None]:
## Scrape website Otto von Bismarck (English)
url_ovb = 'https://en.wikipedia.org/wiki/Otto_von_Bismarck'
# The context manager closes the HTTP response once the page text is extracted.
with urlopen(url_ovb) as html_ovb:
    soup_ovb = BeautifulSoup(html_ovb, 'html.parser').get_text()

In [None]:
## Scrape website Otto von Bismarck (German)
url_ovb_de = 'https://de.wikipedia.org/wiki/Otto_von_Bismarck'
# The context manager closes the HTTP response once the page text is extracted.
with urlopen(url_ovb_de) as html_ovb_de:
    soup_ovb_de = BeautifulSoup(html_ovb_de, 'html.parser').get_text()

In [None]:
## Steps to cleanse scraped Wikipedia site Otto von Bismarck (English)
## 1. Remove English stop words
no_stop_words_ovb=spacy_stop_word_remover(soup_ovb)
## 2. Lemmatize with the English pipeline
ovb_lemmatized = spacy_lemmatize(no_stop_words_ovb)
## 3. Run sentiment analysis Spacy (prints its scores, returns nothing)
spacy_sentiment_analyser_en(ovb_lemmatized)  
## 4. Run sentiment analysis NLTK
ovb_sentiment=nltk_sentiment_analyser(ovb_lemmatized)
print("The polarity scores are:\n")
print(ovb_sentiment)

In [None]:
## Steps to cleanse scraped Wikipedia site Otto von Bismarck (German)
## 1. Remove stop words - German text needs the German stop-word list
no_stop_words_ovb_de=spacy_stop_word_remover_german(soup_ovb_de)
## 2. Lemmatize with the German pipeline
ovb_de_lemmatized = spacy_lemmatize_german(no_stop_words_ovb_de)
## 3. Run sentiment analysis Spacy (prints its scores, returns nothing)
spacy_sentiment_analyser_de(ovb_de_lemmatized)
## 4. Run sentiment analysis NLTK
# NOTE(review): the Martin Luther cell labels the NLTK analyser "English only",
# so these scores are presumably not meaningful for German text - confirm.
ovb_de_sentiment=nltk_sentiment_analyser(ovb_de_lemmatized)
print("The polarity scores are:\n")
print(ovb_de_sentiment)

In [40]:
## Scrape website Alexander von Humboldt (English)
url_avh = 'https://en.wikipedia.org/wiki/Alexander_von_Humboldt'
# The context manager closes the HTTP response once the page text is extracted.
with urlopen(url_avh) as html_avh:
    soup_avh = BeautifulSoup(html_avh, 'html.parser').get_text()

In [None]:
## Scrape website Alexander von Humboldt (German)
url_avh_de = 'https://de.wikipedia.org/wiki/Alexander_von_Humboldt'
# The context manager closes the HTTP response once the page text is extracted.
with urlopen(url_avh_de) as html_avh_de:
    soup_avh_de = BeautifulSoup(html_avh_de, 'html.parser').get_text()

In [None]:
## Steps to cleanse scraped Wikipedia site Alexander von Humboldt (English)
## 1. Remove English stop words
no_stop_words_avh=spacy_stop_word_remover(soup_avh)
## 2. Lemmatize with the English pipeline
avh_lemmatized = spacy_lemmatize(no_stop_words_avh)
## 3. Run sentiment analysis Spacy (prints its scores, returns nothing)
spacy_sentiment_analyser_en(avh_lemmatized)  
## 4. Run sentiment analysis NLTK
avh_sentiment=nltk_sentiment_analyser(avh_lemmatized)
print("The polarity scores are:\n")
print(avh_sentiment)

In [None]:
## Steps to cleanse scraped Wikipedia site Alexander von Humboldt (German)
## 1. Remove stop words - German text needs the German stop-word list
no_stop_words_avh_de=spacy_stop_word_remover_german(soup_avh_de)
## 2. Lemmatize with the German pipeline
avh_de_lemmatized = spacy_lemmatize_german(no_stop_words_avh_de)
## 3. Run sentiment analysis Spacy (prints its scores, returns nothing)
spacy_sentiment_analyser_de(avh_de_lemmatized)
## 4. Run sentiment analysis NLTK
# NOTE(review): the Martin Luther cell labels the NLTK analyser "English only",
# so these scores are presumably not meaningful for German text - confirm.
avh_de_sentiment=nltk_sentiment_analyser(avh_de_lemmatized)
print("The polarity scores are:\n")
print(avh_de_sentiment)

In [6]:
## Unused draft functions
## I have left these here just to show how much time I have spent working through this whilst trying things out

#def spacy_stop_word_remover(soup):
#    sp = spacy.load('en_core_web_sm')
#    soup_for_processing = soup
#    all_stopwords = sp.Defaults.stop_words
#    text_tokens = word_tokenize(soup_for_processing)
#    tokens_without_sw= [word for word in text_tokens if not word in all_stopwords]
#    return tokens_without_sw


### NOT WORKING WELL

#
#   # Create list of word tokens
#    token_list = []
#    for token in doc:        
#        token_list.append(token.text)
#
#    filtered_text =[] 
#
#    for word in token_list:
#        lexeme = nlp.vocab[word]
#        if lexeme.is_stop == False:
#            #print(word)
#            filtered_text.append(word) 
#
#    return filtered_text

In [3]:
## Simple lemmatizer Function (only used for list of places)

def simple_lemmatizer(list_of_words):
    """Tokenise and lemmatise every string in a collection with NLTK.

    Args:
        list_of_words: Iterable of strings; each one is tokenised with
            ``word_tokenize``.

    Returns:
        A list with one entry per input string, each entry being the list of
        that string's tokens after ``WordNetLemmatizer.lemmatize``.
    """
    wordnet = WordNetLemmatizer()
    lemmatized_sentences = []
    for sentence in list_of_words:
        tokens = word_tokenize(sentence)
        lemmatized_sentences.append(list(map(wordnet.lemmatize, tokens)))
    return lemmatized_sentences
#print(lemmatized)

# getting length of list
#length = len(lemmatized)
#data = []  

#for i in range(length):
#    string = ' '.join(lemmatized[i])
#    #print(string)
#    data.append(string)


In [44]:
## Started test to strip out persons and GPE from wiki text blob

nlp = spacy.load("en_core_web_sm")
doc = nlp(soup_avh)
# Keep the text of every entity that is neither a person nor a geopolitical
# entity; each kept entity is preceded by a single space.
list_extracted_text = "".join(
    " " + ent.text
    for ent in doc.ents
    if ent.label_ not in ("PERSON", "GPE")
)

print(list_extracted_text)

 Humboldt - Wikipedia Wikipedia 1769–1859 1843)Born14 September 1769Berlin German ConfederationResting School of Mines 1792)University no degree)University of Göttingen 1845–1862 Humboldt Current Humboldtian Thoreau Emerson 14 September 1769 May 1859 German Prussian 1767–1835).[6][7][8 Humboldt Humboldt Between 1799 and 1804 Humboldt Americas first 21 years Humboldt first the Atlantic Ocean South America Africa Humboldt Greek one first 1800 1831 2 Travels Europe Spanish American 1799–1804 3.1 3.2 Spanish 1799 3.3 1799–1800 3.4 1800 1804 Andes 1801–1803 3.6 1803–1804 1804 3.8 Travel 3.9 Latin American 1829 6.1 7 7.1 8 Humboldt 8.1 Humboldt Prussian 8.3 9.1 Honours
 Humboldt Humboldt Humboldt 9.6 9.6.1 Alexander von Humboldt Foundation 9.9 Dedications 9.10 10 10.1 10.2 11 14 14.1 15 15.1 15.2 15.3 Humboldt 14 September 1769.[15 Humboldt Humboldt Pomeranian the Prussian Army age 42 the Seven Years' War first Prussian 1766 Alexander Georg three two first 1779 Humboldt Kantian one Prussian 

In [None]:

def text_cleanser(text):
    """Join the elements of ``text`` with spaces and parse the result with spaCy.

    Args:
        text: Iterable whose elements are converted with ``str`` before being
            joined. NOTE(review): a nested list is joined as its ``str``
            representation, brackets and quotes included — confirm that the
            caller passes a flat sequence.

    Returns:
        The object produced by running the ``en_core_web_sm`` pipeline on the
        joined string.
    """
    pipeline = spacy.load("en_core_web_sm")
    joined_text = ' '.join(map(str, text))
    return pipeline(joined_text)
# The helper defined in this cell is named `text_cleanser`.
# NOTE(review): `origin_notes_lemmatized` is not assigned anywhere in the visible
# cells - presumably the output of simple_lemmatizer(origin_notes); confirm.
text_cleaned=text_cleanser(origin_notes_lemmatized)