# Imports and global settings

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from spacy.lang.en import English
#from spacytextblob.spacytextblob import SpacyTextBlob
from collections import Counter
import en_core_web_sm
import de_core_news_sm
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_rows', 100)


# Scrape wikipedia website List of place names of German origin in the United States using BeautifulSoup 

In [48]:
url = 'https://en.wikipedia.org/wiki/List_of_place_names_of_German_origin_in_the_United_States'
html = urlopen(url) 
soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table')

place_names = []
states = []
origin_notes = []
place = []
cells_holder = []

for table in tables:
    rows = table.find_all('tr')
    #print(rows)
    
    for row in rows:
        cells = row.find_all('td')
        #cells_holder.append(cells)
       # print(cells)
        
        if len(cells) > 1:
            try:
                place = cells[0]
                place_names.append(place.text.strip())
            except:
                print()
        
            try:
                state = cells[1]
                states.append(state.text.strip())
  
            except:
                print()
            try:
                origin_note = cells[2]
                origin_notes.append(origin_note.text.strip())        
            except:
                print()
                
place_names = place_names[1:]
states = states[1:]
#print(place_names)
#print(len(place_names))
#print(len(states))
#print(states)
#print(len(origin_notes))
#print(origin_notes)


## Re-create the list of German places as it is in Wikipedia
df_german_places = pd.DataFrame()
df_german_places['Place Name'] = place_names
df_german_places['State'] = states
df_german_places['Origin'] = origin_notes

#df_german_places




## Convert extracted soup into something useful for the further analysis

In [90]:
## Load extracted data into pandas df 
## Extract person names and geographic place names from list of place names of German origin in the United States 
## Print persons of interest with count for further analysis

## Initialize empty pd data frames
df_extracted_original_places = pd.DataFrame()
df_extracted_persons = pd.DataFrame()
list_extracted_original_places=[]
list_extracted_persons=[]

## Extract natural persons and geographic entities
# Convert origin notes from list to string  



origin_notes_str = ' '.join([str(elem) for elem in origin_notes])
nlp = spacy.load("en_core_web_sm")
doc_origin_notes = nlp(origin_notes_str)


for ent in doc_origin_notes.ents:
    if ent.label_ == "PERSON":        
        list_extracted_persons.append(ent.text)
    elif ent.label_ == "GPE":
        list_extracted_original_places.append(ent.text)

## Make places unique
list_extracted_original_places=list(set(list_extracted_original_places))        
        

df_extracted_original_places = pd.DataFrame({'Place Names': list_extracted_original_places})
df_extracted_persons = pd.DataFrame({'Persons of Interest': list_extracted_persons})
#df_extracted_original_places
#df_extracted_persons
#df_extracted_persons['Persons of Interest'].value_counts()



# Print results of initial analysis for evaluation

In [10]:
## Print extracted persons of interest nad count their times of appearance in the origin description
df_extracted_persons['Persons of Interest'].value_counts()

Johann de Kalb                 6
Alexander von Humboldt         4
George III                     2
Frederick Muhlenberg           2
Fulda                          2
Martin Luther Named            2
William Hoehne                 1
Brna                           1
Kiel                           1
Gustav Schleicher              1
Francis Xavier Pierz           1
Lenzburg                       1
William Waldorf Astor          1
Albert Etter                   1
Otto Fischer                   1
Dissen                         1
John Kieler                    1
Henry Wickenburg               1
Adolph Hegewisch               1
Prince Carl                    1
Ludwig Börne                   1
John Meiners                   1
Compound Wald                  1
Adelsverein                    1
Otto von Bismarck              1
Jonathan Hager                 1
John A. Roebling               1
P. J.                          1
Kassel                         1
Henry C. Lutkens[11]:77        1
Saxe-Alten

In [11]:
## Print extracted original places
df_extracted_original_places

Unnamed: 0,Original Places
0,California
1,Colorado
2,Karlsbad
3,Ault
4,Baden
5,Otoe
6,Germany
7,Lützen
8,Nebraska
9,Minden


## Export place names to csv (uncomment only if new list needed)

In [None]:
df_extracted_original_places.to_csv(r'c:\temp\extracted_original_places.csv', index = False, header = False)

# Define the functions for further analysis of the websites that are scraped
# based off the results of the initial scrape and investigation

## Functions English

In [15]:
### Another spacy function for stop words removal third attempt
def spacy_stop_word_remover(soup):
    sp = spacy.load('en_core_web_sm')
    all_stopwords = sp.Defaults.stop_words
    text = soup
    words = [word for word in text.split() if word.lower() not in all_stopwords]
    new_text = " ".join(words)
    #print(new_text)
    print("Old length: ", len(text))
    print("New length: ", len(new_text))
    return new_text


## spacy lemmatizer function
def spacy_lemmatize(list_to_lemmatize):
    sp = spacy.load('en_core_web_sm')
    text_to_lemma = sp(list_to_lemmatize)
    text_to_return = ""
    for word in text_to_lemma:
        #print(word.text,  word.lemma_)
        text_to_return=text_to_return + " " + word.lemma_
    return text_to_return

## Simple NLTK sentiment intensity analyser function
def nltk_sentiment_analyser(text):
    input_txt = text
    
    sia = SentimentIntensityAnalyzer()
    polscores=sia.polarity_scores(input_txt)
    return polscores

## Spacy sentiment analyser English

def spacy_sentiment_analyser_en(text_to_analyse):
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe('spacytextblob')
    text=text_to_analyse
    doc = nlp(text)
    print("The polarity of the English text is: " + str(doc._.polarity)) 
    print("The subjectivity of the English text is: " + str(doc._.subjectivity))
    #doc._.assessments

## Functions German

In [16]:
### Stop words removal German
def spacy_stop_word_remover_german(soup):
    sp = spacy.load('de_core_news_sm')
    all_stopwords = sp.Defaults.stop_words
    text = soup
    words = [word for word in text.split() if word.lower() not in all_stopwords]
    new_text = " ".join(words)
    #print(new_text)
    print("Old length: ", len(text))
    print("New length: ", len(new_text))
    return new_text


## Spacy lemmatizer function German
def spacy_lemmatize_german(list_to_lemmatize):
    sp = spacy.load('de_core_news_sm')
    text_to_lemma = sp(list_to_lemmatize)
    text_to_return = ""
    for word in text_to_lemma:
        #print(word.text,  word.lemma_)
        text_to_return=text_to_return + " " + word.lemma_
    return text_to_return

## Spacy sentiment analyser German
def spacy_sentiment_analyser_de(text_to_analyse):
    nlp = spacy.load('de_core_news_sm')
    nlp.add_pipe('spacytextblob')
    text=text_to_analyse
    doc = nlp(text)
    print("The polarity of the German text is: " + str(doc._.polarity))
    print("The subjectivity of the German text is: " + str(doc._.subjectivity))

# GIT Connection and upload

In [136]:
## GIT Connect
# Set remote origin
!git init  
#!git remote add origin https://github.com/HGC243/Assignment_3.git
!git remote set-url origin https://github.com/HGC243/Assignment_3.git

Reinitialized existing Git repository in C:/Users/hgc24/OneDrive/Master_Data_Science/Data Science Master Class 1/ass3/.git/


In [137]:
## Check GIT Status
print('Status:')
!git status

# Add files to GIT
print('Files to add (This will be empty if there are no new files):')
!git add .

Status:
On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   .ipynb_checkpoints/Assignment_3_Code_Holger_Colin-checkpoint.ipynb
	modified:   Assignment_3_Code_Holger_Colin.ipynb

no changes added to commit (use "git add" and/or "git commit -a")
Files to add (This will be empty if there are no new files):


The file will have its original line endings in your working directory
The file will have its original line endings in your working directory


In [138]:
## Git Commit
#!git commit -m "End of day dd/mm/2021 commit"
!git commit -m "End of day 15/04/2021 commit"

[master 7e8f66a] End of day 15/04/2021 commit
 2 files changed, 2714 insertions(+), 1416 deletions(-)
 rewrite .ipynb_checkpoints/Assignment_3_Code_Holger_Colin-checkpoint.ipynb (72%)
 rewrite Assignment_3_Code_Holger_Colin.ipynb (72%)


In [139]:
## Git push all
!git push --all

To https://github.com/HGC243/Assignment_3.git
   5abf346..7e8f66a  master -> master


# Scrape further websites based off the results of the initial investigations

## Scrape, cleanse and analyse Martin Luther Wikipedia site

In [12]:
## Scrape website Martin Luther (English)
url_ml = 'https://en.wikipedia.org/wiki/Martin_Luther'
html_ml = urlopen(url_ml) 
soup_ml = BeautifulSoup(html_ml, 'html.parser').get_text()

In [13]:
## Scrape website Martin Luther (German)
url_ml_de = 'https://de.wikipedia.org/wiki/Martin_Luther'
html_ml_de = urlopen(url_ml_de) 
soup_ml_de = BeautifulSoup(html_ml_de, 'html.parser').get_text()

In [17]:
## Steps to cleanse scraped Wikipedia site Martin Luther (English)
## 1. Call stop word remover
no_stop_words_ml=spacy_stop_word_remover(soup_ml)

## 2. Call lemmatizer
ml_lemmatized = spacy_lemmatize(no_stop_words_ml)

## 4. Run sentiment analysis Spacy
spacy_sentiment_analyser_en(ml_lemmatized)  

## 4. Run sentiment analysis NLTK (English only)
ml_sentiment=nltk_sentiment_analyser(ml_lemmatized)
print("The polarity scores are:\n")
print(ml_sentiment)

Old length:  144714
New length:  115136
The polarity of the English text is: 0.05408380164687502
The subjectivity of the English text is: 0.35739486049119074
The polarity scores are:

{'neg': 0.085, 'neu': 0.825, 'pos': 0.091, 'compound': 0.9753}


In [18]:
## Steps to cleanse scraped Wikipedia site Martin Luther (German)
no_stop_words_ml_de =spacy_stop_word_remover_german(soup_ml_de)
ml_lemmatized_de = spacy_lemmatize_german(no_stop_words_ml_de)
spacy_sentiment_analyser_de(ml_lemmatized_de)

Old length:  174091
New length:  133974
The polarity of the German text is: 0.02128378378378378
The subjectivity of the German text is: 0.3673423423423423


## Scrape, cleanse and analyse Otto von Bismarck Wikipedia site

In [19]:
## Scrape website Otto von Bismarck (English)
url_ovb = 'https://en.wikipedia.org/wiki/Otto_von_Bismarck'
html_ovb = urlopen(url_ovb) 
soup_ovb = BeautifulSoup(html_ovb, 'html.parser').get_text()

In [20]:
## Scrape website Otto von Bismarck (German)
url_ovb_de = 'https://de.wikipedia.org/wiki/Otto_von_Bismarck'
html_ovb_de = urlopen(url_ovb_de) 
soup_ovb_de = BeautifulSoup(html_ovb_de, 'html.parser').get_text()

In [21]:
## Steps to cleanse scraped Wikipedia site Otto von Bismarck (English)
no_stop_words_ovb=spacy_stop_word_remover(soup_ovb)
ovb_lemmatized = spacy_lemmatize(no_stop_words_ovb)
spacy_sentiment_analyser_en(ovb_lemmatized)  
ovb_sentiment=nltk_sentiment_analyser(ovb_lemmatized)
print("The polarity scores are:\n")
print(ovb_sentiment)

Old length:  142473
New length:  111623
The polarity of the English text is: 0.07353583132348823
The subjectivity of the English text is: 0.3385020426652225
The polarity scores are:

{'neg': 0.108, 'neu': 0.766, 'pos': 0.126, 'compound': 0.9997}


In [22]:
## Steps to cleanse scraped Wikipedia site Otto von Bismarck (German)
no_stop_words_ovb_de=spacy_stop_word_remover(soup_ovb_de)
ovb_de_lemmatized = spacy_lemmatize_german(no_stop_words_ovb_de)
spacy_sentiment_analyser_de(ovb_de_lemmatized)  
ovb_de_sentiment=nltk_sentiment_analyser(ovb_de_lemmatized)
print("The polarity scores are:\n")
print(ovb_de_sentiment)

Old length:  161404
New length:  158798
The polarity of the German text is: 0.06275252525252527
The subjectivity of the German text is: 0.40881895881895874
The polarity scores are:

{'neg': 0.009, 'neu': 0.986, 'pos': 0.005, 'compound': -0.9974}


## Scrape, cleanse and analyse Alexander von Humboldt Wikipedia site

In [23]:
## Scrape website Alexander von Humboldt (English)
url_avh = 'https://en.wikipedia.org/wiki/Alexander_von_Humboldt'
html_avh = urlopen(url_avh) 
soup_avh = BeautifulSoup(html_avh, 'html.parser').get_text()

In [24]:
## Scrape website Alexander von Humboldt (German)
url_avh_de = 'https://de.wikipedia.org/wiki/Alexander_von_Humboldt'
html_avh_de = urlopen(url_avh_de) 
soup_avh_de = BeautifulSoup(html_avh_de, 'html.parser').get_text()

In [26]:
## Steps to cleanse scraped Wikipedia site Otto von Bismarck (English)
no_stop_words_avh=spacy_stop_word_remover(soup_avh)
avh_lemmatized = spacy_lemmatize(no_stop_words_avh)
spacy_sentiment_analyser_en(avh_lemmatized)  
avh_sentiment=nltk_sentiment_analyser(avh_lemmatized)
print("The polarity scores are:\n")
print(avh_sentiment)

Old length:  150924
New length:  121614
The polarity of the English text is: 0.08755867451903791
The subjectivity of the English text is: 0.3547645066594741
The polarity scores are:

{'neg': 0.033, 'neu': 0.875, 'pos': 0.092, 'compound': 1.0}


In [27]:
## Steps to cleanse scraped Wikipedia site Otto von Bismarck (German)
no_stop_words_avh_de=spacy_stop_word_remover(soup_avh_de)
avh_de_lemmatized = spacy_lemmatize_german(no_stop_words_avh_de)
spacy_sentiment_analyser_de(avh_de_lemmatized)  
avh_de_sentiment=nltk_sentiment_analyser(avh_de_lemmatized)
print("The polarity scores are:\n")
print(avh_de_sentiment)

Old length:  193665
New length:  190585
The polarity of the German text is: 0.04963556014692378
The subjectivity of the German text is: 0.25022956841138644
The polarity scores are:

{'neg': 0.012, 'neu': 0.98, 'pos': 0.008, 'compound': -0.9978}


## Scrape, cleanse List of German place names Wikipedia site

In [72]:
## Scrape website list of places in Germany (German)
url_lpg_de = 'https://de.wikipedia.org/wiki/Liste_der_St%C3%A4dte_in_Deutschland'
html_lpg_de = urlopen(url_lpg_de) 
soup_lpg_de = BeautifulSoup(html_lpg_de, 'html.parser')
tables = soup_lpg_de.find_all('table')

In [87]:
## Convert content of website into cleansed list of place names
place_name_holder=[]
for table in tables:
    rows = table.find_all('tr')
    #print(rows)
        

    for row in rows:
        place_lines = row.find_all('dd')
        #place_name_holder.append(place_lines)
        #place_name_holder = place_name_holder + " " + place_lines
        #print(place_lines)
        
        for names in place_lines:
            place_names = names.find_all('a')
            
            
            to_string=str(place_names)
            extracted_place=re.findall(r'>([^"]*)<', to_string)
            #print(place_names)
            #print(extracted_place)
            place_name_holder.append(extracted_place)
            
df_allplaces_Germany = pd.DataFrame(place_name_holder)
df_allplaces_Germany=df_allplaces_Germany.rename(columns={0: 'Place Names'})
#df_allplaces_Germany

In [135]:
## Compare both German place name lists and output matching records only 
verfied_place_names = pd.merge(df_allplaces_Germany, df_extracted_original_places, how='inner', on=['Place Names'])
verfied_place_names

Unnamed: 0,Place Names
0,Augsburg
1,Berlin
2,Flensburg
3,Hamburg
4,Jena
5,Karlsruhe
6,Leipzig
7,Lützen
8,Melle
9,Minden


# Unused draft functions

In [None]:

## I have left these here just to show how much time I have spent working through this whilst trying things out
## This doesn't look like much but all of these failed attempts to get functions to work cost lots of time
## This is only the second subject in Python and the first one is a while back so the learning curve is steep

#def spacy_stop_word_remover(soup):
#    sp = spacy.load('en_core_web_sm')
#    soup_for_processing = soup
#    all_stopwords = sp.Defaults.stop_words
#    text_tokens = word_tokenize(soup_for_processing)
#    tokens_without_sw= [word for word in text_tokens if not word in all_stopwords]
#    return tokens_without_sw


### NOT WORKING WELL

#
#   # Create list of word tokens
#    token_list = []
#    for token in doc:        
#        token_list.append(token.text)
#
#    filtered_text =[] 
#
#    for word in token_list:
#        lexeme = nlp.vocab[word]
#        if lexeme.is_stop == False:
#            #print(word)
#            filtered_text.append(word) 
#
#    return filtered_text



## Simple lemmatizer Function 
def simple_lemmatizer(list_of_words):
    to_lemmatize=list_of_words
    lmtzr = WordNetLemmatizer()
    lemmatized = [[lmtzr.lemmatize(word) for word in word_tokenize(s)]for s in to_lemmatize]
    return lemmatized
#print(lemmatized)

# getting length of list
#length = len(lemmatized)
#data = []  

#for i in range(length):
#    string = ' '.join(lemmatized[i])
#    #print(string)
#    data.append(string)
