In [60]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import re

tweets = [
    "This is introduction to NLP",
    "It is likely to be useful, to people",
    "Machine learning is the new electricity",
    "There would be less hype around AI and more action going forward",
    "Python is the best tool!",
    "R is a good language",
    "I like this book",
    "I want more books like this"
    ]

#f_tweets = []
#for t in tweets:
#    f_tweets.append(t.lower())
#f_tweets

# alternative:
f_tweets = [t.lower() for t in tweets]

tweets_df = pd.DataFrame(f_tweets, columns=['tweets'])
tweets_df

# or lower the Dataframe itself:
#df['tweets'] = df['tweets'].str.lower()
#df['tweets'].apply(lambda x: x.lower())

Unnamed: 0,tweets
0,this is introduction to nlp
1,"it is likely to be useful, to people"
2,machine learning is the new electricity
3,there would be less hype around ai and more ac...
4,python is the best tool!
5,r is a good language
6,i like this book
7,i want more books like this


In [2]:

# --- with REPLACE --------
#f_tweets = [t.lower().replace(',', '').replace('!', '') for t in tweets]


# --- with REGEX ----------
# ^ invert
# \w all characters from a-Z, all numbers, underscore _ (no whitespace!!! -> \s needed)
# \s whitespace

f_tweets = [re.sub(r'[^\w\s]', '', t.lower()) for t in tweets]
f_tweets

tweets_df = pd.DataFrame(f_tweets, columns=['tweets'])
tweets_df

Unnamed: 0,tweets
0,this is introduction to nlp
1,it is likely to be useful to people
2,machine learning is the new electricity
3,there would be less hype around ai and more ac...
4,python is the best tool
5,r is a good language
6,i like this book
7,i want more books like this


## Tokenisieren

In [3]:
# ACHTUNG! Split berücksichtigt keine Satzzeichen

text = "This is the first sentence. This is the second one."  
list_of_words = text.split()  
list_of_words  

['This',
 'is',
 'the',
 'first',
 'sentence.',
 'This',
 'is',
 'the',
 'second',
 'one.']

In [5]:
# NLTK berücksichtigt Satzzeichen

from nltk.tokenize import word_tokenize 
import nltk
nltk.download('punkt_tab')

text="Hello there! Welcome to the programming world."
print(word_tokenize(text)) 

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


['Hello', 'there', '!', 'Welcome', 'to', 'the', 'programming', 'world', '.']


## Tokenisieren mit Spacy

In [24]:
import spacy 
nlp = spacy.load('de_core_news_sm')
#nlp = spacy.load("en_core_web_sm")

doc = nlp("Apple erwägt den Kauf eines österreichischen Startups um 6 Mio. Euro.") 
print(doc) 

Apple erwägt den Kauf eines österreichischen Startups um 6 Mio. Euro.


In [50]:
import spacy
text = "Apple erwägt den Kauf eines österreichischen Startups um 6 Mio. Euro."

nlp = spacy.load('de_core_news_sm')
doc = nlp(text)
doc

Apple erwägt den Kauf eines österreichischen Startups um 6 Mio. Euro.

In [56]:
for token in doc:
	print(f"{token.text:{20}}",type(token))

Apple                <class 'spacy.tokens.token.Token'>
erwägt               <class 'spacy.tokens.token.Token'>
den                  <class 'spacy.tokens.token.Token'>
Kauf                 <class 'spacy.tokens.token.Token'>
eines                <class 'spacy.tokens.token.Token'>
österreichischen     <class 'spacy.tokens.token.Token'>
Startups             <class 'spacy.tokens.token.Token'>
um                   <class 'spacy.tokens.token.Token'>
6                    <class 'spacy.tokens.token.Token'>
Mio.                 <class 'spacy.tokens.token.Token'>
Euro                 <class 'spacy.tokens.token.Token'>
.                    <class 'spacy.tokens.token.Token'>


## POS mit Spacy

In [54]:
import spacy
nlp = spacy.load("de_core_news_sm" )
words = nlp("Apple erwägt den Kauf eines österreichischen Startups um 6 Mio. Euro.")

for token in words:
	print(token.text, token.pos_) #.pos_ = Type (Verb, NOUN, NUM, ...)

Apple PROPN
erwägt VERB
den DET
Kauf NOUN
eines DET
österreichischen ADJ
Startups NOUN
um ADP
6 NUM
Mio. NOUN
Euro NOUN
. PUNCT


In [57]:
import spacy
from collections import Counter

nlp = spacy.load("de_core_news_sm")
text = "Apple erwägt den Kauf eines österreichischen Startups um 6 Mio. Euro."
doc = nlp(text)

# Häufigkeit der Wortklassen zählen
pos_counts = Counter([token.pos_ for token in doc])

print(pos_counts)

Counter({'NOUN': 4, 'DET': 2, 'PROPN': 1, 'VERB': 1, 'ADJ': 1, 'ADP': 1, 'NUM': 1, 'PUNCT': 1})


## Named Entity Recognition

In [75]:
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
text = 'Apple is looking at buying U.K. startup for $1 billion''Apple is looking at buying U.K. startup for $1 billion'
doc = nlp(text)

for ent in doc.ents: 
    print(ent.text, ent.start_char, ent.end_char, ent.label_, 
          spacy.explain(ent.label_))
    
displacy.render(doc, style="ent")

Apple 0 5 ORG Companies, agencies, institutions, etc.
U.K. 27 31 GPE Countries, cities, states
1 45 46 MONEY Monetary values, including unit
U.K. 81 85 GPE Countries, cities, states
$1 billion 98 108 MONEY Monetary values, including unit


In [65]:
help(displacy)

Help on function render in module spacy.displacy:

render(docs: Union[Iterable[Union[spacy.tokens.doc.Doc, spacy.tokens.span.Span, dict]], spacy.tokens.doc.Doc, spacy.tokens.span.Span, dict], style: str = 'dep', page: bool = False, minify: bool = False, jupyter: Optional[bool] = None, options: Dict[str, Any] = {}, manual: bool = False) -> str
    Render displaCy visualisation.
    
    docs (Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]]): Document(s) to visualise.
        a 'dict' is only allowed here when 'manual' is set to True
    style (str): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    jupyter (bool): Override Jupyter auto-detection.
    options (dict): Visualiser-specific options, e.g. colors.
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
    RETURNS (str): Rendered SVG or HTML markup.
    
    DOCS: https://spacy.io/api/top-level#displacy.render
    US

In [76]:
for token in doc:         
    print(f'{token.text:10} {token.ent_iob_} {token.ent_type_}') 

Apple      B ORG
is         O 
looking    O 
at         O 
buying     O 
U.K.       B GPE
startup    O 
for        O 
$          O 
1          B MONEY
billionApple O 
is         O 
looking    O 
at         O 
buying     O 
U.K.       B GPE
startup    O 
for        O 
$          B MONEY
1          I MONEY
billion    I MONEY


In [71]:
doc = nlp("Der Ministerrat der Republik Österreich beschloss am 25.März 2014, eine „Unabhängige\nUntersuchungskommission zur transparenten Aufklärung der Vorkommnisse rund um die Hypo Group Alpe-Adria“ einzusetzen. Die Untersuchungskommission ([REDACTED], [REDACTED], [REDACTED], [REDACTED]) hat, beginnend mit 1.Mai 2014, durch Auswertung von beige-schafften Unterlagen und allgemein zugänglichen Quellen sowie durch Befragung von Auskunftspersonen den maßgeblichen Sachverhalt festgestellt und nach fachlichen Kriterien bewertet.")

new_text = []
for token in doc:
    if re.match(r"PER", token.ent_type_):
        new_text.append("[REDACTED]")
    else:
        new_text.append(token.text)

redacted_text = ""
for i in range(0, len(new_text)-1):
    if (i > 0) & (new_text[i-1] == "\n"):
        redacted_text += "\n"
    if not ((new_text[i] == new_text[i+1]) & (new_text[i] == "[REDACTED]")):
        if not (re.match("PUNCT|SPACE", doc[i].pos_) or (i == 0)):
            redacted_text += " "
    redacted_text += new_text[i]
if i == len(new_text)-2:
    redacted_text += new_text[i+1]

redacted_text

'[REDACTED] [REDACTED] der Republik Österreich beschloss am 25.März 2014, eine„ Unabhängige\n\n Untersuchungskommission zur transparenten Aufklärung der Vorkommnisse rund um die Hypo Group Alpe- Adria“ einzusetzen. Die Untersuchungskommission( [ REDACTED], [ REDACTED], [ REDACTED], [ REDACTED]) hat, beginnend mit 1.Mai 2014, durch[REDACTED][REDACTED][REDACTED][REDACTED] schafften Unterlagen und allgemein zugänglichen [REDACTED] sowie durch[REDACTED][REDACTED] [REDACTED] den maßgeblichen Sachverhalt festgestellt und nach fachlichen Kriterien bewertet.'