# Three investigators - part I.I

A project for scraping and analysing data from a fan site on the audio book called '[The three investigators](https://en.wikipedia.org/wiki/Three_Investigators#Germany)'

Part I.I: Topic modelling

Using the content and title for each episode to detect the overall topic.

In [2]:
#python version used for this project
from platform import python_version
print(python_version())

3.7.5


> need to add additional modules to requirement text file !!

In [108]:
# import modules [as specified in requirements.txt]
import pandas as pd
import numpy as np
import spacy
import nltk
import re

# for file 
import os

%matplotlib inline

In [3]:
#change directory to root folder
os.chdir("..")

## Load data

In [189]:
#load scraped datafiles
meta = pd.read_csv(".\\data\\scraped\\meta.csv")
content = pd.read_csv(".\\data\\scraped\\content_all.csv")

# make all column names lower case
df_list = [meta, content]

for df in df_list:
    df.columns = df.columns.str.lower()

## Title

### Standardising

In [190]:
## make titles lower case
title = meta["titel"].str.lower()

# replace values within titles

# function to loop through the column and replace substrings
def replace_values(text, dic):
    for x, y in dic.items():
        text = text.str.replace(x, y, regex=True)
    return text

# list of values to be replaced, including punctuation
replace_dict = {"hörspiel": "", 
                "[!\"#$%&()*+,./:;<=>?@[\]^_`{|}~“”-]": ""}

# apply function
title = replace_all(title, replace_dict)

# strip white space at the end
title = title.str.strip()

title

0             der superpapagei
1               der phantomsee
2             der karpatenhund
3           die schwarze katze
4         der fluch des rubins
                ...           
200             das weiße grab
201    tauchgang ins ungewisse
202         der dunkle wächter
203       das rätselhafte erbe
204         und der mottenmann
Name: titel, Length: 205, dtype: object

### remove stopwords

In [179]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lisa.hornung\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [180]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('german'))
print(sorted(stop_words))

['aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an', 'ander', 'andere', 'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr', 'anders', 'auch', 'auf', 'aus', 'bei', 'bin', 'bis', 'bist', 'da', 'damit', 'dann', 'das', 'dass', 'dasselbe', 'dazu', 'daß', 'dein', 'deine', 'deinem', 'deinen', 'deiner', 'deines', 'dem', 'demselben', 'den', 'denn', 'denselben', 'der', 'derer', 'derselbe', 'derselben', 'des', 'desselben', 'dessen', 'dich', 'die', 'dies', 'diese', 'dieselbe', 'dieselben', 'diesem', 'diesen', 'dieser', 'dieses', 'dir', 'doch', 'dort', 'du', 'durch', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'einig', 'einige', 'einigem', 'einigen', 'einiger', 'einiges', 'einmal', 'er', 'es', 'etwas', 'euch', 'euer', 'eure', 'eurem', 'euren', 'eurer', 'eures', 'für', 'gegen', 'gewesen', 'hab', 'habe', 'haben', 'hat', 'hatte', 'hatten', 'hier', 'hin', 'hinter', 'ich', 'ihm', 'ihn', 'ihnen', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres', 'i

In [188]:
title

['der superpapagei',
 'der phantomsee',
 'der karpatenhund',
 'die schwarze katze',
 'der fluch des rubins',
 'der sprechende totenkopf',
 'der unheimliche drache',
 'der grüne geist',
 'die rätselhaften bilder',
 'die flüsternde mumie',
 'das gespensterschloß',
 'der seltsame wecker',
 'der lachende schatten',
 'das bergmonster',
 'der rasende löwe',
 'der zauberspiegel',
 'die gefährliche erbschaft',
 'die geisterinsel',
 'der teufelsberg',
 'die flammende spur',
 'der tanzende teufel',
 'der verschwundene schatz',
 'das aztekenschwert',
 'die silberne spinne',
 'die singende schlange',
 'die silbermine',
 'der magische kreis',
 'der doppelgänger',
 'das riff der haie',
 'das narbengesicht',
 'der ameisenmensch',
 'die bedrohte ranch',
 'der rote pirat',
 'der höhlenmensch',
 'der superwal',
 'der heimliche hehler',
 'der unsichtbare gegner',
 'die perlenvögel',
 'der automarder',
 'das volk der winde',
 'der weinende sarg',
 'der höllische werwolf',
 'der gestohlene preis',
 'das go

In [187]:
title_no_stop_words = []

for word in title:
    if word not in stop_words:
        title_no_stop_words.append(word)
        
title_no_stop_words

['der superpapagei',
 'der phantomsee',
 'der karpatenhund',
 'die schwarze katze',
 'der fluch des rubins',
 'der sprechende totenkopf',
 'der unheimliche drache',
 'der grüne geist',
 'die rätselhaften bilder',
 'die flüsternde mumie',
 'das gespensterschloß',
 'der seltsame wecker',
 'der lachende schatten',
 'das bergmonster',
 'der rasende löwe',
 'der zauberspiegel',
 'die gefährliche erbschaft',
 'die geisterinsel',
 'der teufelsberg',
 'die flammende spur',
 'der tanzende teufel',
 'der verschwundene schatz',
 'das aztekenschwert',
 'die silberne spinne',
 'die singende schlange',
 'die silbermine',
 'der magische kreis',
 'der doppelgänger',
 'das riff der haie',
 'das narbengesicht',
 'der ameisenmensch',
 'die bedrohte ranch',
 'der rote pirat',
 'der höhlenmensch',
 'der superwal',
 'der heimliche hehler',
 'der unsichtbare gegner',
 'die perlenvögel',
 'der automarder',
 'das volk der winde',
 'der weinende sarg',
 'der höllische werwolf',
 'der gestohlene preis',
 'das go

## Additional code

In [None]:
corpus_numbers = [re.sub(r"ninety-six", "96", word) for word in corpus_words]   # Defines a new variable create by substituting
                                                                                # '96' for 'ninety-six' in corpus_words

print(corpus_numbers[:100])                                            # Prints the first 100 items in the newly created corpus

In [None]:
def multiple_replace(dict, text):
  # Create a regular expression  from the dictionary keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  # For each match, look-up corresponding value in dictionary
  return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text) 

if __name__ == "__main__": 
    dict = {
    "CA" : "California",
    "United Kingdom" : "U.K.",
    "United Kingdom of Great Britain and Northern Ireland" : "U.K.",
    "United Kingdom of Great Britain" : "U.K.",
    "UK" : "U.K.",
    "Privacy Policy" : "noodle soup",
  } 

corpus_replace = multiple_replace(dict, corpus)
print(corpus_replace)