In [None]:
import requests
import json
import pandas as pd
import re
import time
import os
import random
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import nltk
from nltk.tokenize import word_tokenize
import csv

In [None]:
import os
import re
import csv
import time
import random
import requests
import nltk
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Set up the output directory, a retrying HTTP session, and the NLTK tokenizer data:
os.makedirs('data', exist_ok=True)

session = requests.Session()
# Retry transient server errors (5xx) up to three times, backing off between attempts.
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
for scheme in ('http://', 'https://'):
    session.mount(scheme, HTTPAdapter(max_retries=retries))

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it from
# the 'punkt_tab' resource while older ones use 'punkt', so request both.
# NOTE(review): nltk.download() is expected to report (not raise on) a resource
# name the installed index does not know -- confirm for the pinned NLTK version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Clean title by removing author bylines:

# One name-like token: starts with a capital and may contain letters, dots,
# apostrophes or hyphens ("C.", "Eddy", "Jr.", "O'Brien").
_NAME_TOKEN = r"[A-Z][\w.'’-]*"

# A byline that runs to the end of the title: "By <Name> [<Name>|and|&|with ...]".
# "By" must either start a word or follow a lowercase letter directly; the latter
# covers text whose tags were concatenated without a separator ("AshesBy C. M. Eddy").
# Because every token after "by" has to look like a name, an ordinary "by" in the
# middle of a title ("Death by drowning") or inside a word ("Baby") is left alone.
_AUTHOR_BYLINE_RE = re.compile(
    r"(?:\b[Bb][Yy]|(?<=[a-z])By)\s+"
    + _NAME_TOKEN
    + r"(?:,?\s+(?:" + _NAME_TOKEN + r"|and|&|with))*,?\s*$"
)

# Any-case "by/with H. P. Lovecraft" credit, with or without dots and spaces.
_LOVECRAFT_CREDIT_RE = re.compile(
    r'(by\s*H\.?\s*P\.?\s*Lovecraft|with\s*H\.?\s*P\.?\s*Lovecraft)',
    re.IGNORECASE,
)


def clean_title(title):
    """Strip author credits from a scraped story title.

    Args:
        title: Title text, possibly followed by a byline such as
            "By C. M. Eddy, Jr." or "with H. P. Lovecraft".

    Returns:
        The title without the credit, with whitespace runs collapsed to single
        spaces and no leading or trailing whitespace.

    The byline match is a heuristic: a title that itself ends in "By" followed
    only by capitalised words (e.g. "London By Night") is shortened as well.
    """
    # Whole trailing byline first, so multi-word names are removed completely
    # instead of only the first token after "by".
    title = _AUTHOR_BYLINE_RE.sub('', title)

    # Then any remaining Lovecraft credit (lower-case or without a space after "by").
    title = _LOVECRAFT_CREDIT_RE.sub('', title)

    # Collapse the whitespace left behind by the removals.
    return ' '.join(title.split())

# Scraping Lovecraft Fiction:
def _visible_text(tag):
    """Return ``tag``'s text with every run of whitespace collapsed to one space.

    ``get_text(strip=True)`` strips each text node and joins the pieces with an
    empty string, which glues words together wherever the markup changes
    (``the <i>Necronomicon</i> was`` becomes ``theNecronomiconwas``). Taking the
    unstripped text and normalising whitespace afterwards keeps the word
    boundaries the page actually has.
    """
    # A <br> separates lines visually but contributes no text of its own.
    for line_break in tag.find_all('br'):
        line_break.replace_with('\n')
    return ' '.join(tag.get_text().split())


def scrape_lovecraft_content():
    """Scrape every fiction text linked from the index page into a CSV file.

    Fetches the index at ``base_url``, follows each link whose href starts with
    ``fiction/`` and writes one row per story (``Content Type``, ``Title``,
    ``Text``) to ``data/lovecraft_fiction.csv``, overwriting any existing file.
    Progress and failures are printed; a story that cannot be fetched or parsed
    is skipped so that one bad page does not abort the run.

    Returns:
        None. If the index page cannot be fetched, nothing is written.
    """
    base_url = "https://www.hplovecraft.com/writings/texts/"
    headers = {'User-Agent': 'Mozilla/5.0'}
    # requests never times out on its own; bound each request so a stalled
    # connection cannot hang the notebook.
    timeout = 30

    try:
        response = session.get(base_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to access the base URL: {e}")
        return

    if response.status_code != 200:
        print(f"Failed to access the base URL: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    # dict.fromkeys drops links that appear more than once while keeping page order.
    content_links = list(dict.fromkeys(
        f"{base_url}{link['href']}"
        for link in soup.find_all('a', href=True)
        if link['href'].startswith('fiction/')
    ))

    csv_filename = 'data/lovecraft_fiction.csv'
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Content Type', 'Title', 'Text'])

        for content_url in content_links:
            # Random pause between requests to avoid hammering the site.
            time.sleep(random.uniform(1, 3))
            try:
                content_response = session.get(content_url, headers=headers, timeout=timeout)
                if content_response.status_code != 200:
                    print(f'Failed to scrape {content_url}: {content_response.status_code}')
                    continue

                content_soup = BeautifulSoup(content_response.content, 'html.parser')
                title_tag = content_soup.find('font', size="+2")
                text_div = content_soup.find('div', align='justify')

                if not (title_tag and text_div):
                    print(f'Title or text not found for {content_url}')
                    continue

                title = clean_title(_visible_text(title_tag))  # remove author credits
                csvwriter.writerow(["fiction", title, _visible_text(text_div)])
                print(f'Scraped: {title}')
            except Exception as e:
                # Deliberately broad: log the failure and carry on with the next story.
                print(f'Error scraping {content_url}: {e}')

# Run the scrape. This performs one HTTP request per linked story, pausing
# 1-3 seconds before each, and rewrites data/lovecraft_fiction.csv from scratch.
scrape_lovecraft_content()


In [None]:
import pandas as pd
from textblob import TextBlob

import re

# Load scraped CSV data into DF:
df = pd.read_csv('data/lovecraft_fiction.csv')

# read_csv turns an empty text field into NaN; use '' instead so the string
# operations below do not fail on a float.
df['Text'] = df['Text'].fillna('')

# View first few rows
print(df.head())

# Count the number of words per story. A word is a run of letters, optionally
# joined by internal apostrophes or hyphens ("don't", "half-seen"); surrounding
# punctuation is ignored. Filtering whitespace-split tokens with str.isalpha()
# instead would discard every word that touches punctuation ("night,", "end.").
_WORD_RE = re.compile(r"[^\W\d_]+(?:['’-][^\W\d_]+)*")
df['Word Count'] = df['Text'].apply(lambda text: len(_WORD_RE.findall(text)))

# View the updated dataframe with word counts
print(df[['Title', 'Word Count']].head())

# 1. Calculate Text Length (Character Counts)
df['Text Length'] = df['Text'].apply(len)

# 2. Perform Sentiment Analysis (polarity)
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text`` (-1 to 1)."""
    return TextBlob(text).sentiment.polarity

# Score every story and store the result next to its text
df['Sentiment'] = df['Text'].map(get_sentiment)

# Show the length and sentiment columns for the first few stories
print(df.loc[:, ['Title', 'Text Length', 'Sentiment']].head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming your df has the following columns: 'Word Count', 'Sentiment', and 'Text'
# If not, make sure to adjust as per your dataset

# One figure with three side-by-side panels
fig, (ax_words, ax_sentiment, ax_scatter) = plt.subplots(1, 3, figsize=(18, 6))

# Panel 1: word-count boxplot with the mean marked by a dashed red line
mean_word_count = df['Word Count'].mean()
sns.boxplot(x=df['Word Count'], ax=ax_words)
ax_words.axvline(mean_word_count, color='red', linestyle='--', label=f'Average: {mean_word_count:.2f}')
ax_words.set_title('Distribution of Word Counts in Lovecraft’s Fiction')
ax_words.set_xlabel('Word Count')
ax_words.legend()

# Panel 2: histogram of sentiment scores with a KDE overlay
sns.histplot(df['Sentiment'], kde=True, color='purple', bins=20, ax=ax_sentiment)
ax_sentiment.set_title('Distribution of Sentiment Scores')
ax_sentiment.set_xlabel('Sentiment Score')
ax_sentiment.set_ylabel('Frequency')

# Panel 3: sentiment against word count, one point per story
sns.scatterplot(x='Word Count', y='Sentiment', data=df, color='orange', ax=ax_scatter)
ax_scatter.set_title('Sentiment Score vs. Word Count')
ax_scatter.set_xlabel('Word Count')
ax_scatter.set_ylabel('Sentiment Score')

# Render the three panels together
fig.tight_layout()
plt.show()

# Analysis

# 1. **Average Word Count Distribution**:
# The first plot shows the distribution of word counts in the dataset. The red dashed line indicates the average word count.
# From this plot, you can observe whether most stories are around the average word count or if there are any extreme values. 
# For example, if there are many outliers, it suggests that some stories are much longer or shorter than the rest.
# This information is useful for understanding the typical length of Lovecraft’s stories and spotting anomalies.

# 2. **Sentiment Score Distribution**:
# The second plot presents the distribution of sentiment scores. The use of a KDE (Kernel Density Estimate) gives us a smooth curve to visualize the overall sentiment.
# If the sentiment score is skewed toward positive or negative values, it could indicate that most stories have a certain mood (e.g., negative or neutral).
# This visualization is helpful to understand the general sentiment across all stories. If there's a sharp peak at one end, it suggests a dominant tone in the dataset.

# 3. **Sentiment vs. Word Count**:
# The scatter plot shows how sentiment and word count are related. It’s important to see if longer stories tend to have a more positive or negative sentiment. 
# For instance, if longer stories cluster towards positive sentiment, it might suggest that Lovecraft's longer works have a more optimistic tone, or vice versa.
# This bivariate analysis is useful for identifying trends or correlations between the length of stories and their emotional tone.


In [None]:
from scipy.stats import pearsonr

# Pearson correlation between story length (in words) and sentiment polarity
correlation, p_value = pearsonr(df['Word Count'], df['Sentiment'])
print(f"Pearson's correlation coefficient: {correlation:.2f}")
print(f"P-value: {p_value:.4f}")

# Report significance at the 5% level
print(
    "The correlation is statistically significant."
    if p_value < 0.05
    else "The correlation is not statistically significant."
)


In [None]:
from nltk.corpus import stopwords

# Download stopwords:
nltk.download('stopwords')

# Build the stop-word set once; it is identical for every story, so rebuilding
# it inside clean_text would repeat the same corpus read on every row.
_STOP_WORDS = frozenset(stopwords.words('english'))

# Tokenize and remove stopwords:
def clean_text(text):
    """Return ``text`` lower-cased, tokenized and stripped of stop words.

    Only purely alphanumeric tokens are kept, so punctuation tokens and any
    token containing a hyphen or apostrophe are dropped. The surviving tokens
    are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalnum() and lowered not in _STOP_WORDS:
            kept.append(lowered)
    return " ".join(kept)

# Apply to text column in DF (a missing text is treated as an empty story):
df['Cleaned_Text'] = df['Text'].fillna('').apply(clean_text)


In [None]:
from collections import Counter

# Collect every token of every cleaned story, then tally them
all_words = [token for cleaned in df['Cleaned_Text'] for token in cleaned.split()]
word_freq = Counter()
word_freq.update(all_words)

# Show the 100 most frequent words
print(word_freq.most_common(100))


In [None]:
# Disabled draft: entity extraction via spaCy's generic NER labels (PERSON / LOC).
# It is wrapped in a string literal, so none of it executes; a later cell defines
# the extract_entities function that the rest of the notebook uses.
'''import spacy
from collections import Counter

# Load spaCy model:
nlp = spacy.load("en_core_web_sm")

# Function to extract entities:
def extract_entities(text):
    doc = nlp(text)
    entities = [ent.text.lower() for ent in doc.ents if ent.label_ in ['PERSON', 'LOC']]
    return entities

# Apply the extraction to the 'Cleaned_Text' column
df['Entities'] = df['Cleaned_Text'].apply(extract_entities)

# Flatten the list of entities and get their frequency count
all_entities = [entity for sublist in df['Entities'] for entity in sublist]
entity_freq = Counter(all_entities)

# Display the most common entities
print(entity_freq.most_common(100))
'''

In [None]:
import spacy
import re
from collections import Counter

# Load spaCy model for Entity Recognition:
nlp = spacy.load("en_core_web_sm")

# List of Lovecraftian entities to search for. All entries are lower-case
# because extract_entities matches them against lower-cased text.
# NOTE(review): some entries ("derleth", "animus", "archon", "eidolon", "gog", ...)
# look like ordinary words or names from outside Lovecraft's own stories and may
# produce false positives -- confirm the list against the intended source.
lovecraftian_entities = [
    "cthulhu", "yog-sothoth", "nyarlathotep", "azathoth", "hastur", "r'lyeh", "dagon", 
    "shub-niggurath", "the great old ones", "elder gods", "the old ones", "the deep ones", "night gaunts", 
    "cthulhu cult", "the nameless city", "the black stone", "the dreamlands", "fenric", "hecuba", 
    "animus", "tor-gasukk", "moloch", "kai'lizakia", "lloigor", "eidolon", "derleth", "gog", "magog", "to'koth", 
    "karnas'koi", "traguam", "archon", "mi'en kalarash", "kwundaar", "volund", "k'thun", "noth-yidik", "tru'nembra", 
    "tulzscha", "cxaxukluth", "d'endrrah", "ubbo-sathla", "xexanoth", "ycnàgnnisssz", "yhoundeh", "aiueb gnshal", 
    "aletheia", "azhorra-tha", "c'thalpa", "daoloth", "ghroth", "gi-hoveg", "haiogh-yai", "huitloxopetl", "ialdagorth", 
    "kaajh'kaalbh", "kaalut", "lu-kthu", "mh'ithrha", "mlandoth", "mril thorion", "mother of pus", "nhimbaloth", 
    "ngyr-khorath", "nyctelios", "olkoth", "ramasekva", "shabbith-ka", "star mother", "suc'naath", "uvhash", "xa'ligha", 
    "yibb-tstll", "yidhra", "yomag'n'tho"
]

# Indirect references or variations, written as regex patterns that are matched
# against lower-cased text. Defined at module level so the list is built once
# rather than on every extract_entities call.
indirect_references = [
    r"\bdeep ones\b", r"\bcosmic entity\b", r"\bhorrible being\b", r"\bnight gaunts\b", r"\bblack stone\b",
    r"\byog sothoth\b", r"\bnameless city\b", r"\bstrange entity\b", r"\botherworldly creature\b", r"\bdark god\b",
    r"\bhorrible power\b", r"\btimeless one\b"
]

# Function to extract entities
def extract_entities(text, entities=None, indirect_patterns=None):
    """List the known entities and indirect references that occur in ``text``.

    Matching is case-insensitive (the text is lower-cased once) and
    presence-based: each name or phrase is reported at most once per text.

    Args:
        text: The story text. Anything that is not a string (for example a NaN
            from a missing CSV field) yields an empty list.
        entities: Lower-case entity names to look for, matched as whole words in
            singular or simple plural ("...s") form. Defaults to the module-level
            ``lovecraftian_entities``.
        indirect_patterns: Regex patterns for indirect references. Defaults to
            the module-level ``indirect_references``.

    Returns:
        A list with the matched entity names first (in ``entities`` order),
        followed by the matched indirect phrases (in ``indirect_patterns``
        order). A phrase is left out when an entity that was already found
        contains it, so "the deep ones" is not counted again as "deep ones".
    """
    if not isinstance(text, str):
        return []
    if entities is None:
        entities = lovecraftian_entities
    if indirect_patterns is None:
        indirect_patterns = indirect_references

    lowered = text.lower()
    found = []

    # Direct mentions: whole-word match, with an optional trailing "s" for plurals.
    for entity in entities:
        if re.search(r'\b' + re.escape(entity) + r's?\b', lowered):
            found.append(entity)

    # Indirect references are checked once per text and recorded as the phrase
    # that matched; they must not be attributed to every entity in the list.
    for pattern in indirect_patterns:
        match = re.search(pattern, lowered)
        if match is None:
            continue
        phrase = match.group(0)
        if not any(phrase in known for known in found):
            found.append(phrase)

    return found

# Run the extraction on the raw story text. 'Cleaned_Text' is unsuitable here:
# clean_text removes stop words such as "the" and every token that contains a
# hyphen or apostrophe, so names like "the deep ones", "yog-sothoth" or "r'lyeh"
# can never occur in it. A missing text is treated as an empty story.
df['Entities'] = df['Text'].fillna('').apply(extract_entities)

# Flatten the list of entities and get their frequency count
all_entities = [entity for sublist in df['Entities'] for entity in sublist]
entity_freq = Counter(all_entities)

# Display the most common entities
print(entity_freq.most_common(100))

# Counts for the tracked entities only. A Counter returns 0 for a missing key,
# which avoids rescanning the whole list once per entity.
specific_entity_freq = {entity: entity_freq[entity] for entity in lovecraftian_entities}

# Display the count of these specific entities
print(specific_entity_freq)


In [None]:
from collections import Counter

# Gather the entities listed in every row of the 'Entities' column and tally them
all_entities = []
for row_entities in df['Entities']:
    all_entities.extend(row_entities)
entity_mentions = Counter(all_entities)

# (entity, mentions) pairs, in first-seen order
entities = list(entity_mentions.items())

# Function to filter and categorize entities:
def filter_lovecraft_entities(entities, exclude_humans=True):
    """Sort ``(entity, mentions)`` pairs into fixed thematic categories.

    Args:
        entities: Iterable of ``(entity, mentions)`` pairs.
        exclude_humans: Accepted for callers but never read by this function.

    Returns:
        A dict mapping each category name to the list of pairs assigned to it,
        in input order. An entity goes to the first category (in the order
        below) whose set contains it; anything unrecognised ends up under
        'Mythos-Related Concepts'.
    """
    fallback_label = 'Mythos-Related Concepts'

    # Ordered lookup table: earlier groups take precedence over later ones.
    known_groups = [
        ('Cthulhu Mythos', {
            'cthulhu', 'great cthulhu', 'nyarlathotep', 'azathoth', 'shub-niggurath',
            'dagon', 'yog-sothoth', 'hastur', 'ubbo-sathla', 'ghroth', 'ycnàgnnisssz',
        }),
        ('Locations & Settings', {
            'arkham', 'miskatonic', 'innsmouth', 'dunwich', "r'lyeh",
            'the dreamlands', 'the nameless city',
        }),
        ('Cosmic Entities', {
            'elder gods', 'the old ones', 'the great old ones', 'night gaunts', 'the deep ones',
            'colour out of space', 'yog sothoth', 'tulzscha', 'cxaxukluth', 'yhoundeh',
            'aiueb gnshal', 'aletheia', 'azhorra-tha', "c'thalpa", 'daoloth', 'ghroth', 'gi-hoveg',
            'haiogh-yai', 'huitloxopetl', 'ialdagorth', "kaajh'kaalbh", 'kaalut', 'lu-kthu',
            "mh'ithrha", 'mlandoth', 'mril thorion', 'mother of pus', 'nhimbaloth',
            'ngyr-khorath', 'nyctelios', 'olkoth', 'ramasekva', 'shabbith-ka', 'star mother',
            "suc'naath", 'uvhash', "xa'ligha", 'yibb-tstll', 'yidhra', "yomag'n'tho",
        }),
        ('Occult Entities', {
            'toymakers', 'guardians of time', 'the great intelligence', 'moloch',
            'hecuba', 'animus', 'archon', 'kwundaar', 'volund', 'noth-yidik', "tru'nembra",
        }),
    ]

    categories = {label: [] for label, _ in known_groups}
    categories[fallback_label] = []

    for name, mentions in entities:
        label = next(
            (group_label for group_label, members in known_groups if name in members),
            fallback_label,
        )
        categories[label].append((name, mentions))

    return categories

# Categorize the tallied entities
filtered_entities = filter_lovecraft_entities(entities)

# Print each category as a header followed by its "entity: mentions" lines
for category in filtered_entities:
    entities_list = filtered_entities[category]
    print(f"--- {category} ---")
    for entity, mentions in entities_list:
        print(f"{entity}: {mentions}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Gather the entities from every row of the Entities column and tally them:
all_entities = []
for row_entities in df['Entities']:
    all_entities.extend(row_entities)
entity_mentions = Counter(all_entities)

# Ten most frequent entities as a two-column DF:
top_ten = entity_mentions.most_common(10)
entity_df = pd.DataFrame(top_ten, columns=['Entity', 'Frequency'])

# Horizontal bar chart of the top 10 entities:
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=entity_df, x='Frequency', y='Entity', hue='Entity', palette='viridis', ax=ax)
ax.set_title('Top 10 Lovecraftian Entities')
ax.set_xlabel('Frequency of Mentions')
ax.set_ylabel('Entity')
plt.show()


In [None]:
# Disabled: export of the top-entity table to CSV. The statement sits inside a
# string literal, so it does not execute and no file is written.
"""# Save entity frequencies to CSV:
entity_df.to_csv('data/entity_frequencies.csv', index=False)
"""

In [None]:
import seaborn as sns

def _plot_frequency_bars(labels, values, title, xlabel):
    """Draw and show one vertical bar chart of ``values`` for ``labels``."""
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(labels), y=list(values))
    plt.xticks(rotation=90)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.show()

# Most common words (word frequency analysis)
top_words = word_freq.most_common(20)
words, counts = zip(*top_words)
_plot_frequency_bars(words, counts, 'Top 20 Most Common Words', 'Words')

# Most common entities
top_entities = entity_freq.most_common(20)
entities, entity_counts = zip(*top_entities)
_plot_frequency_bars(entities, entity_counts, 'Top 20 Most Common Lovecraftian Entities', 'Entities')


In [None]:
# Quick check: list the columns the DataFrame has at this point.
print(df.columns)


In [None]:
import pandas as pd
import numpy as np

# Build a small example dataset
# NOTE(review): this cell rebinds `df` and `entity_freq`, which earlier cells use
# for the scraped stories and the entity Counter -- re-running those cells after
# this one will see the example data instead. Confirm that is intended.
data = {
    'Title': ['The Call of Cthulhu', 'At the Mountains of Madness', 'The Dunwich Horror', 'The Shadow over Innsmouth'],
    'Word_Count': [27000, 41000, 17500, 27000],
    'Entities': [['Cthulhu', 'Rlyeh'], ['Elder Things', 'Shoggoths'], ['Yog-Sothoth', 'Whateley'], ['Deep Ones', 'Innsmouth']],
    'Genre': ['Horror', 'Horror', 'Horror', 'Horror']
}
df = pd.DataFrame(data)

# 1. apply: derive a new column from an existing one
# Number of entities listed for each story
df['Entity_Count'] = df['Entities'].apply(len)

# 2. Groupby and aggregation
stories_by_genre = df.groupby('Genre')
# Average word count per genre
avg_word_count = stories_by_genre['Word_Count'].mean()
# Total number of entities per genre
total_entities = stories_by_genre['Entity_Count'].sum()

# 3. Frequency table
# Count how often each individual entity occurs
entity_list = []
for story_entities in df['Entities']:
    entity_list.extend(story_entities)
entity_freq = pd.Series(entity_list).value_counts()

# Print the results, each under its heading
for heading, result in (
    ("\nDataFrame mit neuer Spalte Entity_Count:", df),
    ("\nDurchschnittliche Wortanzahl pro Genre:", avg_word_count),
    ("\nGesamtanzahl der Entities pro Genre:", total_entities),
    ("\nHäufigkeitstabelle der Entities:", entity_freq),
):
    print(heading)
    print(result)