In [None]:
import requests
import json
import pandas as pd
import re
import time
import os
import random
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import nltk
from nltk.tokenize import word_tokenize
import csv

In [None]:
# Set up necessary directories and configurations:
# 'data/' receives the CSV written by the scraper below.
os.makedirs('data', exist_ok=True)

# One shared session: every request goes through the same retry policy
# (up to 3 retries with back-off on transient 5xx responses).
session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
for scheme in ('http://', 'https://'):
    session.mount(scheme, HTTPAdapter(max_retries=retries))

# word_tokenize needs the Punkt sentence model. Older NLTK releases read the
# 'punkt' package, newer ones read 'punkt_tab' and raise LookupError without
# it, so fetch both to keep the tokenising cell working on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Clean title by standardizing the "By H. P. Lovecraft" text:
def clean_title(title):
    """Normalise the "By H. P. Lovecraft" by-line inside a scraped title.

    Consecutive repetitions of the by-line (optionally separated by
    whitespace) are collapsed into one, surrounding whitespace is stripped,
    and a by-line glued directly onto the preceding text gets a separating
    space.

    Args:
        title: Raw title string.

    Returns:
        The cleaned title. Titles without the by-line are only stripped.
    """
    author_text = "By H. P. Lovecraft"
    # Escape the by-line so its dots match literal periods, not any character.
    escaped_author = re.escape(author_text)
    # Only collapse *repeated* by-lines; matching a lone by-line plus trailing
    # whitespace would swallow the space before whatever follows it.
    title = re.sub(
        rf"{escaped_author}(?:\s*{escaped_author})+", author_text, title
    ).strip()

    if title.endswith(author_text):
        heading = title[:-len(author_text)]
        # Insert the space only when there is preceding text to separate from;
        # a title that is nothing but the by-line must not gain a leading space.
        if heading and not heading.endswith(" "):
            title = f"{heading} {author_text}"
    return title

# --- Step 1: Scraping Lovecraft Fiction Works ---

def scrape_lovecraft_content(timeout=30):
    """Scrape every fiction text linked from the index page into a CSV file.

    Fetches the index at ``base_url``, follows each relative link that starts
    with ``fiction/`` and writes one row per story
    (``Content Type``, ``Title``, ``Text``) to ``data/lovecraft_fiction.csv``,
    overwriting any existing file. Progress and failures are printed.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        None. If the index page does not answer with HTTP 200 the function
        prints the status code and returns without touching the CSV.
    """
    base_url = "https://www.hplovecraft.com/writings/texts/"
    response = session.get(base_url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to access the base URL: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    fiction_hrefs = [
        link['href']
        for link in soup.find_all('a', href=True)
        if link['href'].startswith('fiction/')
    ]
    # dict.fromkeys drops repeated links while keeping page order, so a story
    # linked twice from the index is neither fetched nor written twice.
    content_links = [f"{base_url}{href}" for href in dict.fromkeys(fiction_hrefs)]

    csv_filename = 'data/lovecraft_fiction.csv'
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Content Type', 'Title', 'Text'])

        for content_url in content_links:
            # Random pause between requests to avoid hammering the site.
            time.sleep(random.uniform(1, 3))
            try:
                content_response = session.get(
                    content_url,
                    headers={'User-Agent': 'Mozilla/5.0'},
                    timeout=timeout,
                )
                if content_response.status_code != 200:
                    print(f'Failed to scrape {content_url}: {content_response.status_code}')
                    continue

                content_soup = BeautifulSoup(content_response.content, 'html.parser')
                title_tag = content_soup.find('font', size="+2")
                author_tag = content_soup.find('font', size="+1")
                text_div = content_soup.find('div', align='justify')

                if not (title_tag and text_div):
                    print(f'Title or text not found for {content_url}')
                    continue

                # A " " separator keeps words apart where the markup splits
                # them across tags; with strip=True alone they are fused.
                heading = title_tag.get_text(" ", strip=True)
                # The author tag is optional: a page without it must still be
                # saved rather than fail on None.get_text().
                author = author_tag.get_text(" ", strip=True) if author_tag else ""

                if not author:
                    title = heading
                elif author.lower().startswith("by "):
                    # Tag text already carries its own "By"; don't write "by By".
                    title = f"{heading} {author}"
                else:
                    title = f"{heading} by {author}"
                title = clean_title(title)  # Clean the title text

                csvwriter.writerow(["fiction", title, text_div.get_text(" ", strip=True)])
                print(f'Scraped: {title}')
            except Exception as e:
                # Best effort: one bad page must not abort the whole crawl.
                print(f'Error scraping {content_url}: {e}')

# Scrape only fiction content:
# this performs live HTTP requests and rewrites data/lovecraft_fiction.csv.
scrape_lovecraft_content()


In [None]:
# Read the CSV produced by the scraper back into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/lovecraft_fiction.csv')

# Print the first five rows as a quick sanity check
print(df.head(n=5))


In [None]:
from nltk.corpus import stopwords

# Ensure stopwords are downloaded
# (clean_text below reads the English list via stopwords.words('english'))
nltk.download('stopwords')

# Tokenize and remove stopwords
def clean_text(text):
    """Tokenise ``text`` and return its lowercase content words as one string.

    Tokens are kept only when they are purely alphanumeric and are not
    English stopwords; the survivors are lowercased and joined with single
    spaces.

    Args:
        text: Story text. Non-string values (for example the NaN that
            ``pd.read_csv`` produces for an empty field) are treated as empty.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # An empty CSV field is read back as float NaN, which word_tokenize
    # cannot handle; treat anything that is not a string as "no text".
    if not isinstance(text, str):
        return ""

    stop_words = set(stopwords.words('english'))
    cleaned_tokens = [
        token.lower()
        for token in word_tokenize(text)
        if token.isalnum() and token.lower() not in stop_words
    ]
    return " ".join(cleaned_tokens)

# Store the cleaned form of every story's 'Text' in a new 'Cleaned_Text' column
df['Cleaned_Text'] = df['Text'].map(clean_text)


In [None]:
from collections import Counter

# Gather every token from every cleaned story into one flat list, then tally it
all_words = [token for cleaned in df['Cleaned_Text'] for token in cleaned.split()]
word_freq = Counter()
word_freq.update(all_words)

# Show the ten highest-frequency words
print(word_freq.most_common(n=10))


In [None]:
import spacy
from collections import Counter

# Load the spaCy model for Named Entity Recognition
# (`nlp` is the pipeline object that extract_entities calls on each text)
nlp = spacy.load("en_core_web_sm")

# Function to extract entities
def extract_entities(text):
    """Return the lowercased PERSON/ORG/GPE/LOC entities spaCy finds in ``text``.

    Entities are returned in document order, one item per recognised span,
    so a name mentioned several times appears several times.
    """
    wanted_labels = ('PERSON', 'ORG', 'GPE', 'LOC')
    entities = []
    for span in nlp(text).ents:
        if span.label_ in wanted_labels:
            entities.append(span.text.lower())
    return entities

# Apply the extraction to the raw 'Text' column rather than 'Cleaned_Text':
# the cleaned column is lowercased with stopwords and punctuation removed,
# which strips the casing and sentence context the NER model relies on.
# extract_entities lowercases the entity text it returns, so keys stay lowercase.
# fillna('') keeps a story with an empty text field from reaching nlp() as NaN.
df['Entities'] = df['Text'].fillna('').apply(extract_entities)

# Flatten the list of entities and get their frequency count
all_entities = [entity for sublist in df['Entities'] for entity in sublist]
entity_freq = Counter(all_entities)

# Display the most common entities
print(entity_freq.most_common(100))


In [None]:
import spacy
import re
from collections import Counter

# Load the spaCy model for Named Entity Recognition
# NOTE(review): the earlier NER cell already binds `nlp` to this same model;
# this line loads it a second time.
nlp = spacy.load("en_core_web_sm")

# Expanded list of Lovecraftian and related entities (including new ones)
# Gazetteer of names searched for verbatim by extract_entities below. Every
# entry is lowercase because the text is lowercased before matching. Several
# entries contain hyphens, apostrophes or stopwords ("the", "of").
lovecraft_entities_expanded = [
    "cthulhu", "yog-sothoth", "nyarlathotep", "azathoth", "hastur", "r'lyeh", "dagon", 
    "shub-niggurath", "the great old ones", "elder gods", "the old ones", "the deep ones", "night gaunts", 
    "cthulhu cult", "the nameless city", "the black stone", "the dreamlands", "fenric", "hecuba", 
    "animus", "tor-gasukk", "moloch", "kai'lizakia", "lloigor", "eidolon", "derleth", "gog", "magog", "to'koth", 
    "karnas'koi", "traguam", "archon", "mi'en kalarash", "kwundaar", "volund", "k'thun", "noth-yidik", "tru'nembra", 
    "tulzscha", "cxaxukluth", "d'endrrah", "ubbo-sathla", "xexanoth", "ycnàgnnisssz", "yhoundeh", "aiueb gnshal", 
    "aletheia", "azhorra-tha", "c'thalpa", "daoloth", "ghroth", "gi-hoveg", "haiogh-yai", "huitloxopetl", "ialdagorth", 
    "kaajh'kaalbh", "kaalut", "lu-kthu", "mh'ithrha", "mlandoth", "mril thorion", "mother of pus", "nhimbaloth", 
    "ngyr-khorath", "nyctelios", "olkoth", "ramasekva", "shabbith-ka", "star mother", "suc'naath", "uvhash", "xa'ligha", 
    "yibb-tstll", "yidhra", "yomag'n'tho"
]

# Function to extract entities
def extract_entities(text, known_entities=None):
    """Collect spaCy entities plus gazetteer and indirect-reference hits.

    The result is the concatenation of three groups:

    1. every PERSON/ORG/GPE/LOC span spaCy recognises, lowercased, once per
       mention;
    2. every gazetteer name found in the lowercased text (singular or with a
       trailing "s"), at most once per call;
    3. every indirect reference found in the text, at most once per call,
       recorded under the gazetteer name it stands for when it has one
       (for example "deep ones" -> "the deep ones") and under its own phrase
       otherwise (for example "cosmic entity").

    Args:
        text: Story text. Non-string values yield an empty list.
        known_entities: Lowercase names to search for. Defaults to the
            module-level ``lovecraft_entities_expanded``.

    Returns:
        A list of lowercase entity strings.
    """
    if not isinstance(text, str):
        return []
    if known_entities is None:
        known_entities = lovecraft_entities_expanded

    # Each indirect phrase maps to the single label it should be counted as.
    # Previously a hit on any phrase credited whichever gazetteer entry the
    # outer loop was on, i.e. every entry in turn, inflating all counts alike.
    indirect_references = {
        r"\bdeep ones\b": "the deep ones",
        r"\bcosmic entity\b": "cosmic entity",
        r"\bhorrible being\b": "horrible being",
        r"\bnight gaunts\b": "night gaunts",
        r"\bblack stone\b": "the black stone",
        r"\byog sothoth\b": "yog-sothoth",
        r"\bnameless city\b": "the nameless city",  # was misspelt "namesless"
        r"\bstrange entity\b": "strange entity",
        r"\botherworldly creature\b": "otherworldly creature",
        r"\bdark god\b": "dark god",
        r"\bhorrible power\b": "horrible power",
        r"\btimeless one\b": "timeless one",
    }

    # Extract the named entities using spaCy's NER (PERSON, ORG, GPE, LOC)
    entities = [
        ent.text.lower()
        for ent in nlp(text).ents
        if ent.label_ in ('PERSON', 'ORG', 'GPE', 'LOC')
    ]

    # Lowercase once, and fold the typographic apostrophe into the ASCII one
    # so names such as "r'lyeh" match text typeset with a curly quote.
    lowered = text.lower().replace("\u2019", "'")
    matched = set()

    # Direct mentions, singular or plural.
    for entity in known_entities:
        if re.search(r'\b' + re.escape(entity) + r's?\b', lowered):
            entities.append(entity)
            matched.add(entity)

    # Indirect mentions; skip labels the direct pass already recorded so one
    # story never counts the same name twice through the gazetteer.
    for pattern, label in indirect_references.items():
        if label not in matched and re.search(pattern, lowered):
            entities.append(label)
            matched.add(label)

    return entities

# Apply the extraction to the raw 'Text' column rather than 'Cleaned_Text':
# the cleaned column has had stopwords and every non-alphanumeric token
# removed, so names such as "yog-sothoth", "r'lyeh", "the great old ones" or
# "mother of pus" can never occur in it. extract_entities lowercases the text
# itself. fillna('') keeps an empty text field from arriving as NaN.
df['Entities'] = df['Text'].fillna('').apply(extract_entities)

# Flatten the list of entities and get their frequency count
all_entities = [entity for sublist in df['Entities'] for entity in sublist]
entity_freq = Counter(all_entities)

# Display the most common entities
print(entity_freq.most_common(100))

# Custom list of Lovecraft entities to track (including new entities).
# A Counter returns 0 for a missing key, so this yields the same numbers as
# all_entities.count(entity) without rescanning the list for every name.
specific_entity_freq = {entity: entity_freq[entity] for entity in lovecraft_entities_expanded}

# Display the count of these specific entities
print(specific_entity_freq)


In [None]:
import spacy
import re
from collections import Counter

# Load the spaCy model for Named Entity Recognition
# NOTE(review): the two previous NER cells already bind `nlp` to this same
# model; this line loads it again.
nlp = spacy.load("en_core_web_sm")

# Expanded list of Lovecraftian and related entities (including new ones)
# NOTE(review): this literal repeats the list defined in the previous cell and
# rebinds the same name. Every entry is lowercase because extract_entities
# lowercases the text before searching for these names.
lovecraft_entities_expanded = [
    "cthulhu", "yog-sothoth", "nyarlathotep", "azathoth", "hastur", "r'lyeh", "dagon", 
    "shub-niggurath", "the great old ones", "elder gods", "the old ones", "the deep ones", "night gaunts", 
    "cthulhu cult", "the nameless city", "the black stone", "the dreamlands", "fenric", "hecuba", 
    "animus", "tor-gasukk", "moloch", "kai'lizakia", "lloigor", "eidolon", "derleth", "gog", "magog", "to'koth", 
    "karnas'koi", "traguam", "archon", "mi'en kalarash", "kwundaar", "volund", "k'thun", "noth-yidik", "tru'nembra", 
    "tulzscha", "cxaxukluth", "d'endrrah", "ubbo-sathla", "xexanoth", "ycnàgnnisssz", "yhoundeh", "aiueb gnshal", 
    "aletheia", "azhorra-tha", "c'thalpa", "daoloth", "ghroth", "gi-hoveg", "haiogh-yai", "huitloxopetl", "ialdagorth", 
    "kaajh'kaalbh", "kaalut", "lu-kthu", "mh'ithrha", "mlandoth", "mril thorion", "mother of pus", "nhimbaloth", 
    "ngyr-khorath", "nyctelios", "olkoth", "ramasekva", "shabbith-ka", "star mother", "suc'naath", "uvhash", "xa'ligha", 
    "yibb-tstll", "yidhra", "yomag'n'tho"
]

# Function to extract entities
def extract_entities(text, known_entities=None):
    """Find gazetteer names and indirect references in ``text`` (no spaCy NER).

    Each label is reported at most once per call, so counting the results
    over many texts gives the number of texts that mention each name.

    * A gazetteer name matches when it occurs in the lowercased text as a
      whole word, either as written or with a trailing "s".
    * An indirect reference is recorded under the gazetteer name it stands
      for when it has one (for example "deep ones" -> "the deep ones") and
      under its own phrase otherwise (for example "cosmic entity").

    Args:
        text: Story text. Non-string values yield an empty list.
        known_entities: Lowercase names to search for. Defaults to the
            module-level ``lovecraft_entities_expanded``.

    Returns:
        A list of lowercase labels: gazetteer hits in gazetteer order,
        followed by indirect-reference hits.
    """
    if not isinstance(text, str):
        return []
    if known_entities is None:
        known_entities = lovecraft_entities_expanded

    # Each indirect phrase maps to the single label it should be counted as.
    # Previously a hit on any phrase credited whichever gazetteer entry the
    # outer loop was on, i.e. every entry in turn, inflating all counts alike.
    indirect_references = {
        r"\bdeep ones\b": "the deep ones",
        r"\bcosmic entity\b": "cosmic entity",
        r"\bhorrible being\b": "horrible being",
        r"\bnight gaunts\b": "night gaunts",
        r"\bblack stone\b": "the black stone",
        r"\byog sothoth\b": "yog-sothoth",
        r"\bnameless city\b": "the nameless city",  # was misspelt "namesless"
        r"\bstrange entity\b": "strange entity",
        r"\botherworldly creature\b": "otherworldly creature",
        r"\bdark god\b": "dark god",
        r"\bhorrible power\b": "horrible power",
        r"\btimeless one\b": "timeless one",
    }

    # The spaCy pipeline is deliberately not run here: this variant never
    # used its output, and parsing every story is the slowest step by far.

    # Lowercase once, and fold the typographic apostrophe into the ASCII one
    # so names such as "r'lyeh" match text typeset with a curly quote.
    lowered = text.lower().replace("\u2019", "'")
    entities = []
    matched = set()

    # Direct mentions, singular or plural.
    for entity in known_entities:
        if re.search(r'\b' + re.escape(entity) + r's?\b', lowered):
            entities.append(entity)
            matched.add(entity)

    # Indirect mentions; skip labels the direct pass already recorded.
    for pattern, label in indirect_references.items():
        if label not in matched and re.search(pattern, lowered):
            entities.append(label)
            matched.add(label)

    return entities

# Apply the extraction to the raw 'Text' column rather than 'Cleaned_Text':
# the cleaned column has had stopwords and every non-alphanumeric token
# removed, so names such as "yog-sothoth", "r'lyeh", "the great old ones" or
# "mother of pus" can never occur in it. extract_entities lowercases the text
# itself. fillna('') keeps an empty text field from arriving as NaN.
df['Entities'] = df['Text'].fillna('').apply(extract_entities)

# Flatten the list of entities and get their frequency count
all_entities = [entity for sublist in df['Entities'] for entity in sublist]
entity_freq = Counter(all_entities)

# Display the most common entities
print(entity_freq.most_common(100))

# Custom list of Lovecraft entities to track (including new entities).
# A Counter returns 0 for a missing key, so this yields the same numbers as
# all_entities.count(entity) without rescanning the list for every name.
specific_entity_freq = {entity: entity_freq[entity] for entity in lovecraft_entities_expanded}

# Display the count of these specific entities
print(specific_entity_freq)


In [None]:
# List of entities with their mentions
# Each item is an (entity name, count) pair. The values are literals typed
# into this cell, not read from `specific_entity_freq`, so re-running the
# extraction cells does not update them.
# NOTE(review): most counts are exactly 12, which looks like an artifact of
# extract_entities appending every gazetteer entry whenever any
# indirect-reference pattern matches — confirm before relying on them.
entities = [
     ('cthulhu', 21), ('nyarlathotep', 20), ('azathoth', 17), ('eidolon', 17), ('elder gods', 16), ('dagon', 15), 
     ('cthulhu cult', 14), ('hastur', 13), ('moloch', 13), ('derleth', 13), ('archon', 13), ('yog-sothoth', 12), 
     ("r'lyeh", 12), ('shub-niggurath', 12), ('the great old ones', 12), ('the old ones', 12), ('the deep ones', 12), 
     ('night gaunts', 12), ('the nameless city', 12), ('the black stone', 12), ('the dreamlands', 12), ('fenric', 12), 
     ('hecuba', 12), ('animus', 12), ('tor-gasukk', 12), ("kai'lizakia", 12), ('lloigor', 12), ('gog', 12), ('magog', 12), 
     ("to'koth", 12), ("karnas'koi", 12), ('traguam', 12), ("mi'en kalarash", 12), ('kwundaar', 12), ('volund', 12), 
     ("k'thun", 12), ('noth-yidik', 12), ("tru'nembra", 12), ('tulzscha', 12), ('cxaxukluth', 12), ("d'endrrah", 12), 
     ('ubbo-sathla', 12), ('xexanoth', 12), ('ycnàgnnisssz', 12), ('yhoundeh', 12), ('aiueb gnshal', 12), ('aletheia', 12), 
     ('azhorra-tha', 12), ("c'thalpa", 12), ('daoloth', 12), ('ghroth', 12), ('gi-hoveg', 12), ('haiogh-yai', 12), 
     ('huitloxopetl', 12), ('ialdagorth', 12), ("kaajh'kaalbh", 12), ('kaalut', 12), ('lu-kthu', 12), ("mh'ithrha", 12), 
     ('mlandoth', 12), ('mril thorion', 12), ('mother of pus', 12), ('nhimbaloth', 12), ('ngyr-khorath', 12), ('nyctelios', 12), 
     ('olkoth', 12), ('ramasekva', 12), ('shabbith-ka', 12), ('star mother', 12), ("suc'naath", 12), ('uvhash', 12), 
     ("xa'ligha", 12), ('yibb-tstll', 12), ('yidhra', 12), ("yomag'n'tho", 12)
]

# Function to filter and categorize entities
def filter_lovecraft_entities(entities, exclude_humans=True):
    """Group ``(entity, mentions)`` pairs into thematic categories.

    Args:
        entities: Iterable of ``(name, mention_count)`` pairs.
        exclude_humans: When true, names found in the built-in list of human
            characters are kept out of the thematic categories and collected
            under ``'Humanoid Names'`` instead. When false they are
            categorised like any other name.

    Returns:
        A dict mapping category name to a list of ``(name, mention_count)``
        pairs in input order. The five thematic categories are always
        present; ``'Humanoid Names'`` is added only if at least one human
        name was set aside. Names not listed under any category go to
        ``'Mythos-Related Concepts'``.
    """
    # Human characters to set aside (a set: the source list repeated 'joe').
    human_names = frozenset([
        'carter', 'willett', 'joseph curwen', 'johnny', 'ammi', 'denis',
        'wilbur', 'ben', 'dobson', 'george campbell', 'fed', 'van der', 'steve',
        'dalton', 'joe slater', 'nahum', 'davis', 'wilbur whateley', 'joe',
        'john', 'nova', 'romero', 'robert grandison', 'joe mazurewicz', 'warren',
        'arthur jermyn', 'jermyn house', 'arthur munroe', 'james dalton',
        'anderson', 'robert suydam', 'herbert west', 'sefton', 'matt', 'miller',
        'arthur wheeler', 'robert', 'henry akeley', 'norman',
    ])

    # Known groupings, in the order the categories appear in the result.
    category_members = {
        'Cthulhu Mythos': ['cthulhu', 'great cthulhu', 'nyarlathotep', 'azathoth',
                           'shub-niggurath', 'dagon', 'yog-sothoth'],
        'Locations & Settings': ['arkham', 'miskatonic', 'innsmouth', 'dunwich',
                                 'r\'lyeh', 'the dreamlands', 'the nameless city'],
        'Cosmic Entities': ['elder gods', 'the old ones', 'the great old ones',
                            'night gaunts', 'the deep ones', 'colour out of space',
                            'yog sothoth'],
        'Occult Entities': ['toymakers', 'guardians of time', 'the great intelligence',
                            'moloch', 'hecuba', 'animus', 'archon'],
    }
    fallback_category = 'Mythos-Related Concepts'

    # Reverse index: name -> category, for a single lookup per entity.
    category_of = {
        member: category
        for category, members in category_members.items()
        for member in members
    }

    categories = {category: [] for category in category_members}
    categories[fallback_category] = []

    for entity, mentions in entities:
        # Compare case-insensitively everywhere; previously only the human
        # check lowercased the name, so 'Cthulhu' fell through to the fallback.
        key = entity.lower()

        if exclude_humans and key in human_names:
            # The bucket is created on first use: it was never initialised
            # before, so the first human name raised KeyError.
            categories.setdefault('Humanoid Names', []).append((entity, mentions))
            continue

        categories[category_of.get(key, fallback_category)].append((entity, mentions))

    return categories

# Categorise the hard-coded entity list
filtered_entities = filter_lovecraft_entities(entities)

# Report each category as a header line followed by its "name: count" lines
for category_name, members in filtered_entities.items():
    report_lines = [f"--- {category_name} ---"]
    report_lines.extend(f"{name}: {count}" for name, count in members)
    print("\n".join(report_lines))
