In [None]:
import requests
import json
import pandas as pd
import re
import time
import os
import random
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import nltk
from nltk.tokenize import word_tokenize
import csv

In [None]:
# NOTE: the scraper below is deliberately wrapped in a triple-quoted string, so
# running this cell does not execute it (no directory is created and no request
# is sent). Un-quote it to regenerate 'data/lovecraft_fiction.csv', the file the
# following cell loads.
# NOTE(review): `author_tag` is dereferenced when building the title without the
# None check that `title_tag` and `text_div` get -- confirm before re-enabling.
'''# Set up necessary directories and configurations:
os.makedirs('data', exist_ok=True)
session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))
nltk.download('punkt')

# Clean title by standardizing the "By H. P. Lovecraft" text:
def clean_title(title):
    author_text = "By H. P. Lovecraft"
    title = re.sub(rf"({author_text}\s*)+", author_text, title).strip()
    if title.endswith(author_text) and not title.endswith(" " + author_text):
        title = title.replace(author_text, " " + author_text)
    return title

# Scraping Lovecraft Fiction:

def scrape_lovecraft_content():
    base_url = "https://www.hplovecraft.com/writings/texts/"
    response = session.get(base_url)
    
    if response.status_code != 200:
        print(f"Failed to access the base URL: {response.status_code}")
        return
    
    soup = BeautifulSoup(response.content, 'html.parser')
    content_links = [
        f"{base_url}{link['href']}"
        for link in soup.find_all('a', href=True)
        if link['href'].startswith('fiction/') and not link['href'].startswith('#')
    ]

    csv_filename = 'data/lovecraft_fiction.csv'
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Content Type', 'Title', 'Text'])

        for content_url in content_links:
            time.sleep(random.uniform(1, 3))
            try:
                content_response = session.get(content_url, headers={'User-Agent': 'Mozilla/5.0'})
                if content_response.status_code == 200:
                    content_soup = BeautifulSoup(content_response.content, 'html.parser')
                    title_tag = content_soup.find('font', size="+2")
                    author_tag = content_soup.find('font', size="+1")
                    text_div = content_soup.find('div', align='justify')

                    if title_tag and text_div:
                        title = f"{title_tag.get_text(strip=True)} by {author_tag.get_text(strip=True)}"
                        title = clean_title(title)  # Clean the title text
                        csvwriter.writerow(["fiction", title, text_div.get_text(strip=True)])
                        print(f'Scraped: {title}')
                    else:
                        print(f'Title or text not found for {content_url}')
                else:
                    print(f'Failed to scrape {content_url}: {content_response.status_code}')
            except Exception as e:
                print(f'Error scraping {content_url}: {e}')

scrape_lovecraft_content()
'''

In [None]:
# Load scraped CSV data into DF:
# The path is relative, so 'data/lovecraft_fiction.csv' must already exist under
# the kernel's working directory. Later cells read the 'Text' column of `df`.
df = pd.read_csv('data/lovecraft_fiction.csv')

# View first few rows:
# Printed as plain text with print() rather than returned as the cell's value.
print(df.head())


In [None]:
from nltk.corpus import stopwords

# Download the NLTK resources this cell relies on:
#   * 'stopwords' backs stopwords.words('english') in clean_text;
#   * 'punkt' backs word_tokenize. It is requested here as well because the only
#     other nltk.download('punkt') call in this notebook sits inside the
#     disabled (string-quoted) scraper cell, so a fresh environment would
#     otherwise never fetch it.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead -- confirm
# against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

# Tokenize and remove stopwords:
def clean_text(text):
    """Lower-case `text`, keeping only alphanumeric, non-stopword tokens.

    Parameters
    ----------
    text : str
        Raw story text. Any non-string value (for example a missing value in
        the DataFrame column this function is applied to) is treated as
        "no text".

    Returns
    -------
    str
        The surviving tokens, lower-cased and joined by single spaces, or an
        empty string when `text` is not a string.
    """
    # Guard before tokenizing: the tokenizer and the later `" ".join(...)` over
    # the cleaned column both expect strings, so a missing value maps to "".
    if not isinstance(text, str):
        return ""

    stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # isalnum() is tested on the original token; it rejects punctuation and
        # any token that contains a hyphen or an apostrophe.
        if token.isalnum() and lowered not in stop_words:
            kept_tokens.append(lowered)
    return " ".join(kept_tokens)

# Apply to text column in DF:
# Adds a 'Cleaned_Text' column holding clean_text's result for every value of
# the 'Text' column.
df['Cleaned_Text'] = df['Text'].apply(clean_text)


In [None]:
from collections import Counter

# Gather every whitespace-separated token of every cleaned story into one list:
all_words = [token for cleaned in df['Cleaned_Text'] for token in cleaned.split()]

# Tally how often each token occurs:
word_freq = Counter()
word_freq.update(all_words)

# Show the 100 highest counts as (word, count) pairs:
print(word_freq.most_common(100))


In [None]:
# NOTE: earlier spaCy-NER version of the entity extraction, kept inside a
# triple-quoted string so that running this cell does not execute it. It
# collected the lower-cased text of entities labelled PERSON or LOC from the
# 'Cleaned_Text' column and printed the 100 most common ones.
'''import spacy
from collections import Counter

# Load spaCy model:
nlp = spacy.load("en_core_web_sm")

# Function to extract entities:
def extract_entities(text):
    doc = nlp(text)
    entities = [ent.text.lower() for ent in doc.ents if ent.label_ in ['PERSON', 'LOC']]
    return entities

# Apply the extraction to the 'Cleaned_Text' column
df['Entities'] = df['Cleaned_Text'].apply(extract_entities)

# Flatten the list of entities and get their frequency count
all_entities = [entity for sublist in df['Entities'] for entity in sublist]
entity_freq = Counter(all_entities)

# Display the most common entities
print(entity_freq.most_common(100))
'''

In [None]:
import spacy
import re
from collections import Counter

# Load spaCy model for Entity Recognition:
# Loads the pipeline named "en_core_web_sm" and binds it to the module-level
# name `nlp`; the model must already be installed for spacy.load to find it.
nlp = spacy.load("en_core_web_sm")

# List of Lovecraftian entities:
# Every name is lower-case because extract_entities searches a lower-cased copy
# of the text. extract_entities also tries each name with a trailing "s", so
# plural forms do not need their own entry here.
lovecraftian_entities = [
    "cthulhu", "yog-sothoth", "nyarlathotep", "azathoth", "hastur", "r'lyeh", "dagon", 
    "shub-niggurath", "the great old ones", "elder gods", "the old ones", "the deep ones", "night gaunts", 
    "cthulhu cult", "the nameless city", "the black stone", "the dreamlands", "fenric", "hecuba", 
    "animus", "tor-gasukk", "moloch", "kai'lizakia", "lloigor", "eidolon", "derleth", "gog", "magog", "to'koth", 
    "karnas'koi", "traguam", "archon", "mi'en kalarash", "kwundaar", "volund", "k'thun", "noth-yidik", "tru'nembra", 
    "tulzscha", "cxaxukluth", "d'endrrah", "ubbo-sathla", "xexanoth", "ycnàgnnisssz", "yhoundeh", "aiueb gnshal", 
    "aletheia", "azhorra-tha", "c'thalpa", "daoloth", "ghroth", "gi-hoveg", "haiogh-yai", "huitloxopetl", "ialdagorth", 
    "kaajh'kaalbh", "kaalut", "lu-kthu", "mh'ithrha", "mlandoth", "mril thorion", "mother of pus", "nhimbaloth", 
    "ngyr-khorath", "nyctelios", "olkoth", "ramasekva", "shabbith-ka", "star mother", "suc'naath", "uvhash", "xa'ligha", 
    "yibb-tstll", "yidhra", "yomag'n'tho"
]

# Indirect references or variations, written as regular-expression patterns.
# The list lives at module level so it is built once rather than on every call
# of extract_entities. Each pattern is lower-case and wrapped in \b word
# boundaries, because extract_entities searches a lower-cased copy of the text.
# ("nameless city" was previously misspelled "namesless city" and therefore
# could never match.)
indirect_references = [
    r"\bdeep ones\b", r"\bcosmic entity\b", r"\bhorrible being\b", r"\bnight gaunts\b", r"\bblack stone\b",
    r"\byog sothoth\b", r"\bnameless city\b", r"\bstrange entity\b", r"\botherworldly creature\b", r"\bdark god\b",
    r"\bhorrible power\b", r"\btimeless one\b"
]

# Function to extract entities
def extract_entities(text):
    """Return the Lovecraftian names and indirect phrases found in `text`.

    Matching is case-insensitive (a lower-cased copy of `text` is searched) and
    purely regex-based; no spaCy pipeline is run.

    Parameters
    ----------
    text : str
        Text of one story. Any non-string value (e.g. a missing value) yields
        an empty list.

    Returns
    -------
    list of str
        * each name from ``lovecraftian_entities`` that occurs as a whole word
          or phrase, optionally followed by a plural "s", listed once and in
          the order of that list; followed by
        * the matched phrase of each ``indirect_references`` pattern that
          occurs, unless a name already in the result contains that phrase
          (so "the deep ones" is not also reported as "deep ones").
    """
    if not isinstance(text, str):
        return []

    # Lower-case once instead of once per pattern.
    lowered = text.lower()
    entities = []

    # Direct mentions; the optional trailing "s" covers the plural form.
    for entity in lovecraftian_entities:
        if re.search(r'\b' + re.escape(entity) + r's?\b', lowered):
            entities.append(entity)

    # Indirect references are checked once per text, outside the loop above.
    # Nesting this check inside the entity loop would credit *every* entity
    # whenever any indirect phrase matched, inflating all counts.
    for pattern in indirect_references:
        match = re.search(pattern, lowered)
        if match is None:
            continue
        phrase = match.group(0)
        if not any(phrase in found for found in entities):
            entities.append(phrase)

    return entities

# Apply the extraction to the raw 'Text' column rather than 'Cleaned_Text'.
# clean_text removes English stopwords and every token that is not purely
# alphanumeric, so in the cleaned text the names that start with "the"
# ("the deep ones", "the nameless city", ...) and the hyphenated or apostrophised
# names ("yog-sothoth", "r'lyeh", ...) can never match. extract_entities
# lower-cases its input itself. Missing texts are replaced by '' first.
df['Entities'] = df['Text'].fillna('').apply(extract_entities)

# Flatten the per-row lists and count how often each name occurs in the result
all_entities = [entity for sublist in df['Entities'] for entity in sublist]
entity_freq = Counter(all_entities)

# Display the most common entities
print(entity_freq.most_common(100))

# Counts for the tracked names, including those never found: indexing a Counter
# with an unseen key yields 0, which gives the same numbers as calling
# all_entities.count() once per name without rescanning the list each time.
specific_entity_freq = {entity: entity_freq[entity] for entity in lovecraftian_entities}

# Display the count of these specific entities
print(specific_entity_freq)


In [None]:
from collections import Counter

# Tally how many times each entity name occurs over every row of 'Entities'
all_entities = [name for row_entities in df['Entities'] for name in row_entities]
entity_mentions = Counter()
entity_mentions.update(all_entities)

# Materialise the tally as (entity, mentions) tuples for the categoriser below
entities = list(entity_mentions.items())

# Function to filter and categorize entities:
def filter_lovecraft_entities(entities, exclude_humans=True):
    """Group ``(entity, mentions)`` pairs under fixed category labels.

    Parameters
    ----------
    entities : iterable of (str, int)
        Entity name paired with its mention count.
    exclude_humans : bool, optional
        Accepted but never read by this function.

    Returns
    -------
    dict
        Maps each of the five category labels to a list of the
        ``(entity, mentions)`` pairs assigned to it, in input order. A name
        listed under several categories goes to the first one in the order
        below; a name listed nowhere goes to 'Mythos-Related Concepts'.
    """
    fallback_label = 'Mythos-Related Concepts'

    # Known classifications, in priority order (earlier label wins).
    known_members = (
        ('Cthulhu Mythos', (
            'cthulhu', 'great cthulhu', 'nyarlathotep', 'azathoth', 'shub-niggurath',
            'dagon', 'yog-sothoth', 'hastur', 'ubbo-sathla', 'ghroth', 'ycnàgnnisssz',
        )),
        ('Locations & Settings', (
            'arkham', 'miskatonic', 'innsmouth', 'dunwich', "r'lyeh", 'the dreamlands',
            'the nameless city',
        )),
        ('Cosmic Entities', (
            'elder gods', 'the old ones', 'the great old ones', 'night gaunts', 'the deep ones',
            'colour out of space', 'yog sothoth', 'tulzscha', 'cxaxukluth', 'yhoundeh',
            'aiueb gnshal', 'aletheia', 'azhorra-tha', "c'thalpa", 'daoloth', 'ghroth', 'gi-hoveg',
            'haiogh-yai', 'huitloxopetl', 'ialdagorth', "kaajh'kaalbh", 'kaalut', 'lu-kthu',
            "mh'ithrha", 'mlandoth', 'mril thorion', 'mother of pus', 'nhimbaloth',
            'ngyr-khorath', 'nyctelios', 'olkoth', 'ramasekva', 'shabbith-ka', 'star mother',
            "suc'naath", 'uvhash', "xa'ligha", 'yibb-tstll', 'yidhra', "yomag'n'tho",
        )),
        ('Occult Entities', (
            'toymakers', 'guardians of time', 'the great intelligence', 'moloch',
            'hecuba', 'animus', 'archon', 'kwundaar', 'volund', 'noth-yidik', "tru'nembra",
        )),
    )

    # One lookup table: name -> label. setdefault keeps the first (highest
    # priority) label for a name that appears under more than one category.
    label_for = {}
    for label, members in known_members:
        for member in members:
            label_for.setdefault(member, label)

    # Result buckets, created in the fixed label order with the fallback last.
    grouped = {label: [] for label, _ in known_members}
    grouped[fallback_label] = []

    for name, count in entities:
        grouped[label_for.get(name, fallback_label)].append((name, count))

    return grouped

# Categorise the (entity, mentions) pairs:
filtered_entities = filter_lovecraft_entities(entities)

# Print one header line per category, then one "name: count" line per entity:
for category in filtered_entities:
    entities_list = filtered_entities[category]
    print(f"--- {category} ---")
    for entity, mentions in entities_list:
        print(f"{entity}: {mentions}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Tally entity occurrences across the whole 'Entities' column:
all_entities = [name for row_entities in df['Entities'] for name in row_entities]
entity_mentions = Counter()
entity_mentions.update(all_entities)

# Keep the ten most frequent entities as a two-column DataFrame:
entity_df = pd.DataFrame(entity_mentions.most_common(10), columns=['Entity', 'Frequency'])

# Horizontal bar chart of those ten entities, drawn on an explicit Axes:
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=entity_df, x='Frequency', y='Entity', hue='Entity', palette='viridis', ax=ax)
ax.set_title('Top 10 Lovecraftian Entities')
ax.set_xlabel('Frequency of Mentions')
ax.set_ylabel('Entity')
plt.show()
