In [None]:
import requests
import json
import pandas as pd
import re
import time
import os
import random
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import csv

# Set up necessary directories and configurations:
os.makedirs('data', exist_ok=True)
session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))
nltk.download('punkt')

# Clean title by standardizing the "By H. P. Lovecraft" text:
def clean_title(title):
    author_text = "By H. P. Lovecraft"
    title = re.sub(rf"({author_text}\s*)+", author_text, title).strip()
    if title.endswith(author_text) and not title.endswith(" " + author_text):
        title = title.replace(author_text, " " + author_text)
    return title

# --- Step 1: Scraping Lovecraft Works ---

def scrape_lovecraft_content(content_type):
    base_url = "https://www.hplovecraft.com/writings/texts/"
    response = session.get(base_url)
    
    if response.status_code != 200:
        print(f"Failed to access the base URL: {response.status_code}")
        return
    
    soup = BeautifulSoup(response.content, 'html.parser')
    content_links = [
        f"{base_url}{link['href']}"
        for link in soup.find_all('a', href=True)
        if link['href'].startswith(f'{content_type}/') and not link['href'].startswith('#')
    ]

    csv_filename = f'data/lovecraft_{content_type}.csv'
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Content Type', 'Title', 'Text'])

        for content_url in content_links:
            time.sleep(random.uniform(1, 3))
            try:
                content_response = session.get(content_url, headers={'User-Agent': 'Mozilla/5.0'})
                if content_response.status_code == 200:
                    content_soup = BeautifulSoup(content_response.content, 'html.parser')
                    title_tag = content_soup.find('font', size="+2")
                    author_tag = content_soup.find('font', size="+1")
                    text_div = content_soup.find('div', align='justify')

                    if title_tag and text_div:
                        title = f"{title_tag.get_text(strip=True)} by {author_tag.get_text(strip=True)}"
                        title = clean_title(title)  # Clean the title text
                        csvwriter.writerow([content_type, title, text_div.get_text(strip=True)])
                        print(f'Scraped: {title}')
                    else:
                        print(f'Title or text not found for {content_url}')
                else:
                    print(f'Failed to scrape {content_url}: {content_response.status_code}')
            except Exception as e:
                print(f'Error scraping {content_url}: {e}')

# Scrape all specified content types:
for content in ['fiction', 'poetry', 'essays', 'letters']:
    scrape_lovecraft_content(content)

# --- Step 2: API Data Collection ---

def fetch_and_save_json(api_url, filename):
    response = session.get(api_url)
    if response.status_code == 200:
        with open(f'data/{filename}.json', 'w', encoding='utf-8') as file:
            json.dump(response.json(), file, ensure_ascii=False, indent=4)
        print(f"{filename.capitalize()} data saved successfully!")
    else:
        print(f"Failed to retrieve {filename} data: {response.status_code}")

api_categories = {
    "creatures": "https://lovecraftapirest.fly.dev/api/creatures",
    "races": "https://lovecraftapirest.fly.dev/api/races",
    "outer_gods": "https://lovecraftapirest.fly.dev/api/categories/outer-gods",
    "great_old_ones": "https://lovecraftapirest.fly.dev/api/categories/great-old-ones",
    "lesser_old_ones": "https://lovecraftapirest.fly.dev/api/categories/lesser-old-ones"
}

for name, url in api_categories.items():
    fetch_and_save_json(url, name)


In [None]:
import json
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

# --- Step 3: Load and Clean Data ---
# Load JSON files containing Lovecraftian entities:
def load_json_data(file_path):
    """Load JSON data from a given file path."""
    with open(file_path) as f:
        return json.load(f)

creatures = load_json_data('data/creatures.json')
great_old_ones = load_json_data('data/great_old_ones.json')
lesser_old_ones = load_json_data('data/lesser_old_ones.json')
outer_gods = load_json_data('data/outer_gods.json')
races = load_json_data('data/races.json')

# Load Lovecraft fiction CSV data:
lovecraft_data = pd.read_csv('data/lovecraft_fiction.csv')

# --- Step 4: Combine Entity Data ---
# Create list of all entities with their type for easy tracking:
def create_entity_list(creatures, great_old_ones, lesser_old_ones, outer_gods, races):
    """Combine all Lovecraftian entities into a single list with their types."""
    all_entities = []
    all_entities.extend([{'name': creature['name'], 'type': 'Creature'} for creature in creatures])
    all_entities.extend([{'name': old_one['name'], 'type': 'Great Old One'} for old_one in great_old_ones])
    all_entities.extend([{'name': old_one['name'], 'type': 'Lesser Old One'} for old_one in lesser_old_ones])
    all_entities.extend([{'name': old_one['name'], 'type': 'Outer God'} for old_one in outer_gods])
    all_entities.extend([{'name': race['race'], 'type': 'Race'} for race in races])
    return all_entities

all_entities = create_entity_list(creatures, great_old_ones, lesser_old_ones, outer_gods, races)

# --- Step 5: Clean Entity Names ---
# Normalize the entity names and merge duplicates:
def clean_name(name):
    """Clean and standardize entity names to merge duplicates."""
    name_corrections = {
        'ghoul': 'ghoul',
        'dark young': 'dark young of shub-niggurath',
        'deep one': 'deep one',
        'dimensional shambler': 'dimensional shambler',
        'elder thing': 'elder thing',
        'flying polyp': 'flying polyp',
        'ghast': 'ghast',
        'gug': 'gug',
        'hound of tindalos': 'hound of tindalos',
        'moon-beast': 'moon-beast',
        'night-gaunt': 'night-gaunt',
        'shoggoth': 'shoggoth',
        'spider of leng': 'spider of leng',
        'star spawn of cthulhu': 'star spawn of cthulhu',
        'tcho-tcho': 'tcho-tcho'
    }
    for singular, corrected in name_corrections.items():
        if singular in name:
            return corrected
    return name

# Apply cleaning function to entity names:
all_names = [clean_name(entity['name'].lower()) for entity in all_entities]

# Remove unwanted terms from the list:
unwanted_terms = ['han', 'ghoul', 'darkness']
all_names = [name for name in all_names if name not in unwanted_terms]

# --- Step 6: Filter Texts by Entities ---
def filter_texts(data_frame, names):
    """Filter Lovecraft texts based on a list of entity names."""
    return [text for text in data_frame['Text'] if any(name in text.lower() for name in names)]

filtered_texts = filter_texts(lovecraft_data, all_names)

# --- Step 7: Count Entity Occurrences ---
# Count how often each entity appears in the filtered texts:
name_counts = Counter()
for text in filtered_texts:
    for name in all_names:
        name_counts[name] += text.lower().count(name)

# Create a DataFrame to store name counts:
name_counts_df = pd.DataFrame(name_counts.items(), columns=['Name', 'Count'])

# Add 'Type' column to identify each entity's type:
name_counts_df['Type'] = name_counts_df['Name'].apply(
    lambda name: next((entity['type'] for entity in all_entities if entity['name'].lower() == name), 'Unknown')
)

# Save the name counts and types to a CSV file:
name_counts_df.to_csv('data/lovecraft_name_counts.csv', index=False)

# --- Step 8: Basic Visualization ---
def plot_top_entities(counts_df, top_n=10, exclude=None):
    """Plot a bar chart of the top N entities by occurrence count, excluding specified terms."""
    exclude = exclude or []
    filtered_df = counts_df[~counts_df['Name'].isin(exclude)]
    top_entities = filtered_df.nlargest(top_n, 'Count')

    plt.figure(figsize=(12, 6))
    plt.bar(top_entities['Name'], top_entities['Count'], color='purple')
    plt.title('Top Lovecraftian Entities by Count')
    plt.xlabel('Entity Names')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

# Visualize the top 20 entities, excluding 'darkness':
plot_top_entities(name_counts_df, top_n=20, exclude=['darkness'])


In [4]:
import pandas as pd

# --- Step 1: Load Data ---
def load_name_counts(file_path):
    """Load Lovecraft name counts from a CSV file."""
    return pd.read_csv(file_path)

name_counts_df = load_name_counts('data/lovecraft_name_counts.csv')

# --- Step 2: Show Most and Least Common Entities ---
def show_top_and_least_common_entities(counts_df, top_n=10):
    """
    Display the top and least common entities based on their occurrence counts.
    
    Args:
        counts_df (DataFrame): DataFrame containing entity name counts.
        top_n (int): Number of top and least common entities to display.
    """
    # Top N most common entities:
    top_entities = counts_df.sort_values(by='Count', ascending=False).head(top_n)
    print("Top Entities:")
    print(top_entities)
    
    # Top N least common entities:
    least_common_entities = counts_df.sort_values(by='Count', ascending=True).head(top_n)
    print("\nLeast Common Entities:")
    print(least_common_entities)

# Show the top 10 and least common 10 entities:
show_top_and_least_common_entities(name_counts_df, top_n=10)


Top Entities:
             Name  Count      Type
46            iod    296  Creature
99            ubb    274  Creature
29          ghast    256  Creature
61    night-gaunt    140  Creature
15        cthulhu     98  Creature
65   nyarlathotep     94  Creature
39            gug     80  Creature
30    ghatanothoa     68  Creature
115         zoogs     64  Creature
112   yog-sothoth     62  Creature

Least Common Entities:
               Name  Count      Type
0    'umr at-tawill      0  Creature
1            abhoth      0  Creature
2      apocolothoth      0  Creature
3     atlach-natcha      0  Creature
5           basatan      0  Creature
7           byakhee      0  Creature
9   cat from saturn      0  Creature
8            byatis      0  Creature
14          cthugha      0  Creature
12        chthonian      0  Creature


In [5]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback

# --- Step 1: Load and Preprocess Data ---
def load_and_prepare_data(file_path, seq_length=5):
    """
    Load the dataset, clean and tokenize names, and prepare input-output sequences.
    
    Args:
        file_path (str): Path to the CSV file containing entity names.
        seq_length (int): Length of character sequences used to predict the next character.
        
    Returns:
        X (np.array): Input data for the LSTM model.
        y (np.array): One-hot encoded target labels.
        char_to_idx (dict): Mapping of characters to indices.
        idx_to_char (dict): Reverse mapping of indices to characters.
    """
    # Load CSV and extract 'Name' column:
    df = pd.read_csv(file_path)
    names = df['Name'].tolist()

    # Clean names by removing spaces and prepare vocab:
    names = [name.replace(" ", "") for name in names]
    all_chars = ''.join(names)
    vocab = sorted(set(all_chars))
    
    char_to_idx = {char: idx for idx, char in enumerate(vocab)}
    idx_to_char = {idx: char for char, idx in char_to_idx.items()}

    # Generate input-output sequences:
    X, y = [], []
    for name in names:
        for i in range(len(name) - seq_length):
            X.append([char_to_idx[char] for char in name[i:i + seq_length]])
            y.append(char_to_idx[name[i + seq_length]])

    # Convert to numpy arrays and normalize:
    X = np.array(X).reshape((len(X), seq_length, 1)) / len(vocab)
    y = tf.keras.utils.to_categorical(np.array(y), num_classes=len(vocab))
    
    return X, y, char_to_idx, idx_to_char

# Load and prepare the data:
X, y, char_to_idx, idx_to_char = load_and_prepare_data('data/lovecraft_name_counts.csv')

# --- Step 2: Build and Compile LSTM Model ---
def build_lstm_model(input_shape, vocab_size, lstm_units=128, dropout_rate=0.2):
    """
    Build and compile an LSTM model for character-level text generation.
    
    Args:
        input_shape (tuple): Shape of the input data.
        vocab_size (int): Size of the vocabulary (number of unique characters).
        lstm_units (int): Number of units in the LSTM layers.
        dropout_rate (float): Dropout rate to prevent overfitting.
        
    Returns:
        model (tf.keras.Model): Compiled LSTM model.
    """
    model = Sequential([
        LSTM(lstm_units, input_shape=input_shape, return_sequences=True),
        Dropout(dropout_rate),
        LSTM(lstm_units),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model

# Build model:
model = build_lstm_model(X.shape[1:], len(char_to_idx))

# --- Step 3: Define Custom Callback ---
class HighestAccuracyCallback(Callback):
    """
    Custom callback to track and display the highest accuracy during training.
    """
    def __init__(self):
        super().__init__()
        self.highest_accuracy = 0.0
    
    def on_epoch_end(self, epoch, logs=None):
        current_accuracy = logs.get("accuracy")
        if current_accuracy and current_accuracy > self.highest_accuracy:
            self.highest_accuracy = current_accuracy
    
    def on_train_end(self, logs=None):
        print(f"Highest accuracy achieved: {self.highest_accuracy:.4f}")

# Instantiate callback:
highest_accuracy_callback = HighestAccuracyCallback()

# --- Step 4: Train Model ---
model.fit(X, y, epochs=1000, batch_size=64, callbacks=[highest_accuracy_callback])

# --- Step 5: Generate Entity Names ---
def generate_entity_name(length=10, temperature=1.0):
    """
    Generate a random entity name using the trained LSTM model with a temperature sampling method.
    
    Args:
        length (int): Length of the generated name.
        temperature (float): Temperature parameter to adjust randomness in character sampling.
        
    Returns:
        generated_name (str): The generated entity name.
    """
    invalid_chars = ["'", "-", " "]  # Characters that should not appear at the start or end.

    # Initialize the seed with a random character:
    start_idx = random.randint(0, len(char_to_idx) - 1)
    seed = idx_to_char[start_idx]
    
    # Ensure the name doesn't start with an invalid character:
    while seed in invalid_chars:
        start_idx = random.randint(0, len(char_to_idx) - 1)
        seed = idx_to_char[start_idx]

    encoded_seed = [start_idx]  # Start seed as an index.
    generated_name = seed  # Initialize the generated name with the seed character.
    
    # Fill seed if it's too short:
    while len(encoded_seed) < 5:  # Ensure seed length is at least seq_length:
        encoded_seed.append(random.randint(0, len(char_to_idx) - 1))

    # Generate characters:
    for _ in range(length):
        input_seq = np.array(encoded_seed[-5:]).reshape(1, 5, 1) / len(char_to_idx)
        predicted_prob = model.predict(input_seq, verbose=0)[0]

        # Apply temperature to predictions:
        predicted_prob = np.log(predicted_prob + 1e-7) / temperature
        predicted_prob = np.exp(predicted_prob) / np.sum(np.exp(predicted_prob))

        predicted_char_idx = np.random.choice(len(predicted_prob), p=predicted_prob)
        predicted_char = idx_to_char[predicted_char_idx]
        
        generated_name += predicted_char
        encoded_seed.append(predicted_char_idx)

    # Post-process to remove consecutive duplicates:
    def remove_consecutive_duplicates(name):
        result = [name[0]]
        for char in name[1:]:
            if result[-1] != char:
                result.append(char)
        return ''.join(result)

    generated_name = remove_consecutive_duplicates(generated_name).strip()
    generated_name = generated_name.capitalize()

    # Validate apostrophe placement:
    if "'" in generated_name:
        parts = generated_name.split("'")
        if len(parts) > 2 or parts[0] == "" or parts[-1] == "":
            return generate_entity_name(length, temperature)
        parts[1] = parts[1].lower()
        generated_name = "'".join(parts)

    # Validate that name doesn't end with invalid characters:
    if generated_name[-1] in invalid_chars:
        return generate_entity_name(length, temperature)

    return generated_name

# Generate and display a sample name:
new_entity = generate_entity_name(length=10, temperature=0.7)
print(f"Sample Name: {new_entity}")

# Save trained model:
model.save('final_model.keras')



Epoch 1/1000


  super().__init__(**kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0693 - loss: 3.3095
Epoch 2/1000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0799 - loss: 3.1258 
Epoch 3/1000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0863 - loss: 2.9606 
Epoch 4/1000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1071 - loss: 2.8961 
Epoch 5/1000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0988 - loss: 2.9295 
Epoch 6/1000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0851 - loss: 2.9214 
Epoch 7/1000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1087 - loss: 2.9273 
Epoch 8/1000
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1154 - loss: 2.8880 
Epoch 9/1000
[1m12/12[0m [32m━━━━━━━━━━━━━━

In [7]:
# Define possible attributes for to-be generated entities:
races = ["Elder God", "Shoggoth", "Night Gaunt", "Deep One", "Great Old One", "Cosmic Horror"]
powers = ["Mighty", "Weak", "Ancient", "Vast", "Primordial", "Frail"]
domains = ["Cosmic Abyss", "The Void", "Earth's Oceans", "Dream Realms", "The Stars", "Dark Cosmos"]
physical_traits = ["Amorphous", "Tentacled", "Winged", "Eyeless", "Eyes of Madness", "Unseen"]
alignments = ["Malevolent", "Neutral", "Indifferent", "Benevolent"]

# --- Step 1: Generate Random Set Of Attributes ---
def generate_entity_attributes():
    race = random.choice(races)  # Race.
    power = random.choice(powers)  # Power.
    domain = random.choice(domains)  # Domain.
    traits = random.sample(physical_traits, 2)  # Phyiscal traits x 2.
    alignment = random.choice(alignments)  # Alignment.
    
    return {
        "race": race,
        "power": power,
        "domain": domain,
        "physical_traits": traits,
        "alignment": alignment
    }

# --- Step 2: Generate Entity Name With Random Attributes ---
def generate_entity_name_with_attributes(length=10, temperature=1.0):
    generated_name = generate_entity_name(length=length, temperature=temperature)
    attributes = generate_entity_attributes()
    
    # Bundle name and attributes into a dictionary:
    entity_details = {
        "name": generated_name,
        "race": attributes["race"],
        "power": attributes["power"],
        "domain": attributes["domain"],
        "physical_traits": ', '.join(attributes["physical_traits"]),  # Convert traits list to a string
        "alignment": attributes["alignment"]
    }
    
    return entity_details

# --- Step 3: Generate Entity And Print Its Details ---
new_entity = generate_entity_name_with_attributes(length=10, temperature=1)
print(f"New Creature: {new_entity['name']}")
print(f"Race: {new_entity['race']}")
print(f"Power: {new_entity['power']}")
print(f"Domain: {new_entity['domain']}")
print(f"Physical Traits: {new_entity['physical_traits']}")
print(f"Alignment: {new_entity['alignment']}")


New Creature: Khbkoen'tat
Race: Elder God
Power: Primordial
Domain: Earth's Oceans
Physical Traits: Unseen, Amorphous
Alignment: Benevolent
