# Task 1

In moodle you will find the file ASoIaF.zip. It contains the five books of the “A song of ice
and fire” series in a plain txt-format. Load all files into your console.

We are interested in how the story and its themes develop over time. For this, we will train a
topic model on each book and compare them. For Python-users, you will find a better function
to show the top-words in utils.py.


In [2]:
# Simplifying the code for loading all files into the console

# Paths to the text files for each book
file_paths = [
    "ASoIaF/ASoIaF/001ssb.txt",  # A Game of Thrones
    "ASoIaF/ASoIaF/002ssb.txt",  # A Clash of Kings
    "ASoIaF/ASoIaF/003ssb.txt",  # A Storm of Swords
    "ASoIaF/ASoIaF/004ssb.txt",  # A Feast for Crows
    "ASoIaF/ASoIaF/005ssb.txt"   # A Dance with Dragons
]

# Initialize a list to hold the content of each book
books = []

# Load the content of each book into the list
for file_path in file_paths:
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        books.append(file.read())

# Lengths of each book to confirm they've been loaded
[len(book) for book in books]


[1607894, 1752880, 2273275, 1614153, 2277994]

In [None]:
###

In [3]:
# To clean the texts and split them into chapters, we first need to identify common patterns
# for chapter starts or any unwanted fragments to remove. Given the variability in formatting,
# a common approach is to look for chapter headings, which might be consistently formatted.

# Define a function to clean the text and split it into chapters
def clean_and_split(text):
    # Attempt to identify chapter headings and unwanted fragments
    # Assuming chapters might start with "CHAPTER", "Chapter", or a similar keyword
    # and are possibly followed by a chapter title or number
    chapters = []
    current_chapter = []
    
    for line in text.split('\n'):
        # Check if the line indicates the start of a new chapter
        if line.startswith("CHAPTER") or line.startswith("Chapter") or line.startswith("PROLOGUE"):
            # If there's an existing chapter, save it and start a new one
            if current_chapter:
                chapters.append("\n".join(current_chapter))
                current_chapter = [line]
            else:
                current_chapter.append(line)
        else:
            current_chapter.append(line)
    
    # Add the last chapter to the list, if it exists
    if current_chapter:
        chapters.append("\n".join(current_chapter))
    
    return chapters

# Clean each book's text and split into chapters
cleaned_books = [clean_and_split(book) for book in books]

# Flatten the list of lists into a single list of chapters, preserving chronological order
chapters = [chapter for book in cleaned_books for chapter in book]

# Verify by displaying the number of chapters found in each book and total
chapter_counts_per_book = [len(book) for book in cleaned_books]
total_chapters = len(chapters)

chapter_counts_per_book, total_chapters


([2, 69, 82, 2, 3], 158)

In [None]:
###

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define a function for text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

# Preprocess all chapters
preprocessed_chapters = [preprocess_text(chapter) for chapter in chapters]

# Example: Display the first 100 tokens of the first preprocessed chapter
preprocessed_chapters[0][:100]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['game',
 'throne',
 'book',
 'one',
 'song',
 'ice',
 'fire',
 'george',
 'r',
 'r',
 'martin']

In [5]:
###

In [6]:
# Before training LDA models, we need to ensure the text is properly preprocessed
# Given the limitations in downloading NLTK data, we'll proceed with a simplified preprocessing

# Let's focus on the very first book for this task
first_book = books[0]

# Simplified preprocessing for the first book (lowercasing, removing punctuation)
def preprocess_text_simple(text):
    text = text.lower()  # Lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenization
    return tokens

tokens_first_book = preprocess_text_simple(first_book)

# For LDA, we need to create a document-term matrix
from gensim import corpora, models
import gensim

# Create a Dictionary from the tokens
dictionary = corpora.Dictionary([tokens_first_book])

# Convert to document-term matrix
corpus = [dictionary.doc2bow(tokens_first_book)]

# Set parameters
num_topics = 10
passes = 50

# Train five LDA models with K=10 and 50 iterations on the first book
ldas = [gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes) for _ in range(5)]

# Display the topics from each model to compare
for i, lda in enumerate(ldas, start=1):
    print(f"Model {i} Topics:")
    for topic in lda.print_topics(num_words=5):
        print(topic)
    print("\n")


Model 1 Topics:
(0, '0.000*"the" + 0.000*"and" + 0.000*"a" + 0.000*"to" + 0.000*"of"')
(1, '0.000*"the" + 0.000*"and" + 0.000*"to" + 0.000*"a" + 0.000*"of"')
(2, '0.000*"the" + 0.000*"and" + 0.000*"to" + 0.000*"of" + 0.000*"he"')
(3, '0.000*"the" + 0.000*"and" + 0.000*"a" + 0.000*"he" + 0.000*"his"')
(4, '0.060*"the" + 0.030*"and" + 0.022*"to" + 0.022*"a" + 0.020*"of"')
(5, '0.000*"the" + 0.000*"and" + 0.000*"to" + 0.000*"his" + 0.000*"her"')
(6, '0.000*"the" + 0.000*"and" + 0.000*"a" + 0.000*"his" + 0.000*"her"')
(7, '0.000*"the" + 0.000*"to" + 0.000*"and" + 0.000*"a" + 0.000*"of"')
(8, '0.000*"the" + 0.000*"and" + 0.000*"of" + 0.000*"a" + 0.000*"to"')
(9, '0.000*"the" + 0.000*"and" + 0.000*"a" + 0.000*"to" + 0.000*"his"')


Model 2 Topics:
(0, '0.000*"the" + 0.000*"and" + 0.000*"to" + 0.000*"of" + 0.000*"a"')
(1, '0.000*"the" + 0.000*"to" + 0.000*"he" + 0.000*"a" + 0.000*"and"')
(2, '0.000*"the" + 0.000*"of" + 0.000*"to" + 0.000*"and" + 0.000*"his"')
(3, '0.000*"the" + 0.000*"and" + 

In [7]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import string
import nltk

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Define the advanced preprocessing function
def preprocess_text_advanced(text):
    # Extend the stop words list
    stop_words = set(stopwords.words('english'))
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize
    tokens = [w for w in tokens if w not in stop_words]  # Remove stop words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]  # Lemmatization
    # POS tagging and filtering nouns and adjectives
    tokens = [word for word, tag in pos_tag(tokens) if tag.startswith('NN') or tag.startswith('JJ')]
    return tokens

# Load and preprocess the first book
# Assuming 'first_book_text' contains the raw text of the first book
first_book_text = books[0]  # Replace with the appropriate variable if different
tokens = preprocess_text_advanced(first_book_text)

# Create the Dictionary and Corpus
dictionary = corpora.Dictionary([tokens])
corpus = [dictionary.doc2bow(tokens)]

# Train five LDA models
ldas = [LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=50) for _ in range(5)]

# Compare the topics from these models
for i, lda in enumerate(ldas, start=1):
    print(f"Model {i} Topics:")
    for topic in lda.show_topics(num_words=5):
        print(topic)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Model 1 Topics:
(0, '0.000*"lord" + 0.000*"ser" + 0.000*"hand" + 0.000*"man" + 0.000*"men"')
(1, '0.000*"ser" + 0.000*"lord" + 0.000*"jon" + 0.000*"hand" + 0.000*"eye"')
(2, '0.000*"lord" + 0.000*"hand" + 0.000*"ser" + 0.000*"man" + 0.000*"eye"')
(3, '0.000*"lord" + 0.000*"jon" + 0.000*"eye" + 0.000*"hand" + 0.000*"man"')
(4, '0.000*"lord" + 0.000*"ser" + 0.000*"jon" + 0.000*"man" + 0.000*"hand"')
(5, '0.000*"lord" + 0.000*"ser" + 0.000*"man" + 0.000*"jon" + 0.000*"eye"')
(6, '0.014*"lord" + 0.009*"ser" + 0.008*"hand" + 0.008*"man" + 0.008*"jon"')
(7, '0.000*"lord" + 0.000*"ser" + 0.000*"jon" + 0.000*"hand" + 0.000*"man"')
(8, '0.000*"lord" + 0.000*"ser" + 0.000*"man" + 0.000*"hand" + 0.000*"jon"')
(9, '0.000*"lord" + 0.000*"ser" + 0.000*"man" + 0.000*"tyrion" + 0.000*"hand"')
Model 2 Topics:
(0, '0.000*"lord" + 0.000*"hand" + 0.000*"jon" + 0.000*"man" + 0.000*"eye"')
(1, '0.000*"lord" + 0.000*"ser" + 0.000*"hand" + 0.000*"jon" + 0.000*"tyrion"')
(2, '0.000*"lord" + 0.000*"man" + 0.000

In [9]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

# Assuming the text has been preprocessed with advanced techniques

# Tokenize, stop word removal, lemmatization, and POS filtering for the first book
tokens = preprocess_text_advanced(first_book_text)  # Use the advanced preprocessing function provided

# Create a Dictionary and Corpus for LDA
dictionary = Dictionary([tokens])
corpus = [dictionary.doc2bow(text) for text in [tokens]]

# Model parameters
num_topics = 10
passes = 50
alpha = 'auto'  # Let the model automatically learn the optimal alpha
eta = 'auto'    # Corrected parameter for topic-word distribution

# Train five LDA models with the adjusted parameters
ldas = [LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes, alpha=alpha, eta=eta) for _ in range(5)]

# Compare the topics
for i, lda in enumerate(ldas, start=1):
    print(f"Model {i} Topics:")
    for topic in lda.show_topics(num_words=5):
        print(topic)


Model 1 Topics:
(0, '0.000*"lord" + 0.000*"ser" + 0.000*"man" + 0.000*"hand" + 0.000*"jon"')
(1, '0.000*"ser" + 0.000*"lord" + 0.000*"man" + 0.000*"eye" + 0.000*"hand"')
(2, '0.000*"lord" + 0.000*"jon" + 0.000*"ser" + 0.000*"hand" + 0.000*"man"')
(3, '0.000*"lord" + 0.000*"hand" + 0.000*"ser" + 0.000*"man" + 0.000*"jon"')
(4, '0.014*"lord" + 0.009*"ser" + 0.008*"hand" + 0.008*"man" + 0.008*"jon"')
(5, '0.000*"lord" + 0.000*"hand" + 0.000*"man" + 0.000*"men" + 0.000*"jon"')
(6, '0.000*"lord" + 0.000*"ser" + 0.000*"hand" + 0.000*"jon" + 0.000*"tyrion"')
(7, '0.000*"lord" + 0.000*"ser" + 0.000*"men" + 0.000*"hand" + 0.000*"man"')
(8, '0.000*"lord" + 0.000*"hand" + 0.000*"man" + 0.000*"ser" + 0.000*"tyrion"')
(9, '0.000*"lord" + 0.000*"ser" + 0.000*"man" + 0.000*"hand" + 0.000*"jon"')
Model 2 Topics:
(0, '0.000*"lord" + 0.000*"ser" + 0.000*"man" + 0.000*"jon" + 0.000*"hand"')
(1, '0.000*"lord" + 0.000*"hand" + 0.000*"jon" + 0.000*"tyrion" + 0.000*"man"')
(2, '0.000*"lord" + 0.000*"ser" + 0

In [10]:
####

In [11]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure the necessary NLTK data is available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Assuming each book's text is loaded into a list 'books' where each element is the text of one book

# Advanced preprocessing function (as defined earlier)
def preprocess_text_advanced(text):
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    tokens = [word for word, tag in pos_tag(tokens) if tag.startswith('NN') or tag.startswith('JJ')]
    return tokens

# Train an LDA model for each book
lda_models = []
for book_text in books:
    tokens = preprocess_text_advanced(book_text)
    dictionary = Dictionary([tokens])
    corpus = [dictionary.doc2bow(tokens)]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=50)
    lda_models.append(lda)

# To compare the models, examine the topics generated for each book
for i, lda_model in enumerate(lda_models, start=1):
    print(f"Book {i} Topics:")
    for topic in lda_model.show_topics(num_words=5):
        print(topic)
    print("\n")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Book 1 Topics:
(0, '0.000*"lord" + 0.000*"man" + 0.000*"ser" + 0.000*"hand" + 0.000*"jon"')
(1, '0.000*"lord" + 0.000*"ser" + 0.000*"hand" + 0.000*"man" + 0.000*"page"')
(2, '0.014*"lord" + 0.009*"ser" + 0.008*"hand" + 0.008*"man" + 0.008*"jon"')
(3, '0.000*"lord" + 0.000*"jon" + 0.000*"ser" + 0.000*"man" + 0.000*"hand"')
(4, '0.000*"lord" + 0.000*"man" + 0.000*"hand" + 0.000*"jon" + 0.000*"eye"')
(5, '0.000*"lord" + 0.000*"ser" + 0.000*"hand" + 0.000*"men" + 0.000*"man"')
(6, '0.000*"lord" + 0.000*"ser" + 0.000*"jon" + 0.000*"man" + 0.000*"hand"')
(7, '0.000*"lord" + 0.000*"jon" + 0.000*"hand" + 0.000*"man" + 0.000*"ser"')
(8, '0.000*"lord" + 0.000*"man" + 0.000*"jon" + 0.000*"hand" + 0.000*"ser"')
(9, '0.000*"lord" + 0.000*"man" + 0.000*"hand" + 0.000*"ser" + 0.000*"jon"')


Book 2 Topics:
(0, '0.000*"lord" + 0.000*"man" + 0.000*"ser" + 0.000*"hand" + 0.000*"men"')
(1, '0.014*"lord" + 0.009*"man" + 0.008*"ser" + 0.007*"men" + 0.007*"hand"')
(2, '0.000*"lord" + 0.000*"ser" + 0.000*"ma