In [20]:
# run this to shorten the data import from the files
import os
cwd = os.path.dirname(os.getcwd())+'/'
path_data = os.path.join(os.path.dirname(os.getcwd()), 'datasets/')


In [2]:
# exercise 01

"""
Word frequency analysis

Congratulations! You've just joined PyBooks. PyBooks is developing a book recommendation system and they want to find patterns and trends in text to improve their recommendations.

To begin, you'll want to understand the frequency of words in a given text and remove any rare words.

Note that typical real-world datasets will be larger than this example.
"""

# Instructions

"""

    Import the tokenization function from torchtext and frequency distribution function from the nltk library.

    Initialize the tokenizer for English and tokenize the given text.

    Calculate the frequency distribution of the tokens and remove rare words using list comprehension.

"""

# solution

# Import the necessary functions
from torchtext.data.utils import get_tokenizer
from nltk.probability import FreqDist

text = "In the city of Dataville, a data analyst named Alex explores hidden insights within vast data. With determination, Alex uncovers patterns, cleanses the data, and unlocks innovation. Join this adventure to unleash the power of data-driven decisions."

# Initialize the tokenizer and tokenize the text
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(text)

threshold = 1
# Remove rare words and print common tokens
freq_dist = FreqDist(tokens)
common_tokens = [token for token in tokens if freq_dist[token] > threshold]
print(common_tokens)

#----------------------------------#

# Conclusion

"""
Congratulations! You have removed rare words from your text. It looks like data and alex are pretty common. In practice, you'll work with larger text and may find more meaningful words.
"""

['the', 'of', ',', 'data', 'alex', 'data', '.', ',', 'alex', ',', 'the', 'data', ',', '.', 'the', 'of', '.']


"\nCongratulations! You have removed rare words from your text. It looks like data and alex are pretty common. In practice, you'll work with larger text and may find more meaningful words.\n"

In [3]:
text = 'The moor is very sparsely inhabited, and those who live near each other are thrown very much together. For this reason I saw a good deal of Sir Charles Baskerville. With the exception of Mr. Frankland, of Lafter Hall, and Mr. Stapleton, the naturalist, there are no other men of education within many miles. Sir Charles was a retiring man, but the chance of his illness brought us together, and a community of interests in science kept us so. He had brought back much scientific information from South Africa, and many a charming evening we have spent together discussing the comparative anatomy of the Bushman and the Hottentot.'

In [9]:
import nltk
nltk.download('stopwords')

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/nero/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# exercise 02

"""
Preprocessing text

Building a recommendation system, or any model, requires text to be preprocessed first.

A block of text from Sherlock Holmes is loaded here. Preprocess this text using the various techniques presented in the video to prepare it for further analysis.

The text variable is an excerpt from The Hound of the Baskervilles by Arther Conan Doyle.

The following packages and functions have been loaded for you: nltk, torch, get_tokenizer, PorterStemmer, stopwords.
"""

# Instructions

"""

    Initialize the tokenizer with "basic_english".
    Tokenize the text using the tokenizer.
---

    Create a set of English stopwords and use list comprehension to filter these stop_words out of the text, making sure to ignore capitalization.
---

    Perform stemming on the filtered_tokens using the appropriate nltk function.

"""

# solution

# Initialize and tokenize the text
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(text)

# Remove any stopwords
stop_words = set(stopwords.words("english"))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

# Perform stemming on the filtered tokens
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
print(stemmed_tokens)

#----------------------------------#

# Conclusion

"""
Bravo, Sherlock! You've cracked the case of the convoluted text. Now, you have a clean, processed list of words that you can use to analyze the text. You're officially a master in text preprocessing!
"""

['moor', 'spars', 'inhabit', ',', 'live', 'near', 'thrown', 'much', 'togeth', '.', 'reason', 'saw', 'good', 'deal', 'sir', 'charl', 'baskervil', '.', 'except', 'mr', '.', 'frankland', ',', 'lafter', 'hall', ',', 'mr', '.', 'stapleton', ',', 'naturalist', ',', 'men', 'educ', 'within', 'mani', 'mile', '.', 'sir', 'charl', 'retir', 'man', ',', 'chanc', 'ill', 'brought', 'us', 'togeth', ',', 'commun', 'interest', 'scienc', 'kept', 'us', '.', 'brought', 'back', 'much', 'scientif', 'inform', 'south', 'africa', ',', 'mani', 'charm', 'even', 'spent', 'togeth', 'discuss', 'compar', 'anatomi', 'bushman', 'hottentot', '.']


"\nBravo, Sherlock! You've cracked the case of the convoluted text. Now, you have a clean, processed list of words that you can use to analyze the text. You're officially a master in text preprocessing!\n"

In [12]:
# exercise 03

"""
One-hot encoded book titles

PyBooks wants to catalog and analyze the book genres in its library. Apply one-hot encoding to a list of book genres to make them machine-readable.

torch has been imported for you.
"""

# Instructions

"""

    Define the size of the vocabulary and save to vocab_size.

    Create one-hot vectors using the appropriate torch technique and vocab_size.

    Create a dictionary mapping genres to their corresponding one-hot vectors using dictionary comprehension; the dictionary keys should be the genre.

"""

# solution
import torch

genres = ['Fiction','Non-fiction','Biography', 'Children','Mystery']

# Define the size of the vocabulary
vocab_size = len(genres)

# Create one-hot vectors
one_hot_vectors = torch.eye(vocab_size)

# Create a dictionary mapping genres to their one-hot vectors
one_hot_dict = {genre: one_hot_vectors[i] for i, genre in enumerate(genres)}

for genre, vector in one_hot_dict.items():
    print(f'{genre}: {vector.numpy()}')

#----------------------------------#

# Conclusion

"""
Well done! The output matrix represents the presence of genres in a binary format. This type of encoding allows machines to better understand and use the genre data for various tasks, such as predicting book popularity or making book recommendations.
"""

Fiction: [1. 0. 0. 0. 0.]
Non-fiction: [0. 1. 0. 0. 0.]
Biography: [0. 0. 1. 0. 0.]
Children: [0. 0. 0. 1. 0.]
Mystery: [0. 0. 0. 0. 1.]


'\nWell done! The output matrix represents the presence of genres in a binary format. This type of encoding allows machines to better understand and use the genre data for various tasks, such as predicting book popularity or making book recommendations.\n'

In [13]:
import torchtext

In [16]:
# exercise 04

"""
Bag-of-words for book titles

PyBooks now has a list of book titles that need to be encoded for further analysis. The data team believes the Bag of Words (BoW) model could be the best approach.

The following packages have been imported for you: torch, torchtext.
"""

# Instructions

"""

    Import the CountVectorizer class for implementing bag-of-words.

    Initialize an object of the class you imported, then use this object to transform the titles into a matrix representation.

    Extract and display the first five feature names and encoded titles with the get_feature_names_out() method.


"""

# solution

# Import from sklearn
from sklearn.feature_extraction.text import CountVectorizer

titles = ['The Great Gatsby','To Kill a Mockingbird','1984','The Catcher in the Rye','The Hobbit', 'Great Expectations']

# Initialize Bag-of-words with the list of book titles
vectorizer = CountVectorizer()
bow_encoded_titles = vectorizer.fit_transform(titles)

# Extract and print the first five features
print(vectorizer.get_feature_names_out()[:5])
print(bow_encoded_titles.toarray()[0, :5])

#----------------------------------#

# Conclusion

"""
Fantastic work! The output matrix provides a clear picture of the word frequencies in the book titles. By analyzing the output, you can identify the frequency of words like 'catcher' and 'great' in the titles. The word frequency feature vectors can be used later by machine learning algorithms.
"""

['1984' 'catcher' 'expectations' 'gatsby' 'great']
[0 0 0 1 1]


"\nFantastic work! The output matrix provides a clear picture of the word frequencies in the book titles. By analyzing the output, you can identify the frequency of words like 'catcher' and 'great' in the titles. The word frequency feature vectors can be used later by machine learning algorithms.\n"

In [17]:
descriptions = ['A portrait of the Jazz Age in all of its decadence and excess.',
 'A gripping, heart-wrenching, and wholly remarkable tale of coming-of-age in a South poisoned by virulent prejudice.',
 'A startling and haunting vision of the world.',
 'A story of lost innocence.',
 'A timeless adventure story.']

In [18]:
# exercise 05

"""
Applying TF-IDF to book descriptions

PyBooks has collected several book descriptions and wants to identify important words within them using the TF-IDF encoding technique. By doing this, they hope to gain more insights into the unique attributes of each book to help with their book recommendation system.

The following packages have been imported for you: torch, torchtext.
"""

# Instructions

"""

    Import the class from sklearn.feature_extraction.text that converts a collection of raw documents to a matrix of TF-IDF features.

    Instantiate an object of this class, then use this object to encode the descriptions into a TF-IDF matrix of vectors.

    Retrieve and display the first five feature names from the vectorizer and encoded vectors from tfidf_encoded_descriptions.

"""

# solution

# Importing TF-IDF from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF encoding vectorizer
vectorizer = TfidfVectorizer()
tfidf_encoded_descriptions = vectorizer.fit_transform(descriptions)

# Extract and print the first five features
print(vectorizer.get_feature_names_out()[:5])
print(tfidf_encoded_descriptions.toarray()[0, :5])

#----------------------------------#

# Conclusion

"""
Well done! By examining the feature names and their corresponding TF-IDF values, you can uncover significant words that contribute to the uniqueness and relevance of each book. Your team is excited about the insights gained from your analysis. Keep up the great work!
"""

['adventure' 'age' 'all' 'and' 'by']
[0.         0.25943581 0.321564   0.21535516 0.        ]


'\nWell done! By examining the feature names and their corresponding TF-IDF values, you can uncover significant words that contribute to the uniqueness and relevance of each book. Your team is excited about the insights gained from your analysis. Keep up the great work!\n'

In [23]:
with open(path_data+'100-0.txt','r') as f:
    text_data = f.read()
    f.close()

print(text_data[:50])
shakespeare = text_data.split('.')

The Project Gutenberg eBook of The Complete Works


In [25]:
# exercise 06

"""
Shakespearean language preprocessing pipeline

Over at PyBooks, the team wants to transform a vast library of Shakespearean text data for further analysis. The most efficient way to do this is with a text processing pipeline, starting with the preprocessing steps.

The following have been loaded for you: torch, nltk, stopwords, PorterStemmer, get_tokenizer.

The Shakespearean text data is saved as shakespeare and the sentences have already been extracted.
"""

# Instructions

"""

    Create a list of unique English stopwords, saving to them to stop_words.
---

    Initialize the basic_english tokenizer from torch, and PorterStemmer from nltk.
---

    Complete the preprocess_sentences() function to enable tokenization, stop word removal, and stemming.

"""

# solution

# Create a list of stopwords
stop_words = set(stopwords.words("english"))

# Initialize the tokenizer and stemmer
tokenizer = get_tokenizer("basic_english")
stemmer = PorterStemmer() 

# Complete the function to preprocess sentences
def preprocess_sentences(sentences):
    processed_sentences = []
    for sentence in sentences:
        sentence = sentence.lower()
        tokens = tokenizer(sentence)
        tokens = [token for token in tokens if token not in stop_words]
        tokens = [stemmer.stem(token) for token in tokens]
        processed_sentences.append(' '.join(tokens))
    return processed_sentences

processed_shakespeare = preprocess_sentences(shakespeare)
print(processed_shakespeare[:5]) 

#----------------------------------#

# Conclusion

"""
Nice job! You have successfully preprocessed the sentences and prepared them for encoding. Now you have a clean and transformed dataset to work with for the next step.
"""

['\ufeffthe project gutenberg ebook complet work william shakespear , william shakespear ebook use anyon anywher unit state part world cost almost restrict whatsoev', 'may copi , give away re-us term project gutenberg licens includ ebook onlin www', 'gutenberg', 'org', 'locat unit state , check law countri locat use ebook']


'\nNice job! You have successfully preprocessed the sentences and prepared them for encoding. Now you have a clean and transformed dataset to work with for the next step.\n'

In [29]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

In [30]:
# exercise 07

"""
Shakespearean language encoder

With the preprocessed Shakespearean text at your fingertips, you now need to encode it into a numerical representation. You will need to define the encoding steps before putting the pipeline together. To better handle large amounts of data and efficiently perform the encoding, you will use PyTorch's Dataset and DataLoader for batching and shuffling the data.

The following has been loaded for you: torch, nltk, stopwords, PorterStemmer, get_tokenizer, CountVectorizer, Dataset, and DataLoader.

The processed_shakespeare from the Shakespearean text is also available to you.
"""

# Instructions

"""

    Define a ShakespeareDataset dataset class and complete the __init__ and __getitem__ methods.
---

    Complete the encode_sentences() function to take in a list of sentences and encode them using the bag-of-words technique from sklearn.
---

    Complete and call the text_processing_pipeline() function by using preprocess_sentences(), encode_sentences(), ShakespeareDataset class, and DataLoader.

    Print the first ten feature names with the get_feature_names_out() method and components of the first item of dataloader.

"""

# solution

# Define your Dataset class
class ShakespeareDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]

# Complete the encoding function
def encode_sentences(sentences):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentences)
    return X.toarray(), vectorizer
    
# Complete the text processing pipeline
def text_processing_pipeline(sentences):
    processed_sentences = preprocess_sentences(sentences)
    encoded_sentences, vectorizer = encode_sentences(processed_sentences)
    dataset = ShakespeareDataset(encoded_sentences)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    return dataloader, vectorizer

dataloader, vectorizer = text_processing_pipeline(processed_shakespeare)

# Print the vectorizer's feature names and the first 10 components of the first item
print(vectorizer.get_feature_names_out()[:10]) 
print(next(iter(dataloader))[0] [:10])

#----------------------------------#

# Conclusion

"""
Congratulations! You have successfully encoded the Shakespearean text data, and made it useful for your publishing company. The first ten feature representations of the first sentence in your batched data provides a numerical representation, enabling analysis and modeling of the Shakespearean language.
"""

['000' '10' '100' '1000' '1004' '1009' '101' '1012' '1016' '102']
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


'\nCongratulations! You have successfully encoded the Shakespearean text data, and made it useful for your publishing company. The first ten feature representations of the first sentence in your batched data provides a numerical representation, enabling analysis and modeling of the Shakespearean language.\n'