In [None]:
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download the corpora and models used throughout this notebook.
nltk.download('gutenberg')
nltk.download('punkt')
# Newer NLTK releases look up the Punkt sentence tokenizer under 'punkt_tab'
# rather than the pickled 'punkt' package; fetching both keeps
# sent_tokenize/word_tokenize working on old and new installs alike.
nltk.download('punkt_tab')
nltk.download('wordnet')
# Kept last so the cell's displayed value is still this call's result.
nltk.download('stopwords')


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Read the full raw text of Jane Austen's "Emma" from the Gutenberg corpus
EMMA_FILE_ID = 'austen-emma.txt'
emma_text = gutenberg.raw(EMMA_FILE_ID)

In [None]:
# Tokenize the raw text at both granularities: sentences and words
sentences = sent_tokenize(emma_text)
words = word_tokenize(emma_text)

# Preview the start of each token list
print("First 5 sentences:\n", sentences[:5])
print("\nFirst 20 words:\n", words[:20])


First 5 sentences:
 ['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.', "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.", 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.', "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.", 'Between _them_ it was more the intimacy\nof sisters.']

First 20 words:
 ['[', 'Emma', 'by', 'J

In [None]:
# Porter stemmer instance reused for every token
stemmer = PorterStemmer()

# Stem each word token, keeping the original token order
stemmed_words = list(map(stemmer.stem, words))
print("\nFirst 20 stemmed words:\n", stemmed_words[:20])



First 20 stemmed words:
 ['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter', 'i', 'emma', 'woodhous', ',', 'handsom', ',', 'clever', ',', 'and', 'rich']


In [None]:
# WordNet lemmatizer instance reused for every token
lemmatizer = WordNetLemmatizer()

# Lemmatize each word token with the lemmatizer's default settings,
# keeping the original token order
lemmatized_words = list(map(lemmatizer.lemmatize, words))
print("\nFirst 20 lemmatized words:\n", lemmatized_words[:20])



First 20 lemmatized words:
 ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich']


In [None]:
# English stop-word list, stored as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Keep only the tokens whose lowercase form is not a stop word
# (punctuation tokens are not stop words, so they pass through)
filtered_words = list(filter(lambda token: token.lower() not in stop_words, words))
print("\nFirst 20 words after stop word removal:\n", filtered_words[:20])



First 20 words after stop word removal:
 ['[', 'Emma', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'CHAPTER', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'rich', ',', 'comfortable', 'home', 'happy']


# Part 3: Feature Extraction (Bag-of-Words and TF-IDF)

In [None]:
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Download necessary resources so this section can run on its own.
# 'punkt_tab' is included because newer NLTK releases load the Punkt
# sentence tokenizer from it instead of the pickled 'punkt' package;
# without it sent_tokenize/word_tokenize raise LookupError there.
for resource in ('gutenberg', 'punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Load the text corpus
emma_text = gutenberg.raw('austen-emma.txt')

# Sentence Tokenization: each sentence is treated as one document below
sentences = sent_tokenize(emma_text)

# Lemmatizer and stop-word set shared by preprocess()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Tokenize, filter, and lemmatize ``text`` into a space-joined string.

    Steps, applied per token in order:
      1. Tokenize ``text`` with ``word_tokenize``.
      2. Drop tokens that are not purely alphanumeric (punctuation, and
         anything containing a hyphen or apostrophe).
      3. Drop tokens whose lowercase form is in ``stop_words``.
      4. Lemmatize the survivors with the module-level ``lemmatizer``.
      5. Drop lemmas whose lowercase form is in ``stop_words``.

    Stop words are removed *before* lemmatizing because the lemmatizer can
    mangle them into strings that are no longer on the stop list (the noun
    lemmatizer turns 'was' into 'wa', for example), which would otherwise
    leak into the features. The second check after lemmatizing keeps the
    earlier behavior of dropping tokens whose lemma is itself a stop word.

    Args:
        text: A string, typically a single sentence.

    Returns:
        The remaining lemmas joined by single spaces; an empty string if
        nothing survives filtering. Original casing is preserved.
    """
    kept_lemmas = []
    for token in word_tokenize(text):
        # Filter on the surface form first, before the lemmatizer can alter it.
        if not token.isalnum() or token.lower() in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Also discard tokens that only become stop words once lemmatized.
        if lemma.lower() not in stop_words:
            kept_lemmas.append(lemma)
    return ' '.join(kept_lemmas)

# Preprocess each sentence; every sentence becomes one document (row)
preprocessed_sentences = [preprocess(sentence) for sentence in sentences]

# Bag-of-Words (BoW) Feature Extraction
# The document-term matrix is almost entirely zeros, so it is wrapped in a
# sparse-backed DataFrame instead of being densified with .toarray(), which
# would allocate a full (sentences x vocabulary) array just to preview rows.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_sentences)
bow_df = pd.DataFrame.sparse.from_spmatrix(
    bow_matrix, columns=vectorizer.get_feature_names_out()
)
print("Bag-of-Words (BoW) Representation:")
print(bow_df.head())

# TF-IDF Feature Extraction (same sparse-backed approach as above)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_sentences)
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix, columns=tfidf_vectorizer.get_feature_names_out()
)
print("\nTF-IDF Representation:")
print(tfidf_df.head())


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Bag-of-Words (BoW) Representation:
   1816  23rd  24th  26th  28th  7th  8th  abbey  abbots  abdy  ...  yielding  \
0     1     0     0     0     0    0    0      0       0     0  ...         0   
1     0     0     0     0     0    0    0      0       0     0  ...         0   
2     0     0     0     0     0    0    0      0       0     0  ...         0   
3     0     0     0     0     0    0    0      0       0     0  ...         0   
4     0     0     0     0     0    0    0      0       0     0  ...         0   

   york  yorkshire  young  younger  youngest  youth  youthful  zeal  zigzag  
0     0          0      0        0         0      0         0     0       0  
1     0          0      0        0         1      0         0     0       0  
2     0          0      0        0         0      0         0     0       0  
3     0          0      0        0         0      0         0     0       0  
4     0          0      0        0         0      0         0     0       0  

[5 rows x