# Marcelo Reis Esteves - AS01
---

In [None]:
import requests

# Load the contents of Shakespeare.txt from the course repository.
url = "https://raw.githubusercontent.com/MarceloReisxz/PUC-Minas/refs/heads/main/Text%20Mining%20and%20Analysis/AS01/Shakespeare.txt"

# The timeout keeps this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() aborts on an HTTP error so that an error
# page is never normalized and tokenized as if it were the corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

## Normalização
---

In [None]:
# Case folding: work on a lowercase copy, leaving the raw `text` untouched.
texto_normalizado = str.lower(text)

# ----------------------------------------

# Accent and diacritic removal
import unicodedata

def remove_accents(input_str):
    """Return ``input_str`` with accents and other combining marks removed.

    The string is decomposed with Unicode NFKD normalization, after which every
    code point that ``unicodedata.combining`` reports as a combining mark is
    dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept = []
    for char in decomposed:
        if unicodedata.combining(char):
            continue  # combining mark (accent, cedilla, ...): skip it
        kept.append(char)
    return "".join(kept)

texto_normalizado = remove_accents(texto_normalizado)

# ----------------------------------------

# Canonicalizing of acronyms, currency, date and hyphenated words.
# Every step below operates on `texto_normalizado`. The currency, date and
# hyphen steps used to be applied to the raw `text` variable, so their results
# never reached the normalized output.
import re

# acronyms: drop each period that is followed by another period, whitespace or
# the end of the text; a period followed by any other character is kept here.
texto_normalizado = re.sub(r'\.(?!([^.\s]|\d))', '', texto_normalizado)

# currency: glue the currency symbol to its amount ("$ 10" -> "$10")
texto_normalizado = re.sub(r'(\$|€|£|r\$)\s*(\d+[\.,]?\d*)', r'\1\2', texto_normalizado)

# date: use "-" as the only separator ("12/05/2020" -> "12-05-2020")
texto_normalizado = re.sub(r'(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{2,4})', r'\1-\2-\3', texto_normalizado)

# hyphenated words: join the two halves ("well-known" -> "wellknown").
# Only hyphens between non-digit word characters are removed, so the dates
# canonicalized above and numeric ranges keep their "-". The previous pattern
# matched only single-character words on both sides of the hyphen and would
# have turned a date such as "1-2-2020" into "12-2020".
texto_normalizado = re.sub(r'(?<=[^\W\d])-(?=[^\W\d])', '', texto_normalizado)

# ----------------------------------------

# Punctuation removal (except currency and date): a punctuation mark is removed
# only when there is no digit immediately before it and no digit immediately
# after it, so marks touching a digit ("3.14", "12-05-2020") survive.
texto_normalizado = re.sub(r'(?<!\d)[.,:;!?\'\"()\-=](?!\d)', '', texto_normalizado)

# ----------------------------------------

# Special characters removal: strip every remaining symbol that is neither a
# word character, whitespace nor a currency sign (plus "_", which \w would
# otherwise keep). The same "not touching a digit" guard is used so numeric
# expressions stay intact. This step used to repeat the punctuation regex
# verbatim, which made it a no-op.
texto_normalizado = re.sub(r'(?<!\d)(?:[^\w\s$€£]|_)(?!\d)', '', texto_normalizado)

# Collapse the runs of spaces left behind by the removals above.
texto_normalizado = re.sub(r' +', ' ', texto_normalizado)
# ----------------------------------------

with open("Shakespeare_Normalized.txt", "w", encoding="utf-8") as arquivo:
    arquivo.write(texto_normalizado)


# Tokenização
---

Whitespace Tokenization

In [None]:
# Split on runs of whitespace and persist the result, one token per line.
texto_white_space = texto_normalizado.split()
with open("Shakespeare_Normalized_Tokenized01.txt", "w", encoding="utf-8") as saida:
    saida.writelines(f"{tok}\n" for tok in texto_white_space)

NLTK: Word Tokenizer

In [None]:
import nltk
import nltk.tokenize as to

# Download the 'punkt' and 'punkt_tab' NLTK packages before tokenizing.
for pacote in ('punkt', 'punkt_tab'):
    nltk.download(pacote)

# NOTE: the variable name is kept exactly as originally spelled, in case other
# cells refer to it.
tokeized_text = to.word_tokenize(texto_normalizado)

# One token per line.
with open("Shakespeare_Normalized_Tokenized02.txt", "w", encoding="utf-8") as saida:
    saida.write("".join(tok + "\n" for tok in tokeized_text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


NLTK: Treebank Word Tokenizer

In [None]:
# Tokenize with NLTK's TreebankWordTokenizer and save one token per line.
treebank = to.TreebankWordTokenizer()
tokenized_text = treebank.tokenize(texto_normalizado)
with open("Shakespeare_Normalized_Tokenized03.txt", "w", encoding="utf-8") as saida:
    saida.writelines(tok + "\n" for tok in tokenized_text)

NLTK: Word Punctuation Tokenizer

In [None]:
# Tokenize with NLTK's WordPunctTokenizer and save one token per line.
word_punct = to.WordPunctTokenizer()
tokenized_text = word_punct.tokenize(texto_normalizado)
with open("Shakespeare_Normalized_Tokenized04.txt", "w", encoding="utf-8") as saida:
    for tok in tokenized_text:
        print(tok, file=saida)

NLTK: Tweet Tokenizer

In [None]:
# Tokenize with NLTK's TweetTokenizer and save one token per line.
tweet_tokenizer = to.TweetTokenizer()
tokenized_text = tweet_tokenizer.tokenize(texto_normalizado)
with open("Shakespeare_Normalized_Tokenized05.txt", "w", encoding="utf-8") as saida:
    saida.write("".join(f"{tok}\n" for tok in tokenized_text))

NLTK: MWE Tokenizer

In [None]:
# MWETokenizer does not split text itself: it takes an already tokenized
# sequence and merges the multi-word expressions registered on it. Handing it
# the raw string made it iterate character by character, so every single
# character was written out as a "token". Split on whitespace first.
# No multi-word expressions are registered here, so the tokens pass through
# unchanged; register some with MWETokenizer([...]) or add_mwe() to merge them.
tokenized_text = to.MWETokenizer().tokenize(texto_normalizado.split())
with open("Shakespeare_Normalized_Tokenized06.txt", "w", encoding="utf-8") as arquivo:
    for token in tokenized_text:
        arquivo.write(token + "\n")

TextBlob Word Tokenizer

In [None]:
!pip install textblob
from textblob import TextBlob

# Wrap the normalized text in a TextBlob and take its word list as the tokens.
blob_object = TextBlob(texto_normalizado)
tokenized_text = blob_object.words

# One token per line.
with open("Shakespeare_Normalized_Tokenized07.txt", "w", encoding="utf-8") as saida:
    saida.writelines(tok + "\n" for tok in tokenized_text)




spaCy Tokenizer

In [None]:
!pip install spacy
import spacy

# A blank English pipeline: tokenizer only, no trained components.
nlp = spacy.blank("en")

# spaCy raises an error for texts longer than nlp.max_length (documented
# default: 1,000,000 characters). That limit protects memory-hungry components
# such as the parser and NER; this pipeline only tokenizes, so raise the limit
# to fit the corpus instead of failing.
if len(texto_normalizado) > nlp.max_length:
    nlp.max_length = len(texto_normalizado)

tokenized_text = nlp(texto_normalizado)

# One token per line; str(token) is the token's text.
with open("Shakespeare_Normalized_Tokenized08.txt", "w", encoding="utf-8") as arquivo:
    for token in tokenized_text:
        arquivo.write(str(token) + "\n")



Gensim Word Tokenizer

In [None]:
!pip install gensim
from gensim.utils import tokenize

# gensim's tokenize() is consumed lazily while the tokens are written out,
# one per line.
tokenized_text = tokenize(texto_normalizado)

with open("Shakespeare_Normalized_Tokenized09.txt", "w", encoding="utf-8") as saida:
    saida.writelines(tok + "\n" for tok in tokenized_text)




Keras Tokenization

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Tokenize with Keras' text_to_word_sequence and save one token per line.
tokenized_text = text_to_word_sequence(texto_normalizado)

with open("Shakespeare_Normalized_Tokenized10.txt", "w", encoding="utf-8") as saida:
    for tok in tokenized_text:
        print(tok, file=saida)

## Stop-words
---

In [None]:
arquivo_path = "Shakespeare_Normalized_Tokenized02.txt"

# Reload the tokens saved by the NLTK word tokenizer (one token per line),
# stripping the trailing newline from each.
with open(arquivo_path, "r", encoding="utf-8") as arquivo:
    tokenized_text = [linha.strip() for linha in arquivo]

# ----------------------------------------

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

sw = stopwords.words('english')

# Membership tests against the list would rescan it for every token of the
# corpus; a set makes each lookup constant time. `sw` itself stays a list.
sw_lookup = frozenset(sw)
nonstopwords_text = [word for word in tokenized_text if word not in sw_lookup]

# ----------------------------------------

# One surviving token per line.
with open("Shakespeare_Normalized_Tokenized_StopWord.txt", "w", encoding="utf-8") as arquivo:
    for token in nonstopwords_text:
        arquivo.write(token + "\n")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Lematização
---

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# Lemmatize every token that survived stop-word removal, using the
# lemmatizer's default arguments.
lemmatizer = WordNetLemmatizer()
lemmatized_text = list(map(lemmatizer.lemmatize, nonstopwords_text))

# One lemma per line.
with open("Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt", "w", encoding="utf-8") as saida:
    saida.writelines(lemma + "\n" for lemma in lemmatized_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Stemming
---

Porter Stemmer

In [None]:
from nltk.stem.porter import PorterStemmer

# Apply the Porter stemmer on top of the lemmatized tokens.
stemmer = PorterStemmer()
tokens = list(map(stemmer.stem, lemmatized_text))

# One stem per line.
with open("Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01.txt", "w", encoding="utf-8") as saida:
    saida.writelines(stem + "\n" for stem in tokens)

Snowball Stemmer

In [None]:
from nltk.stem.snowball import SnowballStemmer

# Apply the English Snowball stemmer on top of the lemmatized tokens.
stemmer = SnowballStemmer("english")
tokens = []
for lemma in lemmatized_text:
    tokens.append(stemmer.stem(lemma))

# One stem per line.
with open("Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02.txt", "w", encoding="utf-8") as saida:
    saida.write("".join(stem + "\n" for stem in tokens))

# CSV Lematizador
---

In [None]:
import csv
from collections import Counter

input_file = "Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt"
output_file = "Shakespeare_Vocabulary_Lemmatized.csv"

# Read one token per line, skipping lines that are blank once stripped.
tokens = []
with open(input_file, "r", encoding="utf-8") as origem:
    for linha in origem:
        limpo = linha.strip()
        if limpo:
            tokens.append(limpo)

# Count how often each token occurs.
token_counts = Counter(tokens)

# One row per distinct token: (token, occurrences, length in characters).
data = [(tok, token_counts[tok], len(tok)) for tok in token_counts]

# Header row followed by the vocabulary rows.
with open(output_file, "w", newline="", encoding="utf-8") as destino:
    writer = csv.writer(destino)
    writer.writerows([["Token", "Número de Ocorrências", "Tamanho (Caracteres)"], *data])

print(f"Arquivo CSV '{output_file}' gerado com sucesso!")


Arquivo CSV 'Shakespeare_Vocabulary_Lemmatized.csv' gerado com sucesso!


# CSV Porter Stemmer
---

In [None]:
input_file = "Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01.txt"
output_file = "Shakespeare_Vocabulary_Porter.csv"

# Read one Porter stem per line, skipping lines that are blank once stripped.
with open(input_file, "r", encoding="utf-8") as origem:
    stripped_lines = (linha.strip() for linha in origem)
    tokens = [tok for tok in stripped_lines if tok]

# Count how often each token occurs.
token_counts = Counter(tokens)

# One row per distinct token: (token, occurrences, length in characters).
data = []
for tok, ocorrencias in token_counts.items():
    data.append((tok, ocorrencias, len(tok)))

# Header row followed by the vocabulary rows.
with open(output_file, "w", newline="", encoding="utf-8") as destino:
    writer = csv.writer(destino)
    writer.writerow(["Token", "Número de Ocorrências", "Tamanho (Caracteres)"])
    for row in data:
        writer.writerow(row)

print(f"Arquivo CSV '{output_file}' gerado com sucesso!")


Arquivo CSV 'Shakespeare_Vocabulary_Porter.csv' gerado com sucesso!


# CSV Snowball Stemmer
---

In [None]:
input_file = "Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02.txt"
output_file = "Shakespeare_Vocabulary_Snow.csv"

# Read one Snowball stem per line, skipping lines that are blank once stripped.
with open(input_file, "r", encoding="utf-8") as origem:
    tokens = list(filter(None, (linha.strip() for linha in origem)))

# Count how often each token occurs.
token_counts = Counter(tokens)

# One row per distinct token: (token, occurrences, length in characters).
data = [(tok, ocorrencias, len(tok)) for tok, ocorrencias in token_counts.items()]

# Header row followed by the vocabulary rows.
cabecalho = ["Token", "Número de Ocorrências", "Tamanho (Caracteres)"]
with open(output_file, "w", newline="", encoding="utf-8") as destino:
    writer = csv.writer(destino)
    writer.writerow(cabecalho)
    writer.writerows(data)

print(f"Arquivo CSV '{output_file}' gerado com sucesso!")


Arquivo CSV 'Shakespeare_Vocabulary_Snow.csv' gerado com sucesso!


# Txt Comparativo
---

In [None]:
from collections import Counter

# Input files and the section title used for each one in the report.
files = [
    ("Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt", "LEMATIZADOR (WordNet Lemmatizer)"),
    ("Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01.txt", "STEMMING (Porter Stemmer)"),
    ("Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02.txt", "STEMMING (Snowball Stemmer)")
]

output_txt = "Shakespeare_Vocabulary_Analysis.txt"

# Open the report once in write mode. This truncates any previous report and
# replaces the earlier "truncate, then reopen in append mode for every
# section" sequence.
with open(output_txt, "w", encoding="utf-8") as report:
    for input_file, title in files:
        # One token per line; lines that are blank once stripped are ignored.
        with open(input_file, "r", encoding="utf-8") as file:
            tokens = [line.strip() for line in file if line.strip()]

        # Count how often each token occurs.
        token_counts = Counter(tokens)

        # `total_tokens` is the vocabulary size, i.e. the number of distinct tokens.
        total_tokens = len(token_counts)
        if total_tokens:
            avg_occurrences = sum(token_counts.values()) / total_tokens
            avg_length = sum(len(token) for token in token_counts) / total_tokens
        else:
            # An empty input file has no vocabulary: report zeros rather than
            # failing with ZeroDivisionError.
            avg_occurrences = 0.0
            avg_length = 0.0

        report.write(f"---- {title} ------\n")
        report.write(f"Tamanho do vocabulário: {total_tokens}\n")
        report.write(f"Número médio de ocorrências: {avg_occurrences:.2f}\n")
        report.write(f"Tamanho médio dos tokens: {avg_length:.2f}\n")
        report.write("\n")

print(f"Arquivo '{output_txt}' gerado com sucesso!")


Arquivo 'Shakespeare_Vocabulary_Analysis.txt' gerado com sucesso!
