In [37]:
import os

# Preparacion de los Corpus
## Eliminacion de las cabeceras

In [38]:
CORPUS_MODIFICATION_DONE = True

In [39]:
path = os.getcwd() + "/Corpus-representacion"
# Obtener el listado
listado = os.listdir(path)

corpus_dir = [] # Guardamos el directorio para cada fichero
for elemento in listado:
    corpus_dir.append(path +'/'+elemento)

In [40]:
def read_file_without_header(filename):
    with open(filename, "r") as archivo:
        lineas = archivo.readlines()
        start_read_file = False
        text_without_header = ""
        for linea in lineas:
            if linea.startswith("Lines:"):
                start_read_file = True
                continue
            if start_read_file:
                text_without_header += linea.strip()
        return text_without_header

def write_file(filename, text):
    """
    Escribe texto en un fichero, sobrescribiendo su contenido existente.
    Si el fichero no existe, lo crea.

    Args:
        filename (str): Nombre del fichero.
        text (str): Texto a escribir en el fichero.
    """
    try:
        with open(filename, 'w') as file:  # 'w' mode sobrescribe el contenido
            file.write(text)  # Escribe el texto proporcionado
        print(f"Texto sobrescrito correctamente en {filename}.")
    except Exception as e:
        print(f"Error al escribir en el fichero: {e}")


In [41]:
all_corpus_files = []
for dir in corpus_dir:
    corpus_files = os.listdir(dir)
    for corpus in corpus_files:
        all_corpus_files.append(dir +'/'+corpus)

In [42]:
if not CORPUS_MODIFICATION_DONE:
    for corpus in all_corpus_files:
        text_without_header = read_file_without_header(dir +'/'+corpus)
        write_file(dir +'/'+corpus, text_without_header)

Hay un único archivo que parecía estar mal configurado, identificado como 101596 de rec.autos. En la fotografía se observa que incluso el propio editor de código no logra identificar el error exacto, por lo que los caracteres especiales se han eliminado manualmente.


![Descripción de la imagen](./images/image.png)

# Tokenizacion

In [59]:
import re
import nltk
from nltk.tokenize import word_tokenize

words = []
for corpus in all_corpus_files:
    f = open(corpus)
    raw = f.read()
    # make all words to lowercase and remove specials characters
    raw = re.sub(r'[^\w\s]', '', raw.lower())

    all_file = word_tokenize(raw)
    for word in all_file:
        words.append(word)

print(len(words))
print(words[0:5])

193864
['messageid', 'flax93apr6125933frejteknikumuusereferences', '1993mar30142700543vmshujiacil', 'flax93apr3142133frejteknikumuuseflax93apr5224449frejteknikumuuse1993apr522175928472thundermcrcimmcgilledunntppostinghost', 'frejteknikumuuseinreplyto']


In [53]:
# remove all words that contain a number
bag_of_words = [word for word in words if not re.search(r'\d', word)]
bag_of_words[0:5]

['messageid',
 'frejteknikumuuseinreplyto',
 'hasanmcrcimmcgilledu',
 's',
 'message']

In [54]:
# Filtrar palabras según la longitud
bag_of_words = [word for word in bag_of_words if 2 <= len(word) <= 15]
bag_of_words[0:5]

['messageid', 'message', 'of', 'mon', 'apr']

In [57]:
from nltk.corpus import words as nltk_words
nltk.download('words')

valid_words = set(nltk_words.words())
bag_of_words = [word for word in bag_of_words if word in valid_words]
bag_of_words[0:5]

[nltk_data] Downloading package words to
[nltk_data]     /Users/maciasalvasalva/nltk_data...
[nltk_data]   Package words is already up-to-date!


['message', 'of', 'mon', 'article', 'stuff']

In [58]:
len(bag_of_words)

133820

# Stop-Words

In [24]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
# numeros, caracteres especiales, palabras que no existen, etc..

filtered_bag_of_words = []
for word in bag_of_words:
    if word not in stop_words:
        filtered_bag_of_words.append(word)

print(filtered_bag_of_words[10:50])
print(len(filtered_bag_of_words))

['93', '221759', 'gmtin', 'article', '1993apr522175928472thundermcrcimmcgilledu', 'hasanmcrcimmcgilledu', 'writes', 'stuff', 'deleted', 'wrote', 'calling', 'names', 'giving', 'title', 'first', 'read', 'paragraph', 'accept', 'title', 'order', 'let', 'get', 'um', 'well', 'debate', 'againhasan', 'repliesi', 'didnot', 'know', 'master', 'wisdom', 'name', 'clling', 'toounless', 'consider', 'deserveless', 'unless', 'referring', 'someone', 'else']
114306


# Lematizacion

In [36]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('pos_tag')

# Función para mapear tipos gramaticales
def get_wordnet_pos(word):
    from nltk.corpus import wordnet
    from nltk import pos_tag
    
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Inicializamos el lematizador
lemmatizer = WordNetLemmatizer()

# Aplicamos la lematización considerando el tipo gramatical
final_bag_of_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in filtered_bag_of_words]

print(final_bag_of_words)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/maciasalvasalva/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/maciasalvasalva/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Error loading pos_tag: Package 'pos_tag' not found in
[nltk_data]     index


