In [1]:
import pandas as pd
import re
import os

In [2]:
import nltk

In [None]:
from nltk.corpus import stopwords

In [3]:
!pip install nltk



In [4]:
# Descargar el conjunto de stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [10]:
# Seleccionar el idioma (por ejemplo, español o inglés)
stop_words = set(stopwords.words('english'))

In [11]:
#Path a los archivos
training_path = "../data/reuters/training"
test_path = "../data/reuters/test"

In [12]:
#Función para recorrer los directorios
def load_files_from_directory(directory_path):
    files_content = []
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='latin-1') as file:
            content = file.read()
            files_content.append(content)
    return files_content

In [20]:
#Cargar archivos
training_data = []
test_data = []

training_data = load_files_from_directory(training_path)
test_data = load_files_from_directory(test_path)

In [25]:
def clean_text(text):
    # Convertir a minúsculas
    text = text.lower()
    
    # Eliminar caracteres especiales y signos de puntuación
    text = re.sub(r'[,;.:¡!¿?@#$%&[\](){}<>~=+\-*/|\\_^`"\']', ' ', text)
    
    # Eliminar números
    text = re.sub(r'\d+', ' ', text)
    
    # Eliminar espacios adicionales
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


In [26]:

# Aplicar la función de limpieza a cada documento en training_data
cleaned_training_data = [clean_text(doc) for doc in training_data]
cleaned_test_data = [clean_text(doc) for doc in test_data]


In [27]:
cleaned_training_data

['bahia cocoa review showers continued throughout the week in the bahia cocoa zone alleviating the drought since early january and improving prospects for the coming temporao although normal humidity levels have not been restored comissaria smith said in its weekly review the dry period means the temporao will be late this year arrivals for the week ended february were bags of kilos making a cumulative total for the season of mln against at the same stage last year again it seems that cocoa delivered earlier on consignment was included in the arrivals figures comissaria smith said there is still some doubt as to how much old crop cocoa is still available as harvesting has practically come to an end with total bahia crop estimates around mln bags and sales standing at almost mln there are a few hundred thousand bags still in the hands of farmers middlemen exporters and processors there are doubts as to how much of this cocoa would be fit for export as shippers are now experiencing dific

In [32]:

from nltk.tokenize import word_tokenize

nltk.download('punkt')

def tokenize_text(text):
    # Tokenizar el texto
    tokens = word_tokenize(text)
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
# Aplicar la tokenización a cada documento en cleaned_training_data
tokenized_training_data = [tokenize_text(doc) for doc in cleaned_training_data]
tokenized_test_data = [tokenize_text(doc) for doc in cleaned_test_data]

In [37]:
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    # Eliminar palabras vacías
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
filtered_training_data = [remove_stopwords(doc) for doc in tokenized_training_data]
filtered_test_data = [remove_stopwords(doc) for doc in tokenized_test_data]

In [41]:
filtered_training_data[1]

['computer',
 'terminal',
 'systems',
 'lt',
 'cpml',
 'completes',
 'sale',
 'computer',
 'terminal',
 'systems',
 'inc',
 'said',
 'completed',
 'sale',
 'shares',
 'common',
 'stock',
 'warrants',
 'acquire',
 'additional',
 'one',
 'mln',
 'shares',
 'lt',
 'sedio',
 'n',
 'v',
 'lugano',
 'switzerland',
 'dlrs',
 'company',
 'said',
 'warrants',
 'exercisable',
 'five',
 'years',
 'purchase',
 'price',
 'dlrs',
 'per',
 'share',
 'computer',
 'terminal',
 'said',
 'sedio',
 'also',
 'right',
 'buy',
 'additional',
 'shares',
 'increase',
 'total',
 'holdings',
 'pct',
 'computer',
 'terminal',
 'outstanding',
 'common',
 'stock',
 'certain',
 'circumstances',
 'involving',
 'change',
 'control',
 'company',
 'company',
 'said',
 'conditions',
 'occur',
 'warrants',
 'would',
 'exercisable',
 'price',
 'equal',
 'pct',
 'common',
 'stock',
 'market',
 'price',
 'time',
 'exceed',
 'dlrs',
 'per',
 'share',
 'computer',
 'terminal',
 'also',
 'said',
 'sold',
 'technolgy',
 'rights'

In [46]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

# Descargar el paquete wordnet para lematización
nltk.download('wordnet')
nltk.download('omw-1.4')

# Inicializar el stemmer y lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Aplicar stemming a cada documento
stemmed_training_data = [[stemmer.stem(word) for word in doc] for doc in filtered_training_data]
stemmed_test_data = [[stemmer.stem(word) for word in doc] for doc in filtered_test_data]

# Aplicar lematización a cada documento
lemmatized_training_data = [[lemmatizer.lemmatize(word) for word in doc] for doc in filtered_training_data]
lemmatized_test_data = [[lemmatizer.lemmatize(word) for word in doc] for doc in filtered_test_data]


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...


In [48]:
stemmed_training_data[0]

['bahia',
 'cocoa',
 'review',
 'shower',
 'continu',
 'throughout',
 'week',
 'bahia',
 'cocoa',
 'zone',
 'allevi',
 'drought',
 'sinc',
 'earli',
 'januari',
 'improv',
 'prospect',
 'come',
 'temporao',
 'although',
 'normal',
 'humid',
 'level',
 'restor',
 'comissaria',
 'smith',
 'said',
 'weekli',
 'review',
 'dri',
 'period',
 'mean',
 'temporao',
 'late',
 'year',
 'arriv',
 'week',
 'end',
 'februari',
 'bag',
 'kilo',
 'make',
 'cumul',
 'total',
 'season',
 'mln',
 'stage',
 'last',
 'year',
 'seem',
 'cocoa',
 'deliv',
 'earlier',
 'consign',
 'includ',
 'arriv',
 'figur',
 'comissaria',
 'smith',
 'said',
 'still',
 'doubt',
 'much',
 'old',
 'crop',
 'cocoa',
 'still',
 'avail',
 'harvest',
 'practic',
 'come',
 'end',
 'total',
 'bahia',
 'crop',
 'estim',
 'around',
 'mln',
 'bag',
 'sale',
 'stand',
 'almost',
 'mln',
 'hundr',
 'thousand',
 'bag',
 'still',
 'hand',
 'farmer',
 'middlemen',
 'export',
 'processor',
 'doubt',
 'much',
 'cocoa',
 'would',
 'fit',
 'ex