In [None]:
from __future__ import annotations
import sys; sys.path.insert(0, '..')

%load_ext autoreload
%autoreload 2

# python
import ast
import csv
import os
import ssl

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import session_info

from pathlib import Path
from inspect import cleandoc

# utils
from utils import Constants

# text
import re
import spacy
import unidecode

# stat
from scipy import stats
from nltk.stem.porter import PorterStemmer

# statsmodels
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# typings
from pandas import DataFrame as PandasDF
from typing import List, Dict, Union

# Plot style plus pandas / numpy display configuration for the whole notebook.
plt.style.use('seaborn-v0_8')

# Pandas display: show up to 200 rows and lift the column-count, width and
# cell-content limits (None disables the corresponding limit).
for option_name, option_value in (
    ('display.max_rows', 200),
    ('display.max_columns', None),
    ('display.width', None),
    ('max_colwidth', None),
):
    pd.set_option(option_name, option_value)

# Floating-point print precision for numpy arrays.
np.set_printoptions(precision=6)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# Outside CI, replace the default HTTPS context factory with the unverified one.
# NOTE(review): this disables TLS certificate verification process-wide;
# presumably it exists to let resource downloads succeed -- confirm it is still needed.
if not os.getenv('CI'):
    ssl._create_default_https_context = ssl._create_unverified_context

# Absolute paths, anchored on the parent of the current working directory.
here: Path = Path.cwd().absolute().parent
data: Path = here.joinpath('data')
poetry_fundation_data: Path = data.joinpath('PoetryFoundationData.csv')
poetry_fundation_cleaned: Path = data.joinpath('CleanedPoetryFoundationData.csv')

# NLP resources. The spaCy models can be installed with:
#   python -m spacy download en_core_web_sm
#   python -m spacy download en_core_web_lg
ps = PorterStemmer()
nlp = spacy.load("en_core_web_lg")

In [34]:
# Fail early, with download instructions, when the raw dataset is not on disk.
if not poetry_fundation_data.is_file():
    raise FileNotFoundError(
        cleandoc(f'''
        El archivo {poetry_fundation_data} no existe.
        Por favor, descargue el archivo desde:
        https://www.kaggle.com/datasets/abhinavwalia95/poetryfoundationorg
        y coloquelo en la carpeta data.
        ''')
    )

# Reader options for the raw CSV (separator and encoding come from utils.Constants).
setup:Dict = dict(sep=Constants.SEP, encoding=Constants.ENCODING)

poetry_df: PandasDF = (
    pd.read_csv(
        poetry_fundation_data,
        # Skip the unnamed index column exported alongside the data.
        usecols=lambda col: 'unnamed' not in col.lower(),
        **setup
    )
    # Keep only rows where every column has a value.
    .dropna()
    .reset_index(drop=True)
    # Normalise headers: trimmed, lower-case, spaces replaced by underscores.
    .rename(columns=lambda name: name.strip().lower().replace(' ', '_'))
)

# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
poetry_df.info()
display(poetry_df.head(2))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12899 entries, 0 to 12898
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   12899 non-null  object
 1   poem    12899 non-null  object
 2   poet    12899 non-null  object
 3   tags    12899 non-null  object
dtypes: object(4)
memory usage: 403.2+ KB
None


Unnamed: 0,title,poem,poet,tags
0,\r\r\n Invisible Fish\r\r\n,"\r\r\nInvisible fish swim this ghost ocean now described by waves of sand, by water-worn rock. Soon the fish will learn to walk. Then humans will come ashore and paint dreams on the dying stone. Then later, much later, the ocean floor will be punctuated by Chevy trucks, carrying the dreamers’ decendants, who are going to the store.\r\r\n",Joy Harjo,"Living,Time & Brevity,Relationships,Family & Ancestors,Nature,Landscapes & Pastorals,Seas, Rivers, & Streams,Social Commentaries,History & Politics"
1,\r\r\n Don’t Bother the Earth Spirit\r\r\n,"\r\r\nDon’t bother the earth spirit who lives here. She is working on a story. It is the oldest story in the world and it is delicate, changing. If she sees you watching she will invite you in for coffee, give you warm bread, and you will be obligated to stay and listen. But this is no ordinary story. You will have to endure earthquakes, lightning, the deaths of all those you love, the most blinding beauty. It’s a story so compelling you may never want to leave; this is how she traps you. See that stone finger over there? That is the only one who ever escaped.\r\r\n",Joy Harjo,"Religion,The Spiritual,Mythology & Folklore,Fairy-tales & Legends"


In [35]:
def clean(text:str)->str:
    """Normalise raw text to lower-case ASCII letters separated by single spaces.

    Steps, in order: transliterate to ASCII with ``unidecode``, remove URL-like
    tokens (anything starting at ``http`` or ``www`` up to the next whitespace),
    remove every character that is not an ASCII letter, digit or whitespace,
    remove digits, collapse whitespace runs and trim, then lower-case.

    Removed characters are replaced by ``Constants.EMPTY_STR``, so words joined
    by punctuation are merged (e.g. "water-worn" becomes "waterworn").

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    text = unidecode.unidecode(text)
    text = re.sub(r'http\S+', Constants.EMPTY_STR, text)  # remove URLs
    text = re.sub(r'www\S+', Constants.EMPTY_STR, text)   # remove URLs
    # This pass already removes brackets and quotes along with all other
    # punctuation, so no separate bracket/quote substitution is needed.
    text = re.sub(r'[^A-Za-z0-9\s]', Constants.EMPTY_STR, text)  # remove special chars
    text = re.sub(r'\d+', Constants.EMPTY_STR, text)  # remove digits
    text = re.sub(r'\s+', Constants.SPACE_STR, text).strip()  # remove extra spaces
    return text.lower()

def remove_stopwords(text:str)->str:
    """Drop stop words and non-alphabetic tokens, then stem what is left.

    The text is tokenised with the module-level spaCy pipeline ``nlp``; each
    surviving token is reduced with the module-level Porter stemmer ``ps``.

    Args:
        text: The string to filter.

    Returns:
        The stemmed tokens joined with ``Constants.SPACE_STR``.
    """
    stems = []
    for token in nlp(text):
        # Skip stop words, anything that is not purely alphabetic, and punctuation.
        if token.is_stop or not token.is_alpha or token.is_punct:
            continue
        stems.append(ps.stem(token.text))
    return Constants.SPACE_STR.join(stems)

def lematize(text:str)->str:
    """Replace every token of ``text`` with its spaCy lemma.

    Args:
        text: The string to lemmatise.

    Returns:
        The lemmas of all tokens, joined with ``Constants.SPACE_STR``.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return Constants.SPACE_STR.join(lemmas)

def preprocess(text:str)->str:
    """Run the full text pipeline: clean, filter and stem, then lemmatise.

    NOTE(review): lemmatisation is applied to tokens that were already stemmed
    by ``remove_stopwords`` -- confirm that this ordering is intended.

    Args:
        text: The raw string to process.

    Returns:
        The fully processed string.
    """
    return lematize(remove_stopwords(clean(text)))

In [36]:
def preprocess_tags(tags_column: pd.Series) -> pd.Series:
    """Turn each comma-separated tag string into a list of cleaned tags.

    Every value is split on ``Constants.COMMA_STR``, each piece goes through
    ``clean``, and pieces that end up empty are discarded. Because the split is
    on every comma, a tag that itself contains commas is broken into several
    entries (e.g. "Seas, Rivers, & Streams" yields "seas", "rivers", "streams").

    Args:
        tags_column: Series of raw tag strings.

    Returns:
        A Series, aligned with the input, whose values are lists of tag strings.
    """
    def _split_and_clean(raw_tags):
        cleaned = (clean(piece) for piece in raw_tags.split(Constants.COMMA_STR))
        # Drop pieces that became empty after cleaning.
        return [tag for tag in cleaned if tag]

    return tags_column.apply(_split_and_clean)

In [37]:
# CSV options shared by the cache reader below and the cell that writes the
# cache: custom separator, and every non-numeric field wrapped in double quotes.
setup_load:Dict = dict(
    sep=Constants.PIPE_STR,
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
    encoding=Constants.ENCODING
)

if not poetry_fundation_cleaned.is_file():
    # No cache yet: run the text pipeline on every text column except 'tags',
    # which has its own list-producing cleaner. Selecting by name (rather than
    # "all but the last column") does not depend on the column order.
    object_cols = poetry_df.select_dtypes(include=object).columns
    cols_to_process = [col for col in object_cols if col != 'tags']

    poetry_df[cols_to_process] = (
        poetry_df[cols_to_process]
        .apply(lambda col: col.astype(str).apply(preprocess))
    )

    poetry_df['tags'] = preprocess_tags(poetry_df['tags'])

else:
    poetry_df: PandasDF = (
        pd.read_csv(
            poetry_fundation_cleaned,
            # Keep cleaned text exactly as written: without this, empty strings
            # and words such as "na" or "null" would be read back as NaN.
            keep_default_na=False,
            **setup_load
        )
    )
    # The cache stores each tag list as its Python repr; parse it back so both
    # branches yield real lists of strings.
    poetry_df['tags'] = poetry_df['tags'].apply(ast.literal_eval)

# Discard rows whose poem is empty after cleaning. preprocess() returns '' (never
# NaN) in that case, so dropna() could not catch these rows. Applied to both
# branches so a fresh run and a cached run produce the same frame.
poetry_df = poetry_df[poetry_df['poem'].ne('')].reset_index(drop=True)

In [38]:
# Preview the first two rows of the frame produced by the previous cell
# (freshly preprocessed, or loaded from the cleaned CSV).
display(poetry_df.head(2))

Unnamed: 0,title,poem,poet,tags
0,invis fish,invis fish swim ghost ocean describ wave sand waterworn rock soon fish learn walk human come ashor paint dream die stone later later ocean floor punctuat chevi truck carri dreamer decend go store,joy harjo,"[living, time brevity, relationships, family ancestors, nature, landscapes pastorals, seas, rivers, streams, social commentaries, history politics]"
1,not bother earth spirit,not bother earth spirit live work stori oldest stori world delic chang see watch invit coffe warm bread oblig stay listen ordinari stori endur earthquak lightn death love blind beauti stori compel want leav trap stone finger escap,joy harjo,"[religion, the spiritual, mythology folklore, fairytales legends]"


In [39]:
# Write the cleaned frame to disk only when no cleaned CSV exists yet; an
# existing file is left untouched. Uses the same CSV options as the reader.
if not poetry_fundation_cleaned.is_file():
    poetry_df.to_csv(str(poetry_fundation_cleaned), index=False, **setup_load)