In [1]:
from __future__ import annotations

%load_ext autoreload
%autoreload 2

# python
import os
import ssl

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import session_info

from pathlib import Path
from inspect import cleandoc
from dataclasses import dataclass
from itertools import combinations
from wordcloud import WordCloud

# text
import re
import spacy
import unidecode

# stat
from scipy import stats

# statsmodels
from statsmodels.stats.multicomp import pairwise_tukeyhsd


# typings
from pandas import DataFrame as PandasDF
from typing import List, Dict, Union

# setup
# --- notebook-wide presentation setup ---------------------------------
# matplotlib: apply the 'seaborn-v0_8' style sheet to every figure
plt.style.use('seaborn-v0_8')
# pandas: show up to 200 rows; lift the limits on column count,
# display width and per-cell text width (None = unlimited / auto)
pd.set_option(
    'display.max_rows', 200,
    'display.max_columns', None,
    'display.width', None,
    'max_colwidth', None,
)
# numpy: print floating point values with 6 digits of precision
np.set_printoptions(precision=6)

In [6]:
# Outside CI (i.e. when the CI environment variable is unset or empty),
# swap the ssl module's default HTTPS context factory for the unverified
# one, so certificate validation is skipped.
# NOTE(review): this is a process-wide change that weakens TLS checks for
# anything using the default context -- confirm it is still required.
if not os.environ.get('CI'):
    ssl._create_default_https_context = ssl._create_unverified_context

# absolute paths, anchored at the parent of the current working directory
here: Path = Path.cwd().absolute().parent
data: Path = here.joinpath('data')
poetry_fundation_data: Path = data.joinpath('PoetryFoundationData.csv')

# Load the spaCy English pipeline used by this notebook.
# The model has to be installed beforehand, e.g.:
#   python -m spacy download en_core_web_sm
#   python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

In [3]:
@dataclass(frozen=True)
class Constant:
    """Immutable collection of literal values shared across the notebook.

    Every field has a default, so the values can be read straight from the
    class (``Constant.SEP``) without creating an instance. Instances are
    frozen: assigning to a field raises ``FrozenInstanceError``.
    """

    # CSV parsing
    SEP: str = ','
    ENCODING: str = 'utf-8'

    # string helpers
    EMPTY_STR: str = ''
    SPACE_STR: str = ' '

    # small integers
    ONE: int = 1
    ZERO: int = 0

In [None]:
# Fail early, with download instructions, when the CSV is not in place.
if not poetry_fundation_data.is_file():
    raise FileNotFoundError(
        cleandoc(f'''
        El archivo {poetry_fundation_data} no existe.
        Por favor, descargue el archivo desde:
        https://www.kaggle.com/datasets/abhinavwalia95/poetryfoundationorg
        y coloquelo en la carpeta data.
        ''')
    )

# keyword arguments forwarded to pd.read_csv
setup:Dict = dict(sep=Constant.SEP, encoding=Constant.ENCODING)

poetry_df: PandasDF = pd.read_csv(poetry_fundation_data, **setup)
# Normalise the column labels: trim surrounding whitespace, lower-case,
# and replace inner spaces with underscores.
poetry_df.columns = (
    poetry_df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
poetry_df.info()
display(poetry_df.head(3))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13854 entries, 0 to 13853
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   unnamed:_0  13854 non-null  int64 
 1   title       13854 non-null  object
 2   poem        13854 non-null  object
 3   poet        13854 non-null  object
 4   tags        12899 non-null  object
dtypes: int64(1), object(4)
memory usage: 541.3+ KB
None


Unnamed: 0,unnamed:_0,title,poem,poet,tags
0,0,\r\r\n Objects Used to Prop Open a Window\r\r\n,"\r\r\nDog bone, stapler,\r\r\ncribbage board, garlic press\r\r\n because this window is loose—lacks\r\r\nsuction, lacks grip.\r\r\nBungee cord, bootstrap,\r\r\ndog leash, leather belt\r\r\n because this window had sash cords.\r\r\nThey frayed. They broke.\r\r\nFeather duster, thatch of straw, empty\r\r\nbottle of Elmer's glue\r\r\n because this window is loud—its hinges clack\r\r\nopen, clack shut.\r\r\nStuffed bear, baby blanket,\r\r\nsingle crib newel\r\r\n because this window is split. It's dividing\r\r\nin two.\r\r\nVelvet moss, sagebrush,\r\r\nwillow branch, robin's wing\r\r\n because this window, it's pane-less. It's only\r\r\na frame of air.\r\r\n",Michelle Menting,
