In [1]:
from core.goodreads import GoodReadsData
import gc

# Build the project's GoodReads data helper; later cells use it to locate,
# download and load the dataset file.
goodreads = GoodReadsData()
# Index 4 of `file_names` is the dataset analysed here; the cell output shows it
# resolves to 'goodreads_book_genres_initial'.
filename = goodreads.file_names[4]
# Bare last expression so the notebook displays the selected name.
filename

'goodreads_book_genres_initial'

In [2]:
# Run a full garbage-collection pass; the displayed integer is gc.collect()'s
# return value (the number of unreachable objects it found).
gc.collect()

0

# Análisis de `goodreads_book_genres_initial`

In [3]:
import os
import numpy as np

In [4]:
# Fetch the raw file only when there is no local copy yet.
local_path = goodreads.get_file_path(filename)
already_downloaded = os.path.exists(local_path)
if not already_downloaded:
    goodreads.download_file(filename)

# Load the dataset through the project loader into the working frame `df`.
df = goodreads.load_file(filename)

### Análisis
1. Comprensión de los datos.
2. Detección de errores.
3. Ver duplicados.

In [5]:
# Peek at five random rows. No random_state is passed, so a re-run shows
# different rows than the saved output.
df.sample(5)

Unnamed: 0,book_id,genres
1676998,32487673,"{'history, historical fiction, biography': 179..."
421284,24952882,"{'fantasy, paranormal': 3}"
521700,18705105,"{'fiction': 137, 'history, historical fiction,..."
1897479,30113756,"{'romance': 11, 'fantasy, paranormal': 3, 'fic..."
1205495,18246192,"{'fiction': 4, 'fantasy, paranormal': 3}"


In [6]:
# Column dtypes, row count and memory footprint of the frame as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360655 entries, 0 to 2360654
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   book_id  int64 
 1   genres   object
dtypes: int64(1), object(1)
memory usage: 36.0+ MB


In [7]:
# Count missing values per column.
df.isna().sum()

book_id    0
genres     0
dtype: int64

In [8]:
# Count rows whose `genres` container is empty. isna() cannot see these
# because an empty container is still a non-null object.
df["genres"].map(len).eq(0).sum()

np.int64(409513)

In [9]:
# Show the five rows whose `genres` entry holds the most items.
genre_sizes = df["genres"].apply(len)
largest_first = genre_sizes.sort_values(ascending=False)
df.loc[largest_first.index[:5], "genres"]

2211875    {'poetry': 35, 'romance': 20, 'fiction': 31, '...
815504     {'poetry': 35, 'romance': 20, 'fiction': 31, '...
1414828    {'poetry': 35, 'romance': 20, 'fiction': 31, '...
2887       {'poetry': 35, 'romance': 20, 'fiction': 31, '...
797604     {'poetry': 35, 'romance': 20, 'fiction': 31, '...
Name: genres, dtype: object

In [10]:
# Inspect the full `genres` value of one of the largest entries listed in the
# previous cell's output.
df.loc[2211875, "genres"]

{'poetry': 35,
 'romance': 20,
 'fiction': 31,
 'history, historical fiction, biography': 22,
 'children': 24,
 'young-adult': 4,
 'fantasy, paranormal': 2,
 'comics, graphic': 2,
 'mystery, thriller, crime': 2,
 'non-fiction': 1}

In [11]:
# The value supports key lookup: read the count stored under 'poetry'.
df.loc[2211875, "genres"]["poetry"]

35

In [12]:
# Summary statistics of `book_id`, used as a range check on the identifiers.
df["book_id"].describe()

count    2.360655e+06
mean     1.540953e+07
std      1.081897e+07
min      1.000000e+00
25%      5.979046e+06
50%      1.585681e+07
75%      2.438194e+07
max      3.653043e+07
Name: book_id, dtype: float64

In [13]:
# Number of repeated `book_id` values; 0 means every id is unique.
df["book_id"].duplicated().sum()

np.int64(0)

### Transformación
1. Transformar tipos de datos.
2. Sustituir los diccionarios de géneros vacíos por `nan`.

In [14]:
# Boolean mask of the rows whose `genres` container is empty. It is taken
# before the column is rewritten and later selects the rows set to NaN.
g_nan = df["genres"].map(len) == 0

In [16]:
def _counts_to_int(genre_counts):
    """Return a new dict with the same keys and every value passed through ``int``."""
    return {genre: int(count) for genre, count in genre_counts.items()}


# Shrink the id column to an unsigned 32-bit integer dtype.
df["book_id"] = df["book_id"].astype("uint32")
# Rebuild each genre mapping so its values are plain Python ints.
df["genres"] = df["genres"].apply(_counts_to_int)

In [17]:
# Replace the `genres` value with NaN on the rows flagged by `g_nan`, so
# empty entries are reported as missing values.
df.loc[g_nan, "genres"] = np.nan
# Re-count missing values per column to confirm the replacement.
df.isna().sum()

book_id         0
genres     409513
dtype: int64

In [18]:
# Re-check dtypes and memory usage after the conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360655 entries, 0 to 2360654
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   book_id  uint32
 1   genres   object
dtypes: object(1), uint32(1)
memory usage: 27.0+ MB
