In [8]:
from core.goodreads import GoodReadsData
import gc

# Build the project's Goodreads helper and select the dataset analysed below.
goodreads = GoodReadsData()
# Index 1 of `file_names` evaluates to 'goodreads_book_authors' in the recorded output.
# NOTE(review): positional lookup — presumably the order of `file_names` is fixed; confirm.
filename = goodreads.file_names[1]
# Bare expression so the notebook displays the selected name.
filename

'goodreads_book_authors'

In [9]:
# Force a full garbage-collection pass before loading data; the cell output is
# the number of unreachable objects the collector found.
gc.collect()

0

# Análisis de `goodreads_book_authors`

In [10]:
import os
import pandas as pd
import numpy as np

In [11]:
# Fetch the raw file only when no local copy exists yet.
dataset_path = goodreads.get_file_path(filename)
already_downloaded = os.path.exists(dataset_path)
if not already_downloaded:
    goodreads.download_file(filename)

# Read the selected file into `df` (a pandas DataFrame, per the df.info() output).
df = goodreads.load_file(filename)

In [12]:
# Run the garbage collector again right after loading; the cell output is the
# number of unreachable objects it found.
gc.collect()

17

### Análisis
1. Comprender datos.
2. Detectar valores ausentes.
3. Detectar duplicados.
4. Detectar valores erróneos.

In [13]:
# Preview the first rows to see what each column holds.
df.head()

Unnamed: 0,average_rating,author_id,text_reviews_count,name,ratings_count
0,3.98,604031,7,Ronald J. Fields,49
1,4.08,626222,28716,Anita Diamant,546796
2,3.92,10333,5075,Barbara Hambly,122118
3,3.68,9212,36262,Jennifer Weiner,888522
4,3.82,149918,96,Nigel Pennick,1740


In [14]:
# Column dtypes, non-null counts and memory footprint of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 829529 entries, 0 to 829528
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   average_rating      829529 non-null  float64
 1   author_id           829529 non-null  int64  
 2   text_reviews_count  829529 non-null  int64  
 3   name                829529 non-null  object 
 4   ratings_count       829529 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 31.6+ MB


In [15]:
# Count empty-string cells per column: these are not reported as null by
# df.info(), so they have to be looked for explicitly.
df.eq("").sum()

average_rating        0
author_id             0
text_reviews_count    0
name                  5
ratings_count         0
dtype: int64

In [16]:
# Summary statistics of the numeric columns, used to spot out-of-range values.
df.describe()

Unnamed: 0,average_rating,author_id,text_reviews_count,ratings_count
count,829529.0,829529.0,829529.0,829529.0
mean,3.844779,5751610.0,106.865331,1595.326
std,0.603013,5129977.0,1770.225828,44796.69
min,0.0,3.0,0.0,0.0
25%,3.58,932718.0,2.0,8.0
50%,3.9,4952564.0,6.0,31.0
75%,4.17,7838936.0,20.0,131.0
max,5.0,17343370.0,448570.0,18532720.0


### Transform
1. Corregir posibles errores.
2. Transformar tipos de datos.
3. Establecer valores `nan`.

In [17]:
# Trim leading/trailing whitespace so that whitespace-only names collapse to "".
df["name"] = df["name"].str.strip()

In [18]:
# Boolean mask of the rows whose (already stripped) name is empty.
# Despite its name, `n_nan` is a per-row mask rather than a count; it is applied
# further down, once the column has been converted to the string dtype.
n_nan = df["name"].eq("")

In [19]:
# Target dtype for every column, listed in the order the conversions are applied.
# The unsigned 32-bit integers comfortably hold the maxima shown by df.describe().
# NOTE(review): float16 keeps only about three significant decimal digits —
# confirm that is acceptable for whatever consumes `average_rating` later.
target_dtypes = {
    "average_rating": np.float16,
    "author_id": np.uint32,
    "name": "string",
    "text_reviews_count": np.uint32,
    "ratings_count": np.uint32,
}

# Convert one column at a time, writing the result back into the same frame.
for column_name, target_dtype in target_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)

In [20]:
# Rows flagged by the empty-name mask become proper missing values; on a
# `string` column the missing marker is stored as <NA>.
df.loc[n_nan, "name"] = pd.NA
# Per-column tally of missing entries after the replacement.
df.isnull().sum()

average_rating        0
author_id             0
text_reviews_count    0
name                  5
ratings_count         0
dtype: int64

In [21]:
# Re-inspect dtypes, non-null counts and memory usage after the conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 829529 entries, 0 to 829528
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   average_rating      829529 non-null  float16
 1   author_id           829529 non-null  uint32 
 2   text_reviews_count  829529 non-null  uint32 
 3   name                829524 non-null  string 
 4   ratings_count       829529 non-null  uint32 
dtypes: float16(1), string(1), uint32(3)
memory usage: 17.4 MB
