In [16]:
from core.goodreads import GoodReadsData
import gc

goodreads = GoodReadsData()
filename = goodreads.file_names[7]
filename

'goodreads_reviews_spoiler_raw'

In [17]:
gc.collect()

566

# Análisis de `goodreads_reviews_spoiler_raw`

In [18]:
import os
import pandas as pd
import numpy as np

In [19]:
# Download
if not os.path.exists(goodreads.get_file_path(filename)):
    goodreads.download_file(filename)
    
goodreads.count_samples(filename)

1378033

In [20]:
# Load
df = goodreads.load_file_range(filename, (0, 99999))

### Análisis
1. Comprensión de datos.
2. Detección de valores `nan`.
3. Detección de errores.
4. Detección de duplicados.

In [21]:
df.sample(5)

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
66817,3657976d84f4b678f5c272c04d2e7544,16156709,dfc20ddab828fc002a984ff26ff0cc1f,4,I liked Beneath This Man a lot better than I d...,Wed Jan 30 21:34:11 -0800 2013,Fri Feb 15 08:23:33 -0800 2013,Fri Feb 15 08:23:33 -0800 2013,Wed Jan 30 00:00:00 -0800 2013,0,0
6270,dd85d1745fe91b9984dc1c39efc8f257,10757833,daae18aa7960f3e5519318ac98731671,2,"It was decent, long and seemed to take forever...",Sat Apr 06 11:57:11 -0700 2013,Wed Apr 24 20:46:27 -0700 2013,Wed Apr 24 00:00:00 -0700 2013,Wed Apr 10 00:00:00 -0700 2013,0,0
64547,aefe566b7b0715a77f19883c060ad241,39033,4ac43ac2565d6aa049bb9ef2a1522f88,4,"Not one of the strongest books in the series, ...",Sun Sep 02 12:57:10 -0700 2012,Sun Sep 02 12:57:38 -0700 2012,Thu Sep 02 00:00:00 -0700 2010,,1,0
48187,d037f700c44eaf24d313bd997643abac,11838803,4944ff87d5d07ff6183143acc2f3320f,5,I love this series like a chocaholic loves cho...,Sun Nov 18 10:26:24 -0800 2012,Wed Nov 21 14:31:23 -0800 2012,Sun Nov 18 00:00:00 -0800 2012,Sat Nov 17 00:00:00 -0800 2012,0,0
24769,e9bdfc785d06e08334632692ffa48d50,96358,3ba39d1a88383994454ef0f353d8ea6e,4,I watched and listened to this as a wonderfull...,Wed Jul 15 12:17:36 -0700 2015,Tue Aug 04 11:32:31 -0700 2015,Wed Jul 15 00:00:00 -0700 2015,,0,0


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       100000 non-null  object
 1   book_id       100000 non-null  object
 2   review_id     100000 non-null  object
 3   rating        100000 non-null  int64 
 4   review_text   100000 non-null  object
 5   date_added    100000 non-null  object
 6   date_updated  100000 non-null  object
 7   read_at       100000 non-null  object
 8   started_at    100000 non-null  object
 9   n_votes       100000 non-null  int64 
 10  n_comments    100000 non-null  int64 
dtypes: int64(3), object(8)
memory usage: 8.4+ MB


In [23]:
(df == "").sum()

user_id             0
book_id             0
review_id           0
rating              0
review_text         0
date_added          0
date_updated        0
read_at          9448
started_at      32692
n_votes             0
n_comments          0
dtype: int64

In [24]:
df[["book_id", "rating", "n_votes", "n_comments"]].astype(np.int64).describe()

Unnamed: 0,book_id,rating,n_votes,n_comments
count,100000.0,100000.0,100000.0,100000.0
mean,13396470.0,3.7255,2.85335,0.95365
std,9204385.0,1.234085,12.840757,4.834416
min,1.0,0.0,-2.0,-1.0
25%,6339664.0,3.0,0.0,0.0
50%,13549820.0,4.0,0.0,0.0
75%,19358980.0,5.0,2.0,0.0
max,36328680.0,5.0,931.0,564.0


Observamos valores negativos en `n_votes` y `n_comments` erróneos que asumiremos como valor correcto su valor absoluto. También vemos valores de `rating` = 0 que consideraremos como valores ausentes (`nan`).

In [25]:
pd.to_datetime(df["date_added"], format="%a %b %d %H:%M:%S %z %Y", utc=True).min()

Timestamp('2006-08-29 18:20:05+0000', tz='UTC')

In [26]:
pd.to_datetime(df["date_added"], format="%a %b %d %H:%M:%S %z %Y", utc=True).max()

Timestamp('2017-10-27 17:15:51+0000', tz='UTC')

In [27]:
pd.to_datetime(df["date_updated"], format="%a %b %d %H:%M:%S %z %Y", utc=True).min()

Timestamp('2006-12-07 19:01:45+0000', tz='UTC')

In [28]:
pd.to_datetime(df["date_updated"], format="%a %b %d %H:%M:%S %z %Y", utc=True).max()

Timestamp('2017-11-03 14:43:26+0000', tz='UTC')

In [29]:
df["review_id"].duplicated().sum()

np.int64(0)

### Transformación
1. Eliminar columnas que no aportan información útil.
2. Correciión de erroes.
3. Transformación de tipos de datos.
4. Asignación de valores `nan`.

In [30]:
df.drop(columns=["read_at", "started_at"], inplace=True)

In [31]:
df["user_id"] = df["user_id"].str.replace(r"\s+", "", regex=True)
df["book_id"] = df["book_id"].str.replace(r"\s+", "", regex=True)
df["review_id"] = df["review_id"].str.replace(r"\s+", "", regex=True)
df["review_text"] = df["review_text"].str.strip()
df["date_added"] = df["date_added"].str.strip()
df["date_updated"] = df["date_updated"].str.strip()

df["n_votes"] = df["n_votes"].abs()
df["n_coments"] = df["n_comments"].abs()

In [32]:
r_nan = df["rating"] == 0

In [33]:
df["user_id"] = df["user_id"].astype("string")
df["book_id"] = df["book_id"].astype(np.uint32)
df["review_id"] = df["review_id"].astype("string")
df["review_text"] = df["review_text"].astype("string")
df["date_added"] = pd.to_datetime(df["date_added"], format="%a %b %d %H:%M:%S %z %Y", utc=True)
df["date_updated"] = pd.to_datetime(df["date_updated"], format="%a %b %d %H:%M:%S %z %Y", utc=True)

df["n_votes"] = df["n_votes"].astype(np.uint16)
df["n_coments"] = df["n_comments"].astype(np.uint16)
df["rating"] = df["rating"].astype(np.uint8)

In [34]:
df.loc[r_nan, "rating"] = np.nan
df.isna().sum()

user_id            0
book_id            0
review_id          0
rating          3077
review_text        0
date_added         0
date_updated       0
n_votes            0
n_comments         0
n_coments          0
dtype: int64

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype              
---  ------        --------------   -----              
 0   user_id       100000 non-null  string             
 1   book_id       100000 non-null  uint32             
 2   review_id     100000 non-null  string             
 3   rating        96923 non-null   float64            
 4   review_text   100000 non-null  string             
 5   date_added    100000 non-null  datetime64[ns, UTC]
 6   date_updated  100000 non-null  datetime64[ns, UTC]
 7   n_votes       100000 non-null  uint16             
 8   n_comments    100000 non-null  int64              
 9   n_coments     100000 non-null  uint16             
dtypes: datetime64[ns, UTC](2), float64(1), int64(1), string(3), uint16(2), uint32(1)
memory usage: 6.1 MB
