In [23]:
from core.goodreads import GoodReadsData
import gc

goodreads = GoodReadsData()
filename = goodreads.file_names[11]
filename

'goodreads_interactions_dedup'

In [24]:
gc.collect()

0

# Análisis de `goodreads_interactions_dedup`

In [25]:
import os
import numpy as np
import pandas as pd

In [26]:
# Download
if not os.path.exists(goodreads.get_file_path(filename)):
    goodreads.download_file(filename)
    
goodreads.count_samples(filename)

228648342

In [27]:
df = goodreads.load_file_range(filename, (0, 99999))

### Análisis
1. Comprensión de datos
2. Detección de valores `nan`
3. Detección de erroes
4. Detección de duplicados

In [28]:
df.sample(5)

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
86501,0757e6c8076682b47d9d4dcebb6db776,17446940,4c85265afb14ccbd32784df82c89da35,False,0,,Mon May 13 11:13:23 -0700 2013,Mon May 13 11:13:25 -0700 2013,,
18321,83d6e6f80d7c32c6676b3ab3b01543cd,30233042,66fcfaf786226e1eb656e7d7c8f9df54,False,0,,Wed Dec 28 03:58:46 -0800 2016,Wed Dec 28 03:58:46 -0800 2016,,
84566,601509eb575b04b62a647536033e5c98,22752758,af3cec4fc5f87d8643b5c4ca090662ab,False,0,,Fri Jan 01 11:19:59 -0800 2016,Fri Jan 01 11:19:59 -0800 2016,,
26174,7f3fb103a5597aae9ff34e5ac49a77bb,59263,95d635b2e50c9182e342bcc0f7c525bc,True,4,,Mon Jul 09 20:07:32 -0700 2012,Mon Jul 09 20:07:40 -0700 2012,,
82823,1afe8b35c5e568e95bc17e5b5cdbfd1b,26760091,cc2cdeb31c3ae7f5c748a6a456e7e604,True,4,,Sun Jun 05 23:26:33 -0700 2016,Sun Jun 12 23:21:06 -0700 2016,Sun Jun 12 23:21:05 -0700 2016,Sun Jun 05 23:26:34 -0700 2016


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   user_id                 100000 non-null  object
 1   book_id                 100000 non-null  object
 2   review_id               100000 non-null  object
 3   is_read                 100000 non-null  bool  
 4   rating                  100000 non-null  int64 
 5   review_text_incomplete  100000 non-null  object
 6   date_added              100000 non-null  object
 7   date_updated            100000 non-null  object
 8   read_at                 100000 non-null  object
 9   started_at              100000 non-null  object
dtypes: bool(1), int64(1), object(8)
memory usage: 7.0+ MB


In [30]:
df.isna().sum()

user_id                   0
book_id                   0
review_id                 0
is_read                   0
rating                    0
review_text_incomplete    0
date_added                0
date_updated              0
read_at                   0
started_at                0
dtype: int64

In [31]:
(df == "").sum()

user_id                       0
book_id                       0
review_id                     0
is_read                       0
rating                        0
review_text_incomplete    91708
date_added                    0
date_updated                  0
read_at                   79258
started_at                83045
dtype: int64

In [32]:
df.loc[df["review_text_incomplete"]!="", "review_text_incomplete"].sample(5)

62193    I'm beginning to think there isn't a celebrity...
20013    This is a terrific collection, full of stories...
83747                     put on self finish it up someday
53860    This is probably my third time reading the Chr...
689      Seven amazing stories. Each one you think can'...
Name: review_text_incomplete, dtype: object

In [33]:
df.loc[78218, "review_text_incomplete"]

"The translation wasn't great--wish I could find it in French--but a true mountaineering classic. The medical treatments after the summit are just plain horrifying."

In [34]:
df["rating"].describe()

count    100000.000000
mean          1.722180
std           2.045776
min           0.000000
25%           0.000000
50%           0.000000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

Observamos valores mínimos a 0 que consideraremos como `nan`.

In [35]:
pd.to_datetime(df["date_added"], format="%a %b %d %H:%M:%S %z %Y", utc=True).min()

Timestamp('2006-08-20 02:10:46+0000', tz='UTC')

In [36]:
pd.to_datetime(df["date_added"], format="%a %b %d %H:%M:%S %z %Y", utc=True).max()

Timestamp('2017-10-29 22:26:11+0000', tz='UTC')

In [37]:
pd.to_datetime(df["date_updated"], format="%a %b %d %H:%M:%S %z %Y", utc=True).min()

Timestamp('2006-08-29 18:44:38+0000', tz='UTC')

In [38]:
pd.to_datetime(df["date_updated"], format="%a %b %d %H:%M:%S %z %Y", utc=True).max()

Timestamp('2017-10-29 22:26:12+0000', tz='UTC')

In [39]:
df.duplicated(["review_id"]).sum()

np.int64(0)

### Transformación
1. Eliminar columnas que no aportan información.
2. Corrección de posibles errores.
3. Transformar tipos de datos.
4. Incluir `nan`.

In [40]:
df.drop(columns=["started_at", "read_at"], inplace=True)

In [41]:
df["user_id"] = df["user_id"].str.replace(r"\s+", "", regex=True)
df["review_id"] = df["review_id"].str.replace(r"\s+", "", regex=True)
df["review_text_incomplete"] = df["review_text_incomplete"].str.strip()

In [42]:
rti_nan = df["review_text_incomplete"] == ""
r_nan = df["rating"] == 0

In [43]:
df["user_id"] = df["user_id"].astype("string")
df["review_id"] = df["review_id"].astype("string")
df["review_text_incomplete"] = df["review_text_incomplete"].astype("string")
df["book_id"] = df["book_id"].astype(np.uint32)
df["date_added"] = pd.to_datetime(df["date_added"], format="%a %b %d %H:%M:%S %z %Y", utc=True)
df["date_updated"] = pd.to_datetime(df["date_updated"], format="%a %b %d %H:%M:%S %z %Y", utc=True)
df["book_id"] = df["book_id"].astype(np.uint8)

In [44]:
df.loc[rti_nan, "review_text_incomplete"] = np.nan
df.loc[r_nan, "rating"] = np.nan

df.isna().sum()

user_id                       0
book_id                       0
review_id                     0
is_read                       0
rating                    55955
review_text_incomplete    91708
date_added                    0
date_updated                  0
dtype: int64

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column                  Non-Null Count   Dtype              
---  ------                  --------------   -----              
 0   user_id                 100000 non-null  string             
 1   book_id                 100000 non-null  uint8              
 2   review_id               100000 non-null  string             
 3   is_read                 100000 non-null  bool               
 4   rating                  44045 non-null   float64            
 5   review_text_incomplete  8292 non-null    string             
 6   date_added              100000 non-null  datetime64[ns, UTC]
 7   date_updated            100000 non-null  datetime64[ns, UTC]
dtypes: bool(1), datetime64[ns, UTC](2), float64(1), string(3), uint8(1)
memory usage: 4.8 MB
