In [33]:
from core.goodreads import GoodReadsData
import gc

goodreads = GoodReadsData()
filename = goodreads.file_names[6]
filename

'goodreads_reviews_spoiler'

In [34]:
gc.collect()

271

# Análisis de `goodreads_reviews_spoiler`

In [35]:
import os
import pandas as pd
import numpy as np

In [36]:
# Download
if not os.path.exists(goodreads.get_file_path(filename)):
    goodreads.download_file(filename)
    
goodreads.count_samples(filename)

1378033

In [37]:
# Load
df = goodreads.load_file_range(filename, (0, 99999))

### Análisis
1. Comprensión de datos.
2. Detección de errores.
3. Detección de duplicados.

In [38]:
df.sample(5)

Unnamed: 0,user_id,timestamp,review_sentences,rating,has_spoiler,book_id,review_id
91090,0d81d76c4b6a2c7884d2392bc50c2b4f,2014-09-12,"[[0, Damn.], [0, The more I read, the slower m...",5,False,15717721,9194e648d4dd5d541a09b3e3ca6a739e
65535,5f8ca8da98445f41c625874e04115773,2015-01-02,"[[0, I'm completely happy with the end result ...",4,False,18052853,a34e8c2932788cdd1765fb8a4cbc510c
46091,92dc66385168612b674ba80eee2ba4e9,2014-10-08,"[[0, I'm one of those annoying 'read the book ...",4,False,8442457,44d0c40bf195982cff8be6409eed7f9f
55501,dae8c9afc16f5a492006edde9c3e40f9,2017-07-28,"[[0, While reading this, I couldn't help but w...",4,False,28862254,88abf81b0ba5a77e831e9604e8e17dbd
44755,cf201e1a8725af905c101c66d8e5c326,2016-08-31,"[[0, Pacing: The pacing of Fahrenheit 451 is a...",4,True,16280156,00a674a96b0a87b20ead65028a21ee8b


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           100000 non-null  object
 1   timestamp         100000 non-null  object
 2   review_sentences  100000 non-null  object
 3   rating            100000 non-null  int64 
 4   has_spoiler       100000 non-null  bool  
 5   book_id           100000 non-null  object
 6   review_id         100000 non-null  object
dtypes: bool(1), int64(1), object(5)
memory usage: 4.7+ MB


In [40]:
(df == "").sum()

user_id             0
timestamp           0
review_sentences    0
rating              0
has_spoiler         0
book_id             0
review_id           0
dtype: int64

In [41]:
(df["review_sentences"].apply(lambda x: len(x)==0)).sum()

np.int64(0)

In [42]:
df["has_spoiler"].iloc[20:25]

20     True
21    False
22     True
23     True
24    False
Name: has_spoiler, dtype: bool

In [43]:
df["review_sentences"].iloc[20]

[[0, "A fun fast paced book that sucks you in right away and doesn't let go."],
 [0,
  "The remnants of humanity live in a 150 story silo below the earth and can't leave as the outside is now toxic."],
 [0,
  'This is a bit of an odd thing to have happened, but it gives us a nice palette to think about things.'],
 [0, 'Everything is rationed, including how many children you can have.'],
 [0,
  'People self-identify by their profession and where in the silo they live: farmers and mechanics in the lowers, IT in the mids, and professional class in the uppers.'],
 [0,
  'The working class wear color coded uniforms to easily distinguish them.'],
 [0, 'The book was a lot about control.'],
 [0,
  'How to control a contained civilization, and give them hope (eg a live feed of the outside) and yet keep them in harmony and doing their functions.'],
 [0,
  'Strange mechanisms of control were built all over the silo: communication was oddly limited given the IT resources they have - many messages 

In [44]:
df["review_sentences"].iloc[25]

[[0,
  'A hilarious and book from Tina Fey - definitely worth listening to over audio as she reads it herself.'],
 [0,
  'She makes fun of herself a lot, adds lots of light humor, and of course shares TMI.'],
 [0,
  'The most interesting bit was hearing the behind the scenes of Saturday Night Live, and how she was a joke writer there for 3 years before they gave her a role on the show, which was only doing the news.'],
 [0,
  'Her breakout occurred after she had already left for 30 Rock, when she did the Sarah Palin impersonation.'],
 [0,
  'Tina is clearly an example of someone who has worked long and hard to get where she is.']]

`review_sentences` indica si la frase contiene o no spoilers.

In [45]:
df["rating"].describe()

count    100000.000000
mean          3.725500
std           1.234085
min           0.000000
25%           3.000000
50%           4.000000
75%           5.000000
max           5.000000
Name: rating, dtype: float64

In [46]:
(df["rating"]==0).sum()

np.int64(3077)

Los valores de `rating` = 0 son valores erróneos y se indicarán con `nan`.

In [47]:
pd.to_datetime(df["timestamp"]).min()

Timestamp('2006-12-07 00:00:00')

In [48]:
pd.to_datetime(df["timestamp"]).max()

Timestamp('2017-11-03 00:00:00')

In [49]:
df["review_id"].duplicated().sum()

np.int64(0)

### Transformación
1. Correción de errores.
2. Transformar tipos de datos.
3. Asignación de valores `nan`.

In [50]:
df["user_id"] = df["user_id"].str.replace(r"\s+", "", regex=True)
df["book_id"] = df["book_id"].str.replace(r"\s+", "", regex=True)
df["review_id"] = df["review_id"].str.replace(r"\s+", "", regex=True)
df["timestamp"] = df["timestamp"].str.strip()

In [51]:
r_nan = df["rating"] == 0

In [52]:
df["user_id"] = df["user_id"].astype("string")
df["review_id"] = df["review_id"].astype("string")
df["book_id"] = df["book_id"].astype("string")
df["rating"] = df["rating"].astype(np.uint8)
df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y-%m-%d")

In [54]:
df.loc[r_nan, "rating"] = np.nan
df.isna().sum()

user_id                0
timestamp              0
review_sentences       0
rating              3077
has_spoiler            0
book_id                0
review_id              0
dtype: int64

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   user_id           100000 non-null  string        
 1   timestamp         100000 non-null  datetime64[ns]
 2   review_sentences  100000 non-null  object        
 3   rating            96923 non-null   float64       
 4   has_spoiler       100000 non-null  bool          
 5   book_id           100000 non-null  string        
 6   review_id         100000 non-null  string        
dtypes: bool(1), datetime64[ns](1), float64(1), object(1), string(3)
memory usage: 4.7+ MB
