#Limpieza de Datos


Datos obtenidos de https://nijianmo.github.io/amazon/index.html#subsets

##MetaData
**reviewerID** - ID of the reviewer, e.g. A2SUAM1J3GNN3B

**asin** - ID of the product, e.g. 0000013714

**reviewerName** - name of the reviewer

**vote** - helpful votes of the review

**style** - a disctionary of the product metadata, e.g., "Format" is "Hardcover"

**reviewText** - text of the review

**overall** - rating of the product

**summary** - summary of the review

**unixReviewTime** - time of the review (unix time)

**reviewTime** - time of the review (raw)

**image** - images that users post after they have received the product


In [4]:
#Scrip para leer la data, adaptado de https://nijianmo.github.io/amazon/index.html#subsets
import pandas as pd
import gzip
import json

def parse(path,cant):
  i=0
  g = gzip.open(path, 'rb')
  for l in g:
    if i>=cant: break
    yield json.loads(l)
    i+=1

def getDF(path,cant):
  i = 0
  df = {}
  for d in parse(path,cant):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')



In [6]:
df = getDF('data/Books_5.json-008.gz',100000)
df.shape

(100000, 12)

In [7]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,False,"03 30, 2005",A1REUF3A1YCPHM,1713353,{'Format:': ' Hardcover'},TW Ervin II,"The King, the Mice and the Cheese by Nancy Gur...",A story children will love and learn from,1112140800,,
1,5.0,True,"06 20, 2016",AVP0HXC9FG790,1713353,,Amazon Customer,The kids loved it!,Five Stars,1466380800,,
2,5.0,True,"01 24, 2016",A324TTUBKTN73A,1713353,{'Format:': ' Paperback'},Tekla Borner,My students (3 & 4 year olds) loved this book!...,Five Stars,1453593600,,
3,5.0,False,"07 9, 2015",A2RE7WG349NV5D,1713353,{'Format:': ' Paperback'},Deborah K Woroniecki,LOVE IT,Five Stars,1436400000,,
4,5.0,True,"01 18, 2015",A32B7QIUDQCD0E,1713353,,E,Great!,Five Stars,1421539200,,


In [8]:
#Eliminamos las columnas que no se van a usar
ls = ['reviewTime','style','reviewerName','unixReviewTime','image','verified']

In [9]:
df = df.drop(ls, axis=1)

In [10]:
#Contabilizamos vacíos
#Como la cantidad de vacios es minima, optamos por borrarlos.
#En el caso de la columan "vote", llenamos con 0
vacios = pd.DataFrame(df.isnull().sum()).sort_values(0,ascending=True)
vacios.columns = ['vacios']
vacios['vacios%'] = round(vacios['vacios']/df.shape[0], 2)*100
vacios 

Unnamed: 0,vacios,vacios%
overall,0,0.0
reviewerID,0,0.0
asin,0,0.0
summary,5,0.0
reviewText,15,0.0
vote,81526,82.0


In [11]:
df.drop(df[df['summary'].isnull() | df['reviewText'].isnull()].index,inplace=True)

In [12]:
df=df.fillna(0)

In [13]:
#Contabilizamos vacíos
vacios = pd.DataFrame(df.isnull().sum()).sort_values(0,ascending=True)
vacios.columns = ['vacios']
vacios['vacios%'] = round(vacios['vacios']/df.shape[0], 2)*100
vacios 

Unnamed: 0,vacios,vacios%
overall,0,0.0
reviewerID,0,0.0
asin,0,0.0
reviewText,0,0.0
summary,0,0.0
vote,0,0.0


In [14]:
# Asignamos el tipo de dato correspondiente
# En el caso de "vote" le asignamos primero un tipo "string" para eliminar primero "," como seprador de miles.
aux = ['reviewerID','asin','reviewText','summary']
df['vote'] = df['vote'].astype('string')
df[aux] = df[aux].astype('string')

In [15]:
df.dtypes

overall              float64
reviewerID    string[python]
asin          string[python]
reviewText    string[python]
summary       string[python]
vote          string[python]
dtype: object

In [16]:
df['vote'] = df['vote'].str.replace(',','')

In [17]:
df['vote'] = df['vote'].astype('float64')

In [18]:
df.dtypes

overall              float64
reviewerID    string[python]
asin          string[python]
reviewText    string[python]
summary       string[python]
vote                 float64
dtype: object

In [19]:
df['overall'].value_counts()

overall
5.0    64209
4.0    19368
3.0     8765
2.0     4154
1.0     3484
Name: count, dtype: int64

In [20]:
df[df['overall'].isnull()]


Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote


In [21]:
df.head()

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote
0,5.0,A1REUF3A1YCPHM,1713353,"The King, the Mice and the Cheese by Nancy Gur...",A story children will love and learn from,0.0
1,5.0,AVP0HXC9FG790,1713353,The kids loved it!,Five Stars,0.0
2,5.0,A324TTUBKTN73A,1713353,My students (3 & 4 year olds) loved this book!...,Five Stars,0.0
3,5.0,A2RE7WG349NV5D,1713353,LOVE IT,Five Stars,0.0
4,5.0,A32B7QIUDQCD0E,1713353,Great!,Five Stars,0.0


In [22]:
df.shape

(99980, 6)

In [23]:
df_copy = df.copy()

In [24]:
df_copy.head()

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,vote
0,5.0,A1REUF3A1YCPHM,1713353,"The King, the Mice and the Cheese by Nancy Gur...",A story children will love and learn from,0.0
1,5.0,AVP0HXC9FG790,1713353,The kids loved it!,Five Stars,0.0
2,5.0,A324TTUBKTN73A,1713353,My students (3 & 4 year olds) loved this book!...,Five Stars,0.0
3,5.0,A2RE7WG349NV5D,1713353,LOVE IT,Five Stars,0.0
4,5.0,A32B7QIUDQCD0E,1713353,Great!,Five Stars,0.0


In [25]:
# En este caso decidimos eliminar las siguientes columnas, ya que no seran necesarias. Sin embargo se deja el preprocesamiento anterior por
# si se requiriese alguno de las columnas.
delet_list = ['vote','reviewerID','asin','summary']
df_copy = df_copy.drop(delet_list ,axis=1)
df_copy.to_csv('data-1000000.csv',index=False)

In [27]:
#Guardamos los datos tratados, con las columans "overall" y "revierText"
aux = pd.read_csv('data-1000000.csv')

In [28]:
aux.head()

Unnamed: 0,overall,reviewText
0,5.0,"The King, the Mice and the Cheese by Nancy Gur..."
1,5.0,The kids loved it!
2,5.0,My students (3 & 4 year olds) loved this book!...
3,5.0,LOVE IT
4,5.0,Great!


### Pre - Procesamiento

In [None]:
!pip install nltk
!pip install sklearn

In [29]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize #Libreria para la tokenizacion de las palabras
from nltk.corpus import stopwords #Librerias para los stopwords o palabras sin significado

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def process_text(text):
    # eliminamos caracteres especiales
    text = re.sub('[^A-Za-z]', ' ', text.lower())

    tokenized_text = word_tokenize(text)

    #Removemos los stopwords y reducimos las palabras a su raiz
    clean_text = [
        stemmer.stem(word) for word in tokenized_text
        if word not in stopwords.words('english')
    ]

    return clean_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kalli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
tex = df['reviewText'].to_numpy()
tex[0]

"The King, the Mice and the Cheese by Nancy Gurney is an excellent children's book.  It is one that I well remember from my own childhood and purchased for my daughter who loves it.\n\nIt is about a king who has trouble with rude mice eating his cheese. He consults his wise men and they suggest cats to chase away the mice. The cats become a nuisance, so the wise men recommend the king bring in dogs to chase the cats away.  The cycle goes on until the mice are finally brought back to chase away the elephants, brought in to chase away the lions that'd chased away the dogs.\n\nThe story ends in compromise and friendship between the mice and the king.  The story also teaches cause and effect relationships.\n\nThe pictures that accompany the story are humorous and memorable.  I was thrilled to discover that it is back in print.  I *highly* recommend it for children ages 2 to 7."

In [31]:
# Se aplica el pre-procesamiento a cada comentario
texts = tex
texts = [" ".join(process_text(text)) for text in texts]

In [35]:
len(tex)

99980

In [36]:
#Texto antes del procesamiento
tex[0]

"The King, the Mice and the Cheese by Nancy Gurney is an excellent children's book.  It is one that I well remember from my own childhood and purchased for my daughter who loves it.\n\nIt is about a king who has trouble with rude mice eating his cheese. He consults his wise men and they suggest cats to chase away the mice. The cats become a nuisance, so the wise men recommend the king bring in dogs to chase the cats away.  The cycle goes on until the mice are finally brought back to chase away the elephants, brought in to chase away the lions that'd chased away the dogs.\n\nThe story ends in compromise and friendship between the mice and the king.  The story also teaches cause and effect relationships.\n\nThe pictures that accompany the story are humorous and memorable.  I was thrilled to discover that it is back in print.  I *highly* recommend it for children ages 2 to 7."

In [37]:
#Texto despues del procesamiento
texts[0]

'king mice chees nanci gurney excel children book one well rememb childhood purchas daughter love king troubl rude mice eat chees consult wise men suggest cat chase away mice cat becom nuisanc wise men recommend king bring dog chase cat away cycl goe mice final brought back chase away eleph brought chase away lion chase away dog stori end compromis friendship mice king stori also teach caus effect relationship pictur accompani stori humor memor thrill discov back print highli recommend children age'

In [38]:
len(texts)

99980

In [41]:
#Definimos la matriz de vectorizacion de cada comentario, con las 1000 palabras mas relevantes
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features = 1000)
vectors = matrix.fit_transform(texts)
count_array = vectors.toarray()

In [42]:
df_vector = pd.DataFrame(data=count_array,columns = matrix.get_feature_names_out())

In [43]:
df_vector.shape

(99980, 1000)

In [44]:
df_vector.head()

Unnamed: 0,abil,abl,absolut,accept,account,accur,across,act,action,actual,...,write,writer,written,wrong,wrote,ye,year,yet,young,younger
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
df['overall'].values

array([5., 5., 5., ..., 2., 5., 4.])

In [46]:
df_vector['rating'] = df['overall'].values

In [47]:
df_vector.shape

(99980, 1001)

In [48]:
df_vector[df_vector['rating'].isnull()]

Unnamed: 0,abil,abl,absolut,accept,account,accur,across,act,action,actual,...,writer,written,wrong,wrote,ye,year,yet,young,younger,rating


In [49]:
#Guardamos la data limpia
df_vector.to_csv('data-limmpia.csv',index=False)

In [50]:
df_clean_coment = pd.DataFrame(texts, columns=['reviewText'])

In [53]:
df_clean_coment['rating'] = df['overall'].values
df_clean_coment.head()

Unnamed: 0,reviewText,rating
0,king mice chees nanci gurney excel children bo...,5.0
1,kid love,5.0
2,student year old love book definit recommend t...,5.0
3,love,5.0
4,great,5.0


In [55]:
df_clean_coment.to_csv('data_limpia_NV.csv',index=False)

In [None]:
aux = pd.read_csv('/content/data-limmpia.csv')

In [None]:
aux.head()

Unnamed: 0,aa,abil,abl,absolut,accept,access,accessori,accur,across,actual,...,write,wrong,xp,ye,year,yet,zip,zipper,zoom,rating
0,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,1,0,0,0,0,5.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,5.0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,3.0
