# Limpieza de Datos


Datos obtenidos de https://nijianmo.github.io/amazon/index.html#subsets

## MetaData
**reviewerID** - ID of the reviewer, e.g. A2SUAM1J3GNN3B

**asin** - ID of the product, e.g. 0000013714

**reviewerName** - name of the reviewer

**vote** - helpful votes of the review

**style** - a dictionary of the product metadata, e.g., "Format" is "Hardcover"

**reviewText** - text of the review

**overall** - rating of the product

**summary** - summary of the review

**unixReviewTime** - time of the review (unix time)

**reviewTime** - time of the review (raw)

**image** - images that users post after they have received the product


In [None]:
#Script para leer la data, adaptado de https://nijianmo.github.io/amazon/index.html#subsets
import pandas as pd
import gzip
import json

def parse(path,cant=None):
  """Yield up to ``cant`` JSON records from a gzip-compressed JSON-lines file.

  Args:
    path: Location of the gzip file; every line must hold one JSON document.
    cant: Maximum number of records to yield. ``None`` (the default) reads
      the whole file; zero or a negative value yields nothing.

  Yields:
    The object decoded from each line, in file order.
  """
  # The context manager guarantees the gzip handle is closed as soon as the
  # limit is reached, the file is exhausted, or the generator is closed.
  with gzip.open(path, 'rb') as g:
    for i, l in enumerate(g):
      if cant is not None and i >= cant:
        break
      yield json.loads(l)

def getDF(path,cant):
  """Load the first ``cant`` records of a gzip JSON-lines file into a DataFrame.

  Each record produced by ``parse(path, cant)`` is keyed by its 0-based
  position; those positions become the row index of the returned frame.
  """
  records_by_position = dict(enumerate(parse(path,cant)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')



In [None]:
# Load a bounded sample of the reviews file and report its dimensions.
reviews_path = '/content/drive/MyDrive/Proyecto-Tesis/Data/Electronics_5.json.gz'
max_reviews = 100000
df = getDF(reviews_path, max_reviews)
df.shape

(100000, 12)

In [None]:
# Preview the first rows of the freshly loaded reviews.
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5.0,67,True,"09 18, 1999",AAP7PPBU72QFM,151004714,{'Format:': ' Hardcover'},D. C. Carrad,This is the best novel I have read in 2 or 3 y...,A star is born,937612800,
1,3.0,5,True,"10 23, 2013",A2E168DTVGE6SV,151004714,{'Format:': ' Kindle Edition'},Evy,"Pages and pages of introspection, in the style...",A stream of consciousness novel,1382486400,
2,5.0,4,False,"09 2, 2008",A1ER5AYS3FQ9O3,151004714,{'Format:': ' Paperback'},Kcorn,This is the kind of novel to read when you hav...,I'm a huge fan of the author and this one did ...,1220313600,
3,5.0,13,False,"09 4, 2000",A1T17LMQABMBN5,151004714,{'Format:': ' Hardcover'},Caf Girl Writes,What gorgeous language! What an incredible wri...,The most beautiful book I have ever read!,968025600,
4,3.0,8,True,"02 4, 2000",A3QHJ0FXK33OBE,151004714,{'Format:': ' Hardcover'},W. Shane Schmidt,I was taken in by reviews that compared this b...,A dissenting view--In part.,949622400,


In [None]:
# Columns that will not be used in the rest of the notebook.
ls = [
    'reviewTime',
    'style',
    'reviewerName',
    'unixReviewTime',
    'image',
    'verified',
]

In [None]:
# Remove the unused columns listed in `ls`.
df = df.drop(columns=ls)

In [None]:
# Count the missing values per column.
# The text columns have very few gaps, so those rows are simply deleted;
# for the "vote" column the gaps are filled with 0 instead.
vacios = pd.DataFrame(df.isnull().sum()).sort_values(0,ascending=True)
vacios.columns = ['vacios']
# Scale to a percentage first and round last: rounding the raw ratio before
# multiplying by 100 threw away everything below 1% and could leave
# floating-point artefacts in the displayed value.
vacios['vacios%'] = round(vacios['vacios'] / df.shape[0] * 100, 2)
vacios 

Unnamed: 0,vacios,vacios%
overall,0,0.0
reviewerID,0,0.0
asin,0,0.0
summary,8,0.0
reviewText,14,0.0
vote,82843,83.0


In [None]:
# Discard reviews that lack a summary or a review text.
# dropna works row by row, so it cannot remove extra rows that merely share
# an index label with an incomplete one, and reassigning (instead of
# inplace=True) keeps the step free of hidden in-place mutation.
df = df.dropna(subset=['summary', 'reviewText'])

In [None]:
# Fill the gaps of "vote" with 0. The fill is restricted to that column so a
# missing value in any other column stays visible in the null count below
# instead of being silently replaced by 0.
df = df.fillna({'vote': 0})

In [None]:
# Count the missing values per column again to confirm none are left.
vacios = pd.DataFrame(df.isnull().sum()).sort_values(0,ascending=True)
vacios.columns = ['vacios']
# Scale to a percentage first and round last, so that small shares are not
# rounded away before the multiplication by 100.
vacios['vacios%'] = round(vacios['vacios'] / df.shape[0] * 100, 2)
vacios 

Unnamed: 0,vacios,vacios%
overall,0,0.0
vote,0,0.0
reviewerID,0,0.0
asin,0,0.0
reviewText,0,0.0
summary,0,0.0


In [None]:
# Give every column its proper dtype.
# "vote" is cast to "string" as well for now, so that the "," thousands
# separator can be stripped from it before the numeric conversion.
aux = ['reviewerID','asin','reviewText','summary']
df = df.astype({column: 'string' for column in ['vote'] + aux})

In [None]:
# Check the column dtypes after the string casts.
df.dtypes

overall       float64
vote           string
reviewerID     string
asin           string
reviewText     string
summary        string
dtype: object

In [None]:
# Strip the "," thousands separator from the vote counts.
# regex=False makes this a plain literal replacement regardless of the
# pandas version, whose default for `regex` has changed over time.
df['vote'] = df['vote'].str.replace(',', '', regex=False)

In [None]:
# Convert the cleaned vote strings to floating-point numbers.
df = df.astype({'vote': 'float64'})

In [None]:
# Confirm that "vote" is now numeric.
df.dtypes

overall       float64
vote          float64
reviewerID     string
asin           string
reviewText     string
summary        string
dtype: object

In [None]:
# Number of reviews per rating value ("overall").
df['overall'].value_counts()

5.0    66824
4.0    17461
3.0     6888
1.0     5301
2.0     3505
Name: overall, dtype: int64

In [None]:
# List the rows whose rating is missing; an empty frame means there are none.
df[df['overall'].isnull()]


Unnamed: 0,overall,vote,reviewerID,asin,reviewText,summary


In [None]:
# Preview the cleaned frame.
df.head()

Unnamed: 0,overall,vote,reviewerID,asin,reviewText,summary
0,5.0,67.0,AAP7PPBU72QFM,151004714,This is the best novel I have read in 2 or 3 y...,A star is born
1,3.0,5.0,A2E168DTVGE6SV,151004714,"Pages and pages of introspection, in the style...",A stream of consciousness novel
2,5.0,4.0,A1ER5AYS3FQ9O3,151004714,This is the kind of novel to read when you hav...,I'm a huge fan of the author and this one did ...
3,5.0,13.0,A1T17LMQABMBN5,151004714,What gorgeous language! What an incredible wri...,The most beautiful book I have ever read!
4,3.0,8.0,A3QHJ0FXK33OBE,151004714,I was taken in by reviews that compared this b...,A dissenting view--In part.


In [None]:
# Rows and columns left after the cleaning steps.
df.shape

(99979, 6)

In [None]:
# Work on an independent copy so that `df` keeps every cleaned column.
df_copy = df.copy()

In [None]:
# Preview the copy.
df_copy.head()

Unnamed: 0,overall,vote,reviewerID,asin,reviewText,summary
0,5.0,67.0,AAP7PPBU72QFM,151004714,This is the best novel I have read in 2 or 3 y...,A star is born
1,3.0,5.0,A2E168DTVGE6SV,151004714,"Pages and pages of introspection, in the style...",A stream of consciousness novel
2,5.0,4.0,A1ER5AYS3FQ9O3,151004714,This is the kind of novel to read when you hav...,I'm a huge fan of the author and this one did ...
3,5.0,13.0,A1T17LMQABMBN5,151004714,What gorgeous language! What an incredible wri...,The most beautiful book I have ever read!
4,3.0,8.0,A3QHJ0FXK33OBE,151004714,I was taken in by reviews that compared this b...,A dissenting view--In part.


In [None]:
# These columns are not needed for the exported file, so they are removed
# here. The preprocessing applied to them earlier is kept in the notebook in
# case any of them is required later on.
# NOTE(review): the file name says 1000000, but only 100000 records were
# requested when loading - confirm the intended name.
delet_list = ['vote','reviewerID','asin','summary']
df_copy = df_copy.drop(columns=delet_list)
df_copy.to_csv('data-1000000.csv',index=False)

In [None]:
# Read the exported file back to check what was saved: the "overall" and
# "reviewText" columns.
# NOTE(review): the file was written with a relative path, so this absolute
# path presumably assumes the working directory is /content - confirm.
aux = pd.read_csv('/content/data-1000000.csv')

In [None]:
# Preview the reloaded file.
aux.head()

Unnamed: 0,overall,reviewText
0,5.0,This is the best novel I have read in 2 or 3 y...
1,3.0,"Pages and pages of introspection, in the style..."
2,5.0,This is the kind of novel to read when you hav...
3,5.0,What gorgeous language! What an incredible wri...
4,3.0,I was taken in by reviews that compared this b...


### Pre - Procesamiento

In [None]:
!pip install nltk
!pip install sklearn

In [None]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize #Libreria para la tokenizacion de las palabras
from nltk.corpus import stopwords #Librerias para los stopwords o palabras sin significado

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the token loop rebuilt the whole list for every single token, and a set
# gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def process_text(text):
    """Normalize one review and return its list of stemmed tokens.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, the result is tokenized with ``word_tokenize``,
    English stop words are dropped and the remaining tokens are reduced to
    their stem with the module-level ``stemmer``.

    Args:
        text: The raw review text (a string).

    Returns:
        A list with the stems of the remaining tokens, in their original order.
    """
    # Remove special characters: anything that is not a letter becomes a space.
    text = re.sub('[^A-Za-z]', ' ', text.lower())

    tokenized_text = word_tokenize(text)

    # Drop the stop words and reduce each remaining word to its stem.
    clean_text = [
        stemmer.stem(word) for word in tokenized_text
        if word not in _ENGLISH_STOPWORDS
    ]

    return clean_text

In [None]:
# Extract the review texts as a NumPy array and show the first one.
tex = df['reviewText'].to_numpy()
tex[0]

'This is the best novel I have read in 2 or 3 years.  It is everything that fiction should be -- beautifully written, engaging, well-plotted and structured.  It has several layers of meanings -- historical, family,  philosophical and more -- and blends them all skillfully and interestingly.  It makes the American grad student/writers\' workshop "my parents were  mean to me and then my professors were mean to me" trivia look  childish and silly by comparison, as they are.\nAnyone who says this is an  adolescent girl\'s coming of age story is trivializing it.  Ignore them.  Read this book if you love literature.\nI was particularly impressed with  this young author\'s grasp of the meaning and texture of the lost world of  French Algeria in the 1950\'s and \'60\'s...particularly poignant when read in  1999 from another ruined and abandoned French colony, amid the decaying  buildings of Phnom Penh...\nI hope the author will write many more books  and that her publishers will bring her firs

In [None]:
# Apply the preprocessing to every review and join the resulting tokens
# back into a single space-separated string per review.
texts = [" ".join(process_text(raw_review)) for raw_review in tex]

In [None]:
# Number of raw review texts.
len(tex)

99979

In [None]:
# Text before the preprocessing.
tex[0]

'This is the best novel I have read in 2 or 3 years.  It is everything that fiction should be -- beautifully written, engaging, well-plotted and structured.  It has several layers of meanings -- historical, family,  philosophical and more -- and blends them all skillfully and interestingly.  It makes the American grad student/writers\' workshop "my parents were  mean to me and then my professors were mean to me" trivia look  childish and silly by comparison, as they are.\nAnyone who says this is an  adolescent girl\'s coming of age story is trivializing it.  Ignore them.  Read this book if you love literature.\nI was particularly impressed with  this young author\'s grasp of the meaning and texture of the lost world of  French Algeria in the 1950\'s and \'60\'s...particularly poignant when read in  1999 from another ruined and abandoned French colony, amid the decaying  buildings of Phnom Penh...\nI hope the author will write many more books  and that her publishers will bring her firs

In [None]:
# Text after the preprocessing.
texts[0]

'best novel read year everyth fiction beauti written engag well plot structur sever layer mean histor famili philosoph blend skill interestingli make american grad student writer workshop parent mean professor mean trivia look childish silli comparison anyon say adolesc girl come age stori trivial ignor read book love literatur particularli impress young author grasp mean textur lost world french algeria particularli poignant read anoth ruin abandon french coloni amid decay build phnom penh hope author write mani book publish bring first novel back print want read thank ms messud write wonder work'

In [None]:
# Number of processed texts; it should match len(tex).
len(texts)

99979

In [None]:
# Build the bag-of-words matrix: one row per review, one column per term.
# max_features keeps only the 1000 terms with the highest frequency across
# the whole corpus.
# toarray() turns the sparse result into a dense NumPy array
# (number of reviews x 1000), which is what the DataFrame built next consumes.
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features = 1000)
vectors = matrix.fit_transform(texts)
count_array = vectors.toarray()

In [None]:
# Wrap the counts in a DataFrame whose columns are the vocabulary terms.
df_vector = pd.DataFrame(data=count_array,columns = matrix.get_feature_names_out())

In [None]:
# Rows (reviews) and columns (terms) of the count matrix.
df_vector.shape

(99979, 1000)

In [None]:
# Preview the term counts of the first reviews.
df_vector.head()

Unnamed: 0,aa,abil,abl,absolut,accept,access,accessori,accur,across,actual,...,would,write,wrong,xp,ye,year,yet,zip,zipper,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [None]:
# Ratings as a plain array, in the same row order as the processed texts.
df['overall'].values

array([5., 3., 5., ..., 5., 5., 5.])

In [None]:
# Attach the ratings as the target column. A raw array is assigned on
# purpose: rows were removed from `df`, so its index labels no longer line up
# with the default 0..n-1 index of `df_vector`, and assigning the Series
# itself would align on those labels instead of on position.
df_vector['rating'] = df['overall'].to_numpy()

In [None]:
# One more column than before: the rating.
df_vector.shape

(99979, 1001)

In [None]:
# List the rows without a rating; an empty frame means there are none.
df_vector[df_vector['rating'].isnull()]

Unnamed: 0,aa,abil,abl,absolut,accept,access,accessori,accur,across,actual,...,write,wrong,xp,ye,year,yet,zip,zipper,zoom,rating


In [None]:
# Save the clean data (term counts plus rating) without the index column.
df_vector.to_csv('data-limmpia.csv',index=False)

In [None]:
# Read the saved file back to verify its contents.
# NOTE(review): the file was written with a relative path, so this absolute
# path presumably assumes the working directory is /content - confirm.
aux = pd.read_csv('/content/data-limmpia.csv')

In [None]:
# Preview the reloaded clean data.
aux.head()

Unnamed: 0,aa,abil,abl,absolut,accept,access,accessori,accur,across,actual,...,write,wrong,xp,ye,year,yet,zip,zipper,zoom,rating
0,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,1,0,0,0,0,5.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,5.0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,3.0
