# Extract Transform Load (ETL)

In [1]:
import pandas as pd
import numpy as np
import json
from textblob import TextBlob


### Analizando el archivo australian_users_reviews.json

In [2]:
# Se define el nombre del archivo que contiene las reseñas de usuarios australianos en formato JSON.
reviews = r'C:\Users\Coder\Documents\PI_ML_OPS\Datos\australian_user_reviews.json'

# Se abre el archivo en modo de lectura ('r') con codificación UTF-8.
with open(reviews, 'r', encoding='utf-8') as df:
    # Se leen todas las líneas del archivo y se almacenan en la variable dataf.
    dataf = df.readlines()

# Se crea una lista llamada 'data' mediante la evaluación de cada línea del archivo usando la función eval().
# Esto convierte las líneas del archivo JSON en objetos de Python.
data = [eval(line.strip()) for line in dataf]

# Se crea un DataFrame de pandas llamado 'dfreviews' usando la lista de objetos 'data'.
dfreviews= pd.DataFrame(data)

In [3]:
dfreviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


In [4]:
dfreviews.sample(10)

Unnamed: 0,user_id,user_url,reviews
2100,senpai_the_slutty,http://steamcommunity.com/id/senpai_the_slutty,"[{'funny': '', 'posted': 'Posted May 2, 2015.'..."
12577,Packmanman,http://steamcommunity.com/id/Packmanman,"[{'funny': '', 'posted': 'Posted June 15, 2014..."
6158,moon910,http://steamcommunity.com/id/moon910,"[{'funny': '', 'posted': 'Posted October 20, 2..."
23705,76561198085057414,http://steamcommunity.com/profiles/76561198085...,"[{'funny': '', 'posted': 'Posted November 13, ..."
21940,76561198068547407,http://steamcommunity.com/profiles/76561198068...,"[{'funny': '', 'posted': 'Posted March 10.', '..."
23846,76561198086567101,http://steamcommunity.com/profiles/76561198086...,"[{'funny': '', 'posted': 'Posted December 26, ..."
14926,norsktf,http://steamcommunity.com/id/norsktf,"[{'funny': '', 'posted': 'Posted May 9, 2015.'..."
19674,76561198042219417,http://steamcommunity.com/profiles/76561198042...,"[{'funny': '', 'posted': 'Posted June 14, 2014..."
4513,qwevor,http://steamcommunity.com/id/qwevor,"[{'funny': '', 'posted': 'Posted December 26, ..."
23552,76561198083517568,http://steamcommunity.com/profiles/76561198083...,"[{'funny': '1 person found this review funny',..."


In [5]:
# Observamos con detalle el contenido de la columna "reviews"
dfreviews['reviews'][0]

[{'funny': '',
  'posted': 'Posted November 5, 2011.',
  'last_edited': '',
  'item_id': '1250',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': 'Simple yet with great replayability. In my opinion does "zombie" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth "zombie" splattering fun for the whole family. Amazed this sort of FPS is so rare.'},
 {'funny': '',
  'posted': 'Posted July 15, 2011.',
  'last_edited': '',
  'item_id': '22200',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': "It's unique and worth a playthrough."},
 {'funny': '',
  'posted': 'Posted April 21, 2011.',
  'last_edited': '',
  'item_id': '43110',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': 'Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!'}]

In [6]:

# Extraemos los datos de la columna reviews. 
dfreviewsDicc = dfreviews.explode('reviews') 
#Concatenamos los dos dataframes 
dfreviews = pd.concat([dfreviewsDicc.drop(['reviews'], axis=1), dfreviewsDicc['reviews'].apply(pd.Series)], axis=1)



In [7]:
dfreviews.head(1)



Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,


In [8]:
# Eliminamos la columna 0
dfreviews.drop(dfreviews.columns.values[9], axis=1, inplace=True)
dfreviews.head(1)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...


In [9]:
dfreviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59333 entries, 0 to 25798
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      59333 non-null  object
 1   user_url     59333 non-null  object
 2   funny        59305 non-null  object
 3   posted       59305 non-null  object
 4   last_edited  59305 non-null  object
 5   item_id      59305 non-null  object
 6   helpful      59305 non-null  object
 7   recommend    59305 non-null  object
 8   review       59305 non-null  object
dtypes: object(9)
memory usage: 4.5+ MB


In [10]:
# Reemplazamos los valores faltantes con cadenas vacías y verificamos los valores que estén duplicados
dfreviews.fillna('',inplace= True)
dfreviews[dfreviews['user_url'].duplicated(keep=False)].sort_values(by='user_id')

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
25239,--ace--,http://steamcommunity.com/id/--ace--,,"Posted May 30, 2014.",,113200,0 of 1 people (0%) found this review helpful,True,One Of The Funnyest Games That Is Animated :) ...
25239,--ace--,http://steamcommunity.com/id/--ace--,,"Posted January 24, 2014.",,440,2 of 3 people (67%) found this review helpful,True,the best game i ever plllayed
12954,--ionex--,http://steamcommunity.com/id/--ionex--,,Posted March 2.,,730,No ratings yet,True,"it done brokeded on me, the game no longer wor..."
12954,--ionex--,http://steamcommunity.com/id/--ionex--,,"Posted August 15, 2015.",,105600,No ratings yet,True,"It's an amazing game, and im glad that mac use..."
18659,-2SV-vuLB-Kg,http://steamcommunity.com/id/-2SV-vuLB-Kg,2 people found this review funny,"Posted January 12, 2015.",,277950,18 of 21 people (86%) found this review helpful,True,This is a masterpiece! In this MOVA crossed RP...
...,...,...,...,...,...,...,...,...,...
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,,"Posted February 1, 2015.",,730,2 of 5 people (40%) found this review helpful,True,"After playing 500 hours on this fantastic FPS,..."
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,,"Posted December 23, 2013.","Last edited September 25, 2014.",72850,1 of 2 people (50%) found this review helpful,True,It's been a long way since Elder Scrolls start...
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,,"Posted March 26, 2015.",,230410,1 of 1 people (100%) found this review helpful,True,Fantastic game! Lots of gamemodes and large va...
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,,"Posted December 23, 2013.",,620,4 of 6 people (67%) found this review helpful,True,Fantastic Game! It allows one to think really ...


In [11]:
#Eliminamos las columnas que no poseen información relevante:
dfreviews = dfreviews.drop(['user_url','funny','last_edited','helpful'],axis=1)
dfreviews

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...
0,76561197970982479,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.
0,76561197970982479,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...
1,js41637,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...
1,js41637,"Posted September 8, 2013.",227300,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...
25797,76561198312638244,Posted July 10.,70,True,a must have classic from steam definitely wort...
25797,76561198312638244,Posted July 8.,362890,True,this game is a perfect remake of the original ...
25798,LydiaMorley,Posted July 3.,273110,True,had so much fun plaing this and collecting res...
25798,LydiaMorley,Posted July 20.,730,True,:D


In [12]:
# Eiminamos los duplicados creando una serie booleana:
# True para cada elemento que es un duplicado, y False para el primero de cada conjunto de duplicados.
duplicados = dfreviews.index.duplicated(keep='first') 
dfreviews = dfreviews[~duplicados] 
dfreviews

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...
1,js41637,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...
2,evcentric,Posted February 3.,248820,True,A suitably punishing roguelike platformer. Wi...
3,doctr,"Posted October 14, 2013.",250320,True,This game... is so fun. The fight sequences ha...
4,maplemage,"Posted April 15, 2014.",211420,True,Git gud
...,...,...,...,...,...
25794,76561198306599751,Posted May 31.,261030,True,I cried in the end its so sadding ]'; I wish l...
25795,Ghoustik,Posted June 17.,730,True,Gra naprawdę fajna.Ale jest kilka rzeczy do kt...
25796,76561198310819422,Posted June 23.,570,True,Well Done
25797,76561198312638244,Posted July 21.,233270,True,this is a very fun and nice 80s themed shooter...


In [13]:
dfreviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25799 entries, 0 to 25798
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    25799 non-null  object
 1   posted     25799 non-null  object
 2   item_id    25799 non-null  object
 3   recommend  25799 non-null  object
 4   review     25799 non-null  object
dtypes: object(5)
memory usage: 1.2+ MB


#### Modificando el formato de fecha en la columna' posted'

In [14]:
#Observamos que cada registro poseee la palabra posted. Procedemos a eliminarla en cada fila:
dfreviews['posted']= dfreviews['posted'].str.replace('Posted ', '')
#Eliminamos los puntos, reemplazamos las comas con espacio y los espacios por un guiones. 
dfreviews['posted']= dfreviews['posted'].str.replace('.', '')
dfreviews['posted']= dfreviews['posted'].str.replace(', ', '-')
dfreviews['posted']= dfreviews['posted'].str.replace(' ', '-')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfreviews['posted']= dfreviews['posted'].str.replace('Posted ', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfreviews['posted']= dfreviews['posted'].str.replace('.', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfreviews['posted']= dfreviews['posted'].str.replace(', ', '-')
A value is 

In [15]:
dfreviews['posted']

0        November-5-2011
1           June-24-2014
2             February-3
3        October-14-2013
4          April-15-2014
              ...       
25794             May-31
25795            June-17
25796            June-23
25797            July-21
25798             July-3
Name: posted, Length: 25799, dtype: object

In [16]:
# Cambiamos el formato de la columna posted a un formato de fecha

dfreviews['posted'] = pd.to_datetime(dfreviews['posted'], format= '%B-%d-%Y', errors='coerce')
dfreviews['posted']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfreviews['posted'] = pd.to_datetime(dfreviews['posted'], format= '%B-%d-%Y', errors='coerce')


0       2011-11-05
1       2014-06-24
2              NaT
3       2013-10-14
4       2014-04-15
           ...    
25794          NaT
25795          NaT
25796          NaT
25797          NaT
25798          NaT
Name: posted, Length: 25799, dtype: datetime64[ns]

In [17]:
# Modificamos la fecha para que quede sólo el año

dfreviews['posted'] = dfreviews['posted'].dt.year
dfreviews

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfreviews['posted'] = dfreviews['posted'].dt.year


Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,2011.0,1250,True,Simple yet with great replayability. In my opi...
1,js41637,2014.0,251610,True,I know what you think when you see this title ...
2,evcentric,,248820,True,A suitably punishing roguelike platformer. Wi...
3,doctr,2013.0,250320,True,This game... is so fun. The fight sequences ha...
4,maplemage,2014.0,211420,True,Git gud
...,...,...,...,...,...
25794,76561198306599751,,261030,True,I cried in the end its so sadding ]'; I wish l...
25795,Ghoustik,,730,True,Gra naprawdę fajna.Ale jest kilka rzeczy do kt...
25796,76561198310819422,,570,True,Well Done
25797,76561198312638244,,233270,True,this is a very fun and nice 80s themed shooter...


Eliminamos los valores nulos

In [18]:
dfreviews = dfreviews.dropna()
dfreviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21069 entries, 0 to 25780
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user_id    21069 non-null  object 
 1   posted     21069 non-null  float64
 2   item_id    21069 non-null  object 
 3   recommend  21069 non-null  object 
 4   review     21069 non-null  object 
dtypes: float64(1), object(4)
memory usage: 987.6+ KB


Convertimos el tipo de dato de la columna 'recommend' de "objec" a "bool" 

In [19]:

dfreviews['recommend'] = dfreviews['recommend'].astype(bool)
# Verificarmos el cambio
print(dfreviews.dtypes) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfreviews['recommend'] = dfreviews['recommend'].astype(bool)


user_id       object
posted       float64
item_id       object
recommend       bool
review        object
dtype: object


Convertimos el tipo de dato de la columna 'user_id' de "objec" a "str" 

In [20]:
dfreviews['user_id'] = dfreviews['user_id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfreviews['user_id'] = dfreviews['user_id'].astype(str)


#### Implementación del anális de sentimiento para la columna 'review'

La columna reviews posee comentarios que realizaron los usuarios dando su valoración sobre el juego. Ésta es una iformación muy importante para uno de nuestros endpoint. Para poder usar ésta información tendremos que transformar los comentarios a un tipo de dato numérico que nos permita asignarle un valor, por ejemplo, si el juego es considerado malo le asignaremos un '0', si es regular o neutral un '1' y si fue del agrado del jugador le asignaremos el número '2'.

In [30]:

def analyze_sentiment(review):
    # Verificarmos si el registro es nulo o vacío
    if not review or pd.isnull(review):
        return 1  # Se asigna el valor 1 para valores nulos o vacíos
    
    # Creamos un objeto TextBlob con la revisión
    blob = TextBlob(review)
    
    # Se establece los valores o polaridad del sentimiento (rango de -1 a 1)
    sentiment_polarity = blob.sentiment.polarity
    
    # Asignar los valores numéricos según la escala
    if sentiment_polarity < -0.1:
        return 0  # Para la calificación mala o reseña negativa
    elif sentiment_polarity <= 0.1:
        return 1  # Para la calificación regular o neutral
    else:
        return 2  # Para la calificación buena
# Asignar directamente los valores numéricos a la columna 'sentiment_analysis'
dfreviews['sentiment_analysis'] = dfreviews['review'].apply(analyze_sentiment)

# Verificamos el cambio
print(dfreviews[['review', 'sentiment_analysis']])

                                                  review  sentiment_analysis
0      Simple yet with great replayability. In my opi...                   2
1      I know what you think when you see this title ...                   2
3      This game... is so fun. The fight sequences ha...                   2
4                                                Git gud                   1
5                               This game is Marvellous.                   0
...                                                  ...                 ...
25764                                     its FUNNNNNNNN                   1
25765  Awesome fantasy game if you don't mind the gra...                   2
25769                                   Prettyy Mad Game                   0
25771                                 AMAZING GAME 10/10                   2
25780  Why I voted yes? 1. Girl characters have boobs...                   2

[21069 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfreviews['sentiment_analysis'] = dfreviews['review'].apply(analyze_sentiment)


In [31]:
dfreviews

Unnamed: 0,user_id,posted,item_id,recommend,review,sentiment_analysis
0,76561197970982479,2011.0,1250,True,Simple yet with great replayability. In my opi...,2
1,js41637,2014.0,251610,True,I know what you think when you see this title ...,2
3,doctr,2013.0,250320,True,This game... is so fun. The fight sequences ha...,2
4,maplemage,2014.0,211420,True,Git gud,1
5,Wackky,2014.0,249130,True,This game is Marvellous.,0
...,...,...,...,...,...,...
25764,wayfeng,2015.0,730,True,its FUNNNNNNNN,1
25765,76561198251004808,2015.0,253980,True,Awesome fantasy game if you don't mind the gra...,2
25769,72947282842,2015.0,730,True,Prettyy Mad Game,0
25771,ApxLGhost,2015.0,730,True,AMAZING GAME 10/10,2


In [32]:
dfreviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21069 entries, 0 to 25780
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             21069 non-null  object 
 1   posted              21069 non-null  float64
 2   item_id             21069 non-null  object 
 3   recommend           21069 non-null  bool   
 4   review              21069 non-null  object 
 5   sentiment_analysis  21069 non-null  int64  
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 1008.2+ KB


Guardamos el dataframe en un archivo CSV

In [33]:
dfreviews.to_csv(r'C:\Users\Coder\Documents\PI_ML_OPS\Datos\reviews1.csv', index=False, errors='replace', encoding='utf-8')

Creamos un dataframe con el archivo reviews1.csv:

In [34]:
dfrevparquet = pd.read_csv(r'C:\Users\Coder\Documents\PI_ML_OPS\Datos\reviews1.csv')


Guardamos el dataframe 'dfrevparquet' en un archivo .parquet

In [35]:
dfrevparquet.to_parquet(r'C:\Users\Coder\Documents\PI_ML_OPS\Datos\reviews2.parquet')

In [36]:
del dfrevparquet

In [37]:
dfreviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21069 entries, 0 to 25780
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             21069 non-null  object 
 1   posted              21069 non-null  float64
 2   item_id             21069 non-null  object 
 3   recommend           21069 non-null  bool   
 4   review              21069 non-null  object 
 5   sentiment_analysis  21069 non-null  int64  
dtypes: bool(1), float64(1), int64(1), object(3)
memory usage: 1008.2+ KB
