In [1]:
# Mount Google Drive inside the Colab runtime so the dataset files can be read from /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **<font color="#ff5653">Extract, transform and load</font>**

Archivos de Steam:
- steam_games.json.gz
- users_items.json.gz
- user_reviews.json.gz

Se llevará a cabo la apertura de los archivos, un EDA preliminar y el ETL, hasta obtener de cada archivo un DataFrame limpio para ser procesado posteriormente.

In [2]:
# Libreria
import pandas as pd
import gzip                            # Archivos_comprimidos.gz
import json                            # Archivos.json
from textblob import TextBlob          # Analisis de sentimiento
import csv                             # Archivos.csv
import ast                             # Gramatica de sintaxis abstracta - Abstract Syntax

## **<font color="#ff5653">steam_games.json.gz</font>**

In [None]:
# Decompress the gzip file, parse it line by line and build the DataFrame
def steam_games(archivo_json_gz):
    """Load a gzip-compressed JSON Lines file into a flattened DataFrame.

    Args:
        archivo_json_gz: Path to a gzip file holding one UTF-8 encoded JSON
            document per line.

    Returns:
        pandas.DataFrame built by ``pd.json_normalize`` from the parsed
        records, one row per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    data = []
    with gzip.open(archivo_json_gz, 'rb') as f:
        for line in f:
            line_content = line.decode('utf-8')
            # Blank lines (for example a trailing newline at the end of the
            # file) are not records; passing them to json.loads would abort
            # the whole load.
            if not line_content.strip():
                continue
            data.append(json.loads(line_content))
    df_steam_games = pd.json_normalize(data)
    return df_steam_games

In [None]:
# Load the games dataset from the mounted Drive folder.
# NOTE(review): the absolute Drive path is hard-coded, so this cell only runs where that folder exists.
df_steam_games = steam_games("/content/drive/MyDrive/Datasets_Steam/steam_games.json.gz")

Exploratory Data Analysis (EDA) - Análisis Exploratorio de Datos

In [None]:
# Display the DataFrame as loaded
df_steam_games

In [None]:
# Non-null counts and dtype of every column
df_steam_games.info()

In [None]:
# Number of missing values (NaN) per column
df_steam_games.isna().sum()

In [None]:
# (rows, columns) of the raw frame
df_steam_games.shape

(120445, 13)

Extract, Transform and Load (ETL) - Extracción, Transformación y Carga.

In [None]:
# Work on a copy so the frame as loaded stays untouched
df_steam_games1 = df_steam_games.copy()

In [None]:
# Drop only the rows in which every column is null (how='all'); partially filled rows are kept.
df_steam_games1 = df_steam_games1.dropna(how='all')

In [None]:
# Display the DataFrame after removing the all-null rows
df_steam_games1

In [None]:
# (rows, columns) after removing the all-null rows
df_steam_games1.shape

(32135, 13)

In [None]:
# Renumber the rows 0..n-1 and discard the previous index labels
df_steam_games1 = df_steam_games1.reset_index(drop=True)

In [None]:
# Display the cleaned DataFrame with its renumbered index.
# (The cleaned frame is df_steam_games1; df_steam_games is the raw frame, which was never re-indexed.)
df_steam_games1

In [None]:
# Save the cleaned DataFrame as CSV without the index column.
# The path is relative, so the file is written to the current working directory.

df_steam_games1.to_csv("steam_games_ETL.csv", index=False)

- Teníamos originalmente: 120.445 filas y 13 columnas.
- Ahora tenemos: 32.135 filas y 13 columnas.

## **<font color="#ff5653">users_items.json.gz</font>**

In [None]:
# Decompress the gzip file, parse each line and build the DataFrame
def users_items(archivo_json_gz, encoding="iso-8859-1"):
    """Load a gzip-compressed file of Python-literal records into a DataFrame.

    Every non-blank line is parsed with ``ast.literal_eval``.

    Args:
        archivo_json_gz: Path to the gzip file, one record per line.
        encoding: Text encoding used to decode the file. Defaults to
            "iso-8859-1", the value that used to be hard-coded, so existing
            calls behave as before.
            NOTE(review): iso-8859-1 maps every byte to a character, so
            ``errors="ignore"`` never triggers with the default, and a file
            that is really UTF-8 would load with mis-decoded non-ASCII text.
            The reviews loader in this notebook reads its file as UTF-8;
            confirm which encoding this file uses.

    Returns:
        pandas.DataFrame with one row per parsed record.

    Raises:
        ValueError, SyntaxError: If a non-blank line is not a valid Python
            literal.
    """
    with gzip.open(archivo_json_gz, "rt", encoding=encoding, errors="ignore") as file:
        # Blank lines are skipped: literal_eval raises SyntaxError on them.
        data = [ast.literal_eval(line) for line in file if line.strip()]
    # The file is no longer needed once every line has been parsed.
    df_users_items = pd.DataFrame(data)
    return df_users_items

In [None]:
# Load the users/items dataset from the mounted Drive folder.
# NOTE(review): the absolute Drive path is hard-coded, so this cell only runs where that folder exists.
df_users_items = users_items("/content/drive/MyDrive/Datasets_Steam/users_items.json.gz")

Exploratory Data Analysis (EDA) - Análisis Exploratorio de Datos.

In [None]:
# Display the DataFrame as loaded
df_users_items

In [None]:
# Non-null counts and dtype of every column
df_users_items.info()

In [None]:
# Number of missing values (NaN) per column
df_users_items.isna().sum()

In [None]:
# (rows, columns) of the raw frame
df_users_items.shape

(88310, 5)

Extract, Transform and Load (ETL) - Extracción, Transformación y Carga.

In [None]:
# Work on a copy so the frame as loaded stays untouched
df_users_items1 = df_users_items.copy()

In [None]:
# Un-nest the "items" column.
# The frame is first cut into consecutive row chunks of `tamaño` rows each,
# so that every chunk can be un-nested on its own.

tamaño = 1000
fragmentos = [
    df_users_items1.iloc[inicio:inicio + tamaño]
    for inicio in range(0, len(df_users_items1), tamaño)
]

In [None]:
# Un-nest the "items" column

def desanidar_items(df):
    """Expand the list-valued "items" column into one row per item.

    Each element of an "items" list (expected to be a dict) becomes its own
    row, with the dict keys spread into columns next to the remaining columns
    of ``df``. The original index label is repeated for every item of a row.

    Args:
        df: DataFrame with an "items" column holding lists of dicts.

    Returns:
        A new DataFrame: the columns of ``df`` other than "items", followed by
        the columns obtained from the item dicts.
        NOTE(review): rows whose "items" list is empty survive as one row with
        a missing item; ``apply(pd.Series)`` appears to add a stray column for
        them — confirm against the saved output.
    """
    # DataFrame.explode repeats the other columns alongside each item, so the
    # two pieces concatenated below carry exactly the same (repeated) index.
    exploded = df.explode("items")
    item_columns = exploded["items"].apply(pd.Series)

    # With identical indexes concat simply places the columns side by side.
    # Aligning a unique index against a duplicated one instead can raise
    # InvalidIndexError, depending on the index types involved.
    return pd.concat([exploded.drop("items", axis=1), item_columns], axis=1)

In [None]:
# Un-nest every chunk
fragmentos_desanidados = [desanidar_items(fragmento) for fragmento in fragmentos]

In [None]:
# Stack the un-nested chunks back into a single DataFrame
df_items_desanidado = pd.concat(fragmentos_desanidados)

In [None]:
# Display the un-nested DataFrame
df_items_desanidado

In [None]:
# Renumber the rows 0..n-1 and discard the previous index labels
df_items_desanidado = df_items_desanidado.reset_index(drop=True)

In [None]:
# Save the un-nested DataFrame as CSV without the index column.
# The path is relative, so the file is written to the current working directory.

df_items_desanidado.to_csv("users_items_ETL.csv", index=False)

- Teníamos originalmente: 88.310 filas y 5 columnas.
- Ahora tenemos: 5.170.015 filas y 9 columnas.

## **<font color="#ff5653">user_reviews.json.gz</font>**

In [3]:
# Decompress and parse the gzip file
def user_reviews(archivo_json_gz):
    """Parse a gzip-compressed file of Python-literal records into a list.

    Args:
        archivo_json_gz: Path to a gzip file, UTF-8 encoded, holding one
            Python literal per line.

    Returns:
        list with the object parsed from each non-blank line, in file order.

    Raises:
        ValueError, SyntaxError: If a non-blank line is not a valid Python
            literal.
    """
    registros = []
    with gzip.open(archivo_json_gz, 'rt', encoding="utf-8") as file:
        for line in file:
            contenido = line.strip()
            # A blank line would become literal_eval(""), which raises
            # SyntaxError and aborts the whole load.
            if not contenido:
                continue
            registros.append(ast.literal_eval(contenido))
    return registros

In [4]:
# Parse the reviews file from the mounted Drive folder.
# NOTE(review): despite its name, `df` holds the plain list returned by user_reviews, not a DataFrame.
# NOTE(review): the absolute Drive path is hard-coded, so this cell only runs where that folder exists.
df= user_reviews("/content/drive/MyDrive/Datasets_Steam/user_reviews.json.gz")

In [5]:
# Build the DataFrame from the parsed records
df_user_reviews=pd.DataFrame(df)

Exploratory Data Analysis (EDA) - Análisis Exploratorio de Datos

In [6]:
# Display the DataFrame as loaded
df_user_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [7]:
# Non-null counts and dtype of every column
df_user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


In [8]:
# Number of missing values (NaN) per column
df_user_reviews.isna().sum()

user_id     0
user_url    0
reviews     0
dtype: int64

In [9]:
# (rows, columns) of the raw frame
df_user_reviews.shape

(25799, 3)

Extract, Transform and Load (ETL) - Extracción, Transformación y Carga.

In [10]:
# Work on a copy so the frame as loaded stays untouched
df_user_reviews1 = df_user_reviews.copy()

In [11]:
# Turn every element of the list in "reviews" into its own row; the user's index label is repeated for each of them.
df_user_reviews_explode = df_user_reviews1.explode("reviews")

In [12]:
# Display the exploded DataFrame (one row per review)
df_user_reviews_explode

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20..."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011...."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011..."
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014...."
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2..."
...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"{'funny': '', 'posted': 'Posted July 10.', 'la..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"{'funny': '', 'posted': 'Posted July 8.', 'las..."
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '1 person found this review funny', ..."
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '', 'posted': 'Posted July 20.', 'la..."


In [13]:
# Spread the keys of each review dict into columns.
# The result is numbered from 0 and no longer carries the repeated user index.
df_user_reviews_normalize = pd.json_normalize(df_user_reviews_explode['reviews'])

In [14]:
# Display the normalized review columns
df_user_reviews_normalize

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...
59328,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59329,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59330,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59331,,Posted July 20.,,730,No ratings yet,True,:D


In [15]:
# Give both frames the same fresh 0..n-1 index so they can be joined
# side by side, row for row.
df_user_reviews_explode = df_user_reviews_explode.reset_index(drop=True)
df_user_reviews_normalize = df_user_reviews_normalize.reset_index(drop=True)

In [16]:
# Join the user columns (without the nested "reviews") with the normalized review columns, side by side.
df_user_reviews_nuevo = pd.concat([df_user_reviews_explode.drop('reviews',axis=1),df_user_reviews_normalize], axis=1)

In [17]:
# Display the flattened reviews DataFrame
df_user_reviews_nuevo

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59330,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,Posted July 20.,,730,No ratings yet,True,:D


In [18]:
# Non-null counts and dtype of every column
df_user_reviews_nuevo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      59333 non-null  object
 1   user_url     59333 non-null  object
 2   funny        59305 non-null  object
 3   posted       59305 non-null  object
 4   last_edited  59305 non-null  object
 5   item_id      59305 non-null  object
 6   helpful      59305 non-null  object
 7   recommend    59305 non-null  object
 8   review       59305 non-null  object
dtypes: object(9)
memory usage: 4.1+ MB


In [19]:
# Number of missing values (NaN) per column
df_user_reviews_nuevo.isna().sum()

user_id         0
user_url        0
funny          28
posted         28
last_edited    28
item_id        28
helpful        28
recommend      28
review         28
dtype: int64

In [20]:
# (rows, columns) of the flattened frame
df_user_reviews_nuevo.shape

(59333, 9)

- Teníamos originalmente: 25.799 filas y 3 columnas.
- Ahora tenemos: 59.333 filas y 9 columnas.

In [21]:
# Replace every NaN with 1 instead of dropping the rows, so they are still present for the sentiment analysis.
# NOTE(review): fillna(1) is applied to ALL columns, so rows without review data also get 1 in
# item_id, posted, recommend, etc. — confirm this is intended; only the sentiment value needs a default.
df_review = df_user_reviews_nuevo.fillna(1)

In [22]:
# (rows, columns) after filling the nulls
df_review.shape

(59333, 9)

In [23]:
# Confirm that no missing values remain
df_review.isna().sum()

user_id        0
user_url       0
funny          0
posted         0
last_edited    0
item_id        0
helpful        0
recommend      0
review         0
dtype: int64

## **<font color="#ff5653">Sentiment_analysis</font>**

Análisis de sentimiento con NLP

En el dataset user_reviews se incluyen reseñas de juegos hechos por distintos usuarios.

Vamos a crear una columna "sentiment_analysis" aplicando análisis de sentimiento con NLP -Natural Language Processing- con la siguiente escala: debe tomar el **valor "0" si es malo**, **"1" si es neutral** y **"2" si es positivo**.

Esta nueva columna debe reemplazar la columna "review" para facilitar el trabajo de los modelos de machine learning y el análisis de datos. De no ser posible este análisis por estar ausente la reseña escrita, debe tomar el valor "1" (neutral).

In [24]:
# Instalamos la biblioteca: VADER (Valence Aware Dictionary and sEntiment Reasoner).
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m122.9/126.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [25]:
# Importamos la biblioteca
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [26]:
# Create the VADER analyzer instance
sentimiento = SentimentIntensityAnalyzer()

In [27]:
# Score every review with the analyzer's "compound" value.
# Entries that are not text (no written review) get 0.0.
puntuacion_sentimiento = [
    sentimiento.polarity_scores(texto)['compound'] if isinstance(texto, str) else 0.0
    for texto in df_review['review']
]

In [28]:
# Peek at the first ten scores
puntuacion_sentimiento[:10]

[0.8481, 0.2263, 0.9117, 0.9899, 0.9958, 0.7713, -0.3839, 0.3313, 0.4767, 0.0]

In [29]:
# Work on an independent copy: a plain assignment would only alias df_review,
# so the columns added below would silently show up on df_review as well.
df_review_x = df_review.copy()

In [30]:
# Add the scores to the DataFrame as the column "puntuacion_sentimiento"
df_review_x["puntuacion_sentimiento"] = puntuacion_sentimiento

In [31]:
# Show the first two rows with the new score column
df_review_x.head(2)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,puntuacion_sentimiento
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,0.8481
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,0.2263


Usamos la columna "puntuacion_sentimiento" para hacer la clasificación.

- sentimiento positivo (2): puntuación >= 0.05
- sentimiento neutro (1): puntuación entre -0.05 y 0.05 (sin incluir los extremos)
- sentimiento negativo (0): puntuación <= -0.05

In [32]:
# Map each score to a sentiment class:
#   2 = positive (score >= 0.05)
#   1 = neutral  (-0.05 < score < 0.05)
#   0 = anything else (score <= -0.05)
analisis_sentimiento = [
    2 if puntuacion >= 0.05 else (1 if -0.05 < puntuacion < 0.05 else 0)
    for puntuacion in df_review_x["puntuacion_sentimiento"]
]

In [33]:
# Add the sentiment classes to the DataFrame as the column "analisis_sentimiento"
df_review_x["analisis_sentimiento"] = analisis_sentimiento

In [34]:
# Show the first two rows with the new sentiment column
df_review_x.head(2)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,puntuacion_sentimiento,analisis_sentimiento
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,0.8481,2
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,0.2263,2


In [35]:
# Drop the columns that are no longer needed: the review text and the
# intermediate score.
columnas_eliminar = ["review", "puntuacion_sentimiento"]

df_user_reviews_limpio = df_review_x.drop(columnas_eliminar, axis=1)

In [36]:
# Show the first two rows of the final frame
df_user_reviews_limpio.head(2)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,analisis_sentimiento
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,2
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,2


In [37]:
# (rows, columns) of the final frame
df_user_reviews_limpio.shape

(59333, 9)

In [41]:
# Save the final reviews DataFrame as CSV without the index column.
# The path is relative, so the file is written to the current working directory.
df_user_reviews_limpio.to_csv("user_reviews_ETL.csv", index=False)
