In [1]:
import pandas as pd
import ast
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

In [2]:

# Path to the raw reviews file (named .json, but parsed below as Python literals)
ruta = r'C:\Users\Moises\Desktop\Projecto invidual 1\archivos crudos\australian_user_reviews.json'

# Accumulates one parsed record per successfully parsed line
data = []

# Read the file line by line; each line is parsed on its own
with open(ruta, 'r', encoding='utf-8') as file:
    for linea in file:
        try:
            # ast.literal_eval only evaluates Python literals, never arbitrary code
            json_datos = ast.literal_eval(linea)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError (not ValueError) for text that is not
            # valid Python syntax, e.g. a blank or truncated line. Catch both so a
            # single malformed line is reported and skipped instead of aborting
            # the whole load.
            print(f"Error en la línea: {linea}")
        else:
            # Only lines that parsed cleanly are kept
            data.append(json_datos)

In [3]:
# Build the df_reviews DataFrame from the parsed records collected in `data`

df_reviews = pd.DataFrame(data)

In [4]:
# Preview the first rows of df_reviews
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


In [5]:
# Explode 'reviews' so that each element of a user's review list gets its own row
# (the original row index is repeated for every exploded row).
df_review_full=df_reviews.explode('reviews')

In [6]:
# Preview the first rows of the exploded frame
df_review_full.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20..."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011...."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011..."
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014...."
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2..."


In [7]:
# Expand every entry of 'reviews' into individual columns with pd.Series and
# place those columns next to the remaining ones, replacing the original
# 'reviews' column.
df_review_full = pd.concat(
    objs=[
        df_review_full.drop(columns=['reviews']),
        df_review_full['reviews'].apply(pd.Series),
    ],
    axis='columns',
)

In [8]:
# Rename the column whose label is the integer 0 (a label, not a position) to
# 'sentiment_analysis'.
# NOTE(review): a column labelled 0 presumably comes from expanding rows that had
# no review dict — confirm.
df_review_full.rename(columns= {0: 'sentiment_analysis'}, inplace= True)

In [9]:
# Discard the listed columns from df_review_full
df_review_full = df_review_full.drop(
    columns=['funny', 'posted', 'last_edited', 'helpful', 'user_url']
)

In [10]:
# Set 'sentiment_analysis' to 0 for every row of df_review_full
df_review_full['sentiment_analysis'] = 0

In [11]:
# Preview the frame after the column clean-up
df_review_full.head()

Unnamed: 0,user_id,item_id,recommend,review,sentiment_analysis
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,0
0,76561197970982479,22200,True,It's unique and worth a playthrough.,0
0,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...,0
1,js41637,251610,True,I know what you think when you see this title ...,0
1,js41637,227300,True,For a simple (it's actually not all that simpl...,0


In [15]:
# Drop the rows of df_review_full whose 'item_id' is null.
# `subset` is passed as a list of labels: a bare string label is only accepted by
# newer pandas releases, whereas a list works across versions.
df_review_full = df_review_full.dropna(subset=['item_id'])

In [16]:
def sentiment_score(review: str, threshold: float = 0.2) -> int:
    """
    Classify the sentiment of a review as negative, neutral or positive.

    Parameters:
    review (str): The review text. Missing values (None, NaN) and any other
        non-string input are treated like an empty review.
    threshold (float): Polarity magnitude separating neutral from
        negative/positive. Defaults to 0.2.

    Returns:
    int: 0 when the polarity is below -threshold (negative), 2 when it is above
        threshold (positive), 1 otherwise (neutral; also returned for empty,
        blank or missing text).
    """
    # A non-string value such as NaN is truthy, so a plain `not review` check
    # would let it through to TextBlob; guard on the type explicitly. Blank or
    # whitespace-only text is scored as neutral as well.
    if not isinstance(review, str) or not review.strip():
        return 1

    # Read the polarity once and compare it against both thresholds
    polarity = TextBlob(review).sentiment.polarity
    if polarity < -threshold:
        return 0  # negative sentiment
    if polarity > threshold:
        return 2  # positive sentiment
    return 1  # neutral sentiment

In [17]:
# Apply sentiment_score to every value of 'review' and store the results in
# 'sentiment_analysis', overwriting the values previously held by that column
df_review_full['sentiment_analysis']=df_review_full['review'].apply(sentiment_score)

In [18]:
# Preview the frame with the computed sentiment scores
df_review_full.head()

Unnamed: 0,user_id,item_id,recommend,review,sentiment_analysis
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,1
0,76561197970982479,22200,True,It's unique and worth a playthrough.,2
0,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...,1
1,js41637,251610,True,I know what you think when you see this title ...,1
1,js41637,227300,True,For a simple (it's actually not all that simpl...,1


In [19]:
# Remove the 'review' column from df_review_full
df_review_full = df_review_full.drop(columns=['review'])

In [20]:
# Write df_review_full to disk in Parquet format
df_review_full.to_parquet(r'C:\Users\Moises\Desktop\Projecto invidual 1\archivos limpios\reviews.parquet')