In [101]:
import pandas as pd 
import json
from ast import literal_eval

# Path to the input file
json_file_path = 'australian_user_reviews.json'

# Read the file line by line, turning each line into a dictionary with
# literal_eval, and collect the dictionaries in a list
with open(json_file_path, 'r', encoding='utf-8') as handle:
    data_list = [literal_eval(raw_line) for raw_line in handle]

# Build a dataframe from the list of dictionaries
df = pd.DataFrame(data_list)

# Show the resulting dataframe
print(df)


                 user_id                                           user_url  \
0      76561197970982479  http://steamcommunity.com/profiles/76561197970...   
1                js41637               http://steamcommunity.com/id/js41637   
2              evcentric             http://steamcommunity.com/id/evcentric   
3                  doctr                 http://steamcommunity.com/id/doctr   
4              maplemage             http://steamcommunity.com/id/maplemage   
...                  ...                                                ...   
25794  76561198306599751  http://steamcommunity.com/profiles/76561198306...   
25795           Ghoustik              http://steamcommunity.com/id/Ghoustik   
25796  76561198310819422  http://steamcommunity.com/profiles/76561198310...   
25797  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
25798        LydiaMorley           http://steamcommunity.com/id/LydiaMorley   

                                                 re

In [102]:
# List the column labels of df.
df.columns

Index(['user_id', 'user_url', 'reviews'], dtype='object')

In [103]:
# Flatten the per-user 'reviews' lists into a single list of review dictionaries.
# The frame is then built once from that list; calling pd.concat inside a loop
# re-copies every accumulated row on each iteration, which is quadratic.
review_records = [
    review_dict
    for user_reviews in df['reviews']
    for review_dict in user_reviews
]

# One row per review; the columns are the keys found in the review dictionaries
df_reviews_columns = pd.DataFrame(review_records)

# Show the resulting dataframe
print(df_reviews_columns)

                                  funny                     posted  \
0                                         Posted November 5, 2011.   
1                                            Posted July 15, 2011.   
2                                           Posted April 21, 2011.   
3                                            Posted June 24, 2014.   
4                                        Posted September 8, 2013.   
...                                 ...                        ...   
59300                                              Posted July 10.   
59301                                               Posted July 8.   
59302  1 person found this review funny             Posted July 3.   
59303                                              Posted July 20.   
59304                                               Posted July 2.   

      last_edited item_id                                          helpful  \
0                    1250                                   No ratings yet   
1  

In [104]:
# List the column labels of the flattened reviews DataFrame.
df_reviews_columns.columns

Index(['funny', 'posted', 'last_edited', 'item_id', 'helpful', 'recommend',
       'review'],
      dtype='object')

In [None]:
# Save the DataFrame to a CSV file (index=False leaves the row index out)
df_reviews_columns.to_csv('reviews.csv', index=False)

In [105]:
from textblob import TextBlob

def get_sentiment_with_recommend(recommend, review):
    """Score a review as 0, 1 or 2 from its text and its recommend flag.

    Args:
        recommend: Truthy when the reviewer recommends the item.
        review: The review text; anything that is not a str counts as
            "no review".

    Returns:
        1 when ``review`` is not a string (no review, assumed neutral),
        0 when ``recommend`` is falsy (not recommended),
        2 when recommended and the TextBlob polarity is > 0,
        1 when recommended and the polarity is <= 0.
    """
    # No review text at all: assumed neutral
    if not isinstance(review, str):
        return 1

    # Not recommended: the text is not analysed
    if not recommend:
        return 0

    # Recommended: the TextBlob polarity decides between positive and neutral
    polarity = TextBlob(str(review)).sentiment.polarity
    return 2 if polarity > 0 else 1
# Score every row. The 'review' values are passed through unchanged: casting
# the column to str first would turn missing values into the text 'nan', so
# they could never reach the "not a string -> neutral" branch of
# get_sentiment_with_recommend.
sentiment_scores = df_reviews_columns.apply(
    lambda row: get_sentiment_with_recommend(row['recommend'], row['review']),
    axis=1,
)

# Replace the review text with its score. assign() appends the new 'review'
# column at the end, the same position the original drop + rename produced.
# NOTE: this overwrites the review text, so re-running the cell would score
# the scores; rebuild df_reviews_columns before running it again.
df_reviews_columns = (
    df_reviews_columns
    .drop(columns='review')
    .assign(review=sentiment_scores)
)

print(df_reviews_columns)


                                  funny                     posted  \
0                                         Posted November 5, 2011.   
1                                            Posted July 15, 2011.   
2                                           Posted April 21, 2011.   
3                                            Posted June 24, 2014.   
4                                        Posted September 8, 2013.   
...                                 ...                        ...   
59300                                              Posted July 10.   
59301                                               Posted July 8.   
59302  1 person found this review funny             Posted July 3.   
59303                                              Posted July 20.   
59304                                               Posted July 2.   

      last_edited item_id                                          helpful  \
0                    1250                                   No ratings yet   
1  

In [94]:
# Write the DataFrame to 'reviews_sentiment_analysis.csv' without the row index.
df_reviews_columns.to_csv('reviews_sentiment_analysis.csv', index=False)

In [111]:
import pandas as pd

# Load the dataset from 'reviews_sentiment_analysis.csv'
df_reviews_columns = pd.read_csv('reviews_sentiment_analysis.csv')

# Define una función para extraer y convertir la fecha
def extract_date(posted_str, default_year=2016):
    """Convert a 'Posted <Month> <day>[, <year>].' string into a date.

    Args:
        posted_str: Text such as 'Posted November 5, 2011.' or
            'Posted July 10.'. Non-string values (e.g. NaN) yield None.
        default_year: Year used when the text carries no year. Defaults to
            2016, the value this notebook previously hard-coded.

    Returns:
        A ``datetime.date``, or None when the text cannot be parsed.
    """
    # Imported here because this cell never imports datetime itself
    from datetime import datetime

    # Missing values arrive as float NaN; there is nothing to parse
    if not isinstance(posted_str, str):
        return None

    # Drop the word 'Posted', the periods and the surrounding whitespace
    date_str = posted_str.replace('Posted', '').replace('.', '').strip()

    # Try the text as-is first, then with the default year appended.
    # Appending the year before parsing (rather than parsing '%B %d' and
    # patching the year afterwards) keeps 'February 29' valid: strptime
    # would otherwise check it against 1900, which is not a leap year.
    candidates = (date_str, '{}, {}'.format(date_str, default_year))
    for candidate in candidates:
        try:
            return datetime.strptime(candidate, '%B %d, %Y').date()
        except ValueError:
            continue

    # Neither form could be parsed
    return None

# Parse every 'posted' value into a date
parsed_dates = df_reviews_columns['posted'].apply(extract_date)

# Swap the text column for the parsed one: add it as 'date_posted', drop the
# original 'posted', then give the new column the 'posted' name
df_reviews_columns = (
    df_reviews_columns
    .assign(date_posted=parsed_dates)
    .drop(columns='posted')
    .rename(columns={'date_posted': 'posted'})
)

print(df_reviews_columns)



                                  funny last_edited  item_id  \
0                                   NaN         NaN     1250   
1                                   NaN         NaN    22200   
2                                   NaN         NaN    43110   
3                                   NaN         NaN   251610   
4                                   NaN         NaN   227300   
...                                 ...         ...      ...   
59300                               NaN         NaN       70   
59301                               NaN         NaN   362890   
59302  1 person found this review funny         NaN   273110   
59303                               NaN         NaN      730   
59304                               NaN         NaN      440   

                                               helpful  recommend  review  \
0                                       No ratings yet       True       2   
1                                       No ratings yet       True       2   


In [112]:
# Display the 'posted' column.
df_reviews_columns['posted']

0        2011-11-05
1        2011-07-15
2        2011-04-21
3        2014-06-24
4        2013-09-08
            ...    
59300    2016-07-10
59301    2016-07-08
59302    2016-07-03
59303    2016-07-20
59304    2016-07-02
Name: posted, Length: 59305, dtype: object

In [116]:
# Mask of the cells that hold a value (True) versus missing ones (False)
has_value = df_reviews_columns.notna()

# Keep the cells that hold a value and put None where the mask is False
df_reviews_columns = df_reviews_columns.where(has_value, None)

# Show the updated DataFrame
print(df_reviews_columns)



                                  funny last_edited  item_id  \
0                                  None        None     1250   
1                                  None        None    22200   
2                                  None        None    43110   
3                                  None        None   251610   
4                                  None        None   227300   
...                                 ...         ...      ...   
59300                              None        None       70   
59301                              None        None   362890   
59302  1 person found this review funny        None   273110   
59303                              None        None      730   
59304                              None        None      440   

                                               helpful  recommend  review  \
0                                       No ratings yet       True       2   
1                                       No ratings yet       True       2   


In [117]:
# Write the DataFrame to 'reviews_sentiment_analysis.csv' without the row index.
df_reviews_columns.to_csv('reviews_sentiment_analysis.csv', index=False)