In [2]:
import pandas as pd
import ast
from textblob import TextBlob

In [3]:
# Leer datos de JSON e importar en dataframe
rows = []

with open ('australian_user_reviews.json', encoding='MacRoman') as f:
    for line in f.readlines():
        rows.append(ast.literal_eval(line))

df_reviews = pd.DataFrame(rows)

# df.to_csv('revision.csv', index=False)

In [4]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


In [5]:
df_reviews.head(1)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."


Comprobando si hay valores nulos en el DataFrame

In [6]:
# Comprobando si hay valores nulos en el DataFrame
df_reviews.isnull().sum()

user_id     0
user_url    0
reviews     0
dtype: int64

Mostramos todo el contenido dentro de la columna 'reviews'

In [7]:
df_user_expanded_list = []

# Luego, procesamos cada fila del DataFrame original por separado
for idx, row in df_reviews.iterrows():
    # Expandimos la lista de 'items' en una fila en un DataFrame separado
    temp = pd.json_normalize(row['reviews'])
    # Agregamos 'user_id' al DataFrame temporal
    temp['user_id'] = row['user_id']
    # Agregamos el DataFrame temporal a nuestra lista de resultados
    df_user_expanded_list.append(temp)

# Finalmente, concatenamos todos los DataFrames temporales en uno solo
df_user_expanded = pd.concat(df_user_expanded_list, ignore_index=True)


In [8]:
print(df_user_expanded)

                                  funny                     posted  \
0                                         Posted November 5, 2011.   
1                                            Posted July 15, 2011.   
2                                           Posted April 21, 2011.   
3                                            Posted June 24, 2014.   
4                                        Posted September 8, 2013.   
...                                 ...                        ...   
59300                                              Posted July 10.   
59301                                               Posted July 8.   
59302  1 person found this review funny             Posted July 3.   
59303                                              Posted July 20.   
59304                                               Posted July 2.   

      last_edited item_id                                          helpful  \
0                    1250                                   No ratings yet   
1  

Comenzamos con la clumna posted

In [9]:
# Muestra el contenido de la columna "posted" en el DataFrame
print(df_user_expanded['posted'])


0         Posted November 5, 2011.
1            Posted July 15, 2011.
2           Posted April 21, 2011.
3            Posted June 24, 2014.
4        Posted September 8, 2013.
                   ...            
59300              Posted July 10.
59301               Posted July 8.
59302               Posted July 3.
59303              Posted July 20.
59304               Posted July 2.
Name: posted, Length: 59305, dtype: object


Cambiar el tipo de datos de la columna posted para datetime

In [14]:
from datetime import datetime

def convert_date(date_str):
    try:
        date = datetime.strptime(date_str, "Posted %B %d, %Y.")
    except ValueError:
        try:
            # Intenta convertir sin el año (usando un año ficticio, como 2000)
            date = datetime.strptime(date_str, "Posted %B %d.")
            date = date.replace(year=2000)
        except ValueError:
            # Si no se puede convertir, asigna una fecha nula
            date = None
    return date




In [15]:
df_user_expanded['posted'] = df_user_expanded['posted'].apply(convert_date)


Miosrtramos las columnas que necesitamos quwe son 'item_id', 'recommend', 'review'

In [16]:
# Seleccionar las columnas de interés
selected_columns = ['item_id', 'recommend', 'review', 'user_id', 'posted']	

# Crear un nuevo DataFrame que contenga solo las columnas seleccionadas
df_selected_columns = df_user_expanded[selected_columns]

# Imprimir el nuevo DataFrame con las columnas seleccionadas
print(df_selected_columns)

      item_id recommend                                             review  \
0        1250      True  Simple yet with great replayability. In my opi...   
1       22200      True               It's unique and worth a playthrough.   
2       43110      True  Great atmosphere. The gunplay can be a bit chu...   
3      251610      True  I know what you think when you see this title ...   
4      227300      True  For a simple (it's actually not all that simpl...   
...       ...       ...                                                ...   
59300      70      True  a must have classic from steam definitely wort...   
59301  362890      True  this game is a perfect remake of the original ...   
59302  273110      True  had so much fun plaing this and collecting res...   
59303     730      True                                                 :D   
59304     440      True                                     so much fun :D   

                 user_id     posted  
0      76561197970982479 

Buscar y eliminar filas duplicadas


In [17]:
# Eliminar filas duplicadas considerando todas las columnas
df_selected_columns.drop_duplicates()

# Imprimir la información del DataFrame resultant
df_selected_columns.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   item_id    59305 non-null  object        
 1   recommend  59305 non-null  object        
 2   review     59305 non-null  object        
 3   user_id    59305 non-null  object        
 4   posted     59280 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 2.3+ MB


Revisamos la columna item_id

In [18]:
# Verificar si los valores en la columna "item_id" son de tipo string
is_string_column = df_selected_columns['item_id'].apply(lambda x: isinstance(x, str)).all()

# Imprimir el resultado
if is_string_column:
    print("La columna 'item_id' contiene valores de tipo string.")
else:
    print("La columna 'item_id' no contiene exclusivamente valores de tipo string.")



La columna 'item_id' contiene valores de tipo string.


Esto reemplazará todos los "True" por 1 y todos los "False" por 0 en la columna "recommend" del DataFrame 

In [19]:
df_selected_columns['recommend'] = df_selected_columns['recommend'].replace({True: 1, False: 0})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_columns['recommend'] = df_selected_columns['recommend'].replace({True: 1, False: 0})


Cambiamos el tipo de datos de la columna recomend a int

In [20]:
df_selected_columns['recommend'] = df_selected_columns['recommend'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_columns['recommend'] = df_selected_columns['recommend'].astype(int)


In [21]:
# Seleccionar las columnas de interés
selected_columns = ['item_id', 'recommend', 'review', 'user_id', 'posted']

# Crear el nuevo DataFrame 'df_reviews' con las columnas seleccionadas
df_selected_columns[selected_columns].copy()

# Imprimir el nuevo DataFrame 'df_reviews'
print(df_selected_columns)

      item_id  recommend                                             review  \
0        1250          1  Simple yet with great replayability. In my opi...   
1       22200          1               It's unique and worth a playthrough.   
2       43110          1  Great atmosphere. The gunplay can be a bit chu...   
3      251610          1  I know what you think when you see this title ...   
4      227300          1  For a simple (it's actually not all that simpl...   
...       ...        ...                                                ...   
59300      70          1  a must have classic from steam definitely wort...   
59301  362890          1  this game is a perfect remake of the original ...   
59302  273110          1  had so much fun plaing this and collecting res...   
59303     730          1                                                 :D   
59304     440          1                                     so much fun :D   

                 user_id     posted  
0      765611

In [22]:
df_selected_columns

Unnamed: 0,item_id,recommend,review,user_id,posted
0,1250,1,Simple yet with great replayability. In my opi...,76561197970982479,2011-11-05
1,22200,1,It's unique and worth a playthrough.,76561197970982479,2011-07-15
2,43110,1,Great atmosphere. The gunplay can be a bit chu...,76561197970982479,2011-04-21
3,251610,1,I know what you think when you see this title ...,js41637,2014-06-24
4,227300,1,For a simple (it's actually not all that simpl...,js41637,2013-09-08
...,...,...,...,...,...
59300,70,1,a must have classic from steam definitely wort...,76561198312638244,2000-07-10
59301,362890,1,this game is a perfect remake of the original ...,76561198312638244,2000-07-08
59302,273110,1,had so much fun plaing this and collecting res...,LydiaMorley,2000-07-03
59303,730,1,:D,LydiaMorley,2000-07-20


Guardar df_reviews en un archivo .pkl

In [23]:
df_selected_columns.to_pickle('reviews.pkl')

# Esto guardará el DataFrame df_games en un archivo llamado "df_games.pkl" en el directorio actual.


In [24]:
df_selected_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   item_id    59305 non-null  object        
 1   recommend  59305 non-null  int32         
 2   review     59305 non-null  object        
 3   user_id    59305 non-null  object        
 4   posted     59280 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int32(1), object(3)
memory usage: 2.0+ MB


In [10]:
# Función de analisis de sentimiento:

def detect_sentiment(text):
   analysis = TextBlob(text)
   if analysis.sentiment.polarity < 0:
      return 0 #negative
   elif analysis.sentiment.polarity == 0:
      return 1 #neutral
   else:
      return 2 #positive

Funcion analisis de sentimiento

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

nltk.download('vader_lexicon')

# Inicializa el analizador de sentimiento
sia = SentimentIntensityAnalyzer()

# Función para asignar etiquetas de sentimiento a valores numéricos
def get_sentiment_label(sentiment_score):
    if sentiment_score >= 0.05:
        return 2  # Positivo
    elif sentiment_score <= -0.05:
        return 0  # Malo
    else:
        return 1  # Neutral

# Función para analizar sentimiento y asignar etiquetas numéricas
def analyze_sentiment(text):
    sentiment_score = sia.polarity_scores(text)['compound']
    sentiment_label = get_sentiment_label(sentiment_score)
    return sentiment_label

# Aplica el análisis de sentimiento a la columna de reseñas y crea una nueva columna
df_user_expanded['sentiment_analysis'] = df_user_expanded['review'].apply(analyze_sentiment)