# Análisis de sentimientos

Importaciones

In [1]:
import pandas as pd
import re
import nltk as nlt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nlt.download("vader_lexicon")
import pyarrow as pa
import pyarrow.parquet as pq

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Gerard2\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


En el dataset user_reviews se incluyen reseñas de juegos hechas por distintos usuarios. <br>
Debes crear la columna 'sentiment_analysis' aplicando análisis de sentimiento con NLP, con la siguiente escala de valores: <br>
'0' si es malo <br>
'1' si es neutral <br>
'2' si es positivo <br>
Esta nueva columna debe reemplazar la de user_reviews.review para facilitar el trabajo de los modelos de machine learning y el análisis de datos. De no ser posible este análisis por estar ausente la reseña escrita, debe tomar el valor de 1.

Carga de datos

In [2]:
# Read the reviews CSV from the project's Data directory into `df`.
df = pd.read_csv(filepath_or_buffer='../Data/df_reviews.csv')

Visualización de datos

In [3]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,user_id,item_id,recommend,posted_year,sentiment_score
0,76561197970982479,1250,True,2011,2
1,76561197970982479,22200,True,2011,2
2,76561197970982479,43110,True,2011,2
3,js41637,251610,True,2014,2
4,js41637,227300,True,2013,2


Tratamiento para el modelo

In [None]:
# Normalise the review text before it is scored.
# NOTE(review): the df.head() output shown earlier in this notebook lists no 'review'
# column -- confirm that the CSV loaded into `df` is the raw reviews file, otherwise
# this cell raises KeyError on a fresh kernel.
# Missing reviews become '' rather than the literal text 'nan' that astype('str')
# alone would produce, so they can later be recognised as absent.
df['review'] = df['review'].fillna('').astype('str')
# Lower-case the text and turn every run of non-letter characters into a single
# space. Deleting those characters outright would also delete the spaces and glue
# every word of a review into one long token.
df['review'] = df['review'].apply(lambda x: re.sub(r'[^a-z]+', ' ', x.lower()).strip())

In [None]:
# Initialise the model: a single VADER SentimentIntensityAnalyzer instance bound to `sia`.
sia = SentimentIntensityAnalyzer()

# Map a review text onto the 0 / 1 / 2 sentiment scale
def get_sentiment_score(text, analyzer=None):
    """Classify a review as negative (0), neutral (1) or positive (2).

    Parameters
    ----------
    text : object
        The review. Any value that is not a non-blank string (NaN, None, '',
        whitespace only, numbers, ...) is treated as a missing review.
    analyzer : object, optional
        Object exposing ``polarity_scores(text)`` that returns a mapping with a
        ``'compound'`` entry. When omitted, the notebook-level ``sia`` is used.

    Returns
    -------
    int
        2 when the compound score is >= 0.05, 0 when it is <= -0.05 and
        1 otherwise (including missing reviews).
    """
    # Missing or non-text reviews are neutral by definition of the task.
    if not isinstance(text, str) or not text.strip():
        return 1

    # Resolve the default lazily so the global is only needed when no analyzer is passed.
    if analyzer is None:
        analyzer = sia

    compound_score = analyzer.polarity_scores(text)['compound']

    # The positive cut-off is +0.05. Comparing against -0.05 here would label
    # every neutral review as positive and make the neutral branch unreachable.
    if compound_score >= 0.05:
        return 2  # positive
    if compound_score <= -0.05:
        return 0  # negative
    return 1  # neutral
    


# Three steps in one chain:
#   1. cast 'review' to str,
#   2. score every review with get_sentiment_score into 'sentiment_score',
#   3. drop the raw 'review' text, which is no longer needed.
df = (
    df.assign(review=df['review'].astype(str))
      .assign(sentiment_score=lambda frame: frame['review'].apply(get_sentiment_score))
      .drop(columns=['review'])
)

Cuento los nulos de cada columna para ver cómo tratar los datos

In [None]:
# Number of missing values in each column.
df.isna().sum()

user_id             0
item_id            28
recommend          28
posted_year         0
sentiment_score     0
dtype: int64

Elimino las filas con valores nulos

In [None]:
# Discard every row that has at least one missing value, then renumber the index from 0.
df = df.dropna(axis='index', how='any').reset_index(drop=True)

Vista final

In [None]:
# Show the resulting DataFrame as the cell output.
df

Unnamed: 0,user_id,item_id,recommend,posted_year,sentiment_score
0,76561197970982479,1250.0,True,2011,2
1,76561197970982479,22200.0,True,2011,2
2,76561197970982479,43110.0,True,2011,2
3,js41637,251610.0,True,2014,2
4,js41637,227300.0,True,2013,2
...,...,...,...,...,...
59300,76561198312638244,70.0,True,2014,2
59301,76561198312638244,362890.0,True,2014,2
59302,LydiaMorley,273110.0,True,2014,2
59303,LydiaMorley,730.0,True,2014,2


# Tratamiento de datos para creación de endpoints

Carga de datos

In [5]:
# Load the items, reviews and games tables (in that order) from the Data directory.
df_items, df_reviews, df_games = map(
    pd.read_csv,
    ('../Data/df_items.csv', '../Data/df_reviews.csv', '../Data/df_games.csv'),
)

## Df Items

Información de datos

In [None]:
# Print the column names, dtypes and memory usage of the items table.
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3285246 entries, 0 to 3285245
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   item_id           int64  
 2   item_name         object 
 3   playtime_forever  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 100.3+ MB


Cambio de tipo de datos

In [None]:
# Cast all four columns in one call: ids and names as text,
# item_id and playtime_forever as pandas' nullable integer type.
df_items = df_items.astype({
    'user_id': 'str',
    'item_id': 'Int64',
    'item_name': 'str',
    'playtime_forever': 'Int64',
})

Modifico el texto de la columna 'item_name' para que solo la primera letra del nombre quede en mayúscula y el resto en minúscula

In [None]:
# Upper-case the first character of each item name and lower-case the rest
# (e.g. 'Saints row iv', as the sample below shows).
df_items = df_items.assign(item_name=df_items['item_name'].str.capitalize())

In [None]:
# Inspect ten randomly chosen rows of the items table.
df_items.sample(n=10)

Unnamed: 0,user_id,item_id,item_name,playtime_forever
156695,xexilex,206420,Saints row iv,1345
2423733,76561198059923613,218230,Planetside 2,2872
2914088,76561198061343940,219640,Chivalry: medieval warfare,140
3131203,76561198081586883,247830,Aerena,72
2868302,76561198057047223,340,Half-life 2: lost coast,10
1769751,Jaydosity,261030,The walking dead: season two,197
1786200,JohnTheDonut,254320,Duskers,608
2038159,Peaceproducts131,227680,Starforge,105
402032,Hybrid2000,302650,Cyto,182
1798764,76561198066088271,239030,"Papers, please",313


## Df Reviews

Información de datos

In [None]:
# Print the column names, non-null counts and dtypes of the reviews table.
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   user_id          59305 non-null  object 
 1   item_id          59305 non-null  float64
 2   recommend        59305 non-null  object 
 3   posted_year      59305 non-null  int64  
 4   sentiment_score  59305 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 2.3+ MB


Cambio de tipo de datos

In [None]:
# Cast the reviews columns in one call: user_id and recommend as text,
# item_id as pandas' nullable integer type.
df_reviews = df_reviews.astype({
    'user_id': 'str',
    'item_id': 'Int64',
    'recommend': str,
})

In [None]:
# Preview the first ten reviews after the dtype changes.
# The filter expression that used to precede this line was removed: its result was
# discarded (only a cell's last expression is displayed) and it compared 'recommend'
# against '1', while the column holds the strings 'True' / 'False'.
df_reviews.head(10)

Unnamed: 0,user_id,item_id,recommend,posted_year,sentiment_score
0,76561197970982479,1250,True,2011,2
1,76561197970982479,22200,True,2011,2
2,76561197970982479,43110,True,2011,2
3,js41637,251610,True,2014,2
4,js41637,227300,True,2013,2
5,js41637,239030,True,2013,2
6,evcentric,248820,True,2014,2
7,evcentric,370360,True,2015,2
8,evcentric,237930,True,2014,2
9,evcentric,263360,True,2014,2


## Df Games

Información de datos

In [None]:
# Print the column names, non-null counts and dtypes of the games table.
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22530 entries, 0 to 22529
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   title                      22530 non-null  object
 1   id                         22530 non-null  int64 
 2   developer                  22530 non-null  object
 3   Action                     22530 non-null  int64 
 4   Adventure                  22530 non-null  int64 
 5   Animation &amp; Modeling   22530 non-null  int64 
 6   Audio Production           22530 non-null  int64 
 7   Casual                     22530 non-null  int64 
 8   Design &amp; Illustration  22530 non-null  int64 
 9   Early Access               22530 non-null  int64 
 10  Education                  22530 non-null  int64 
 11  Free to Play               22530 non-null  int64 
 12  Indie                      22530 non-null  int64 
 13  Massively Multiplayer      22530 non-null  int64 
 14  Photo 

Cambio de tipo de datos

In [None]:
# Store the two free-text columns of the games table as str.
df_games = df_games.astype({'title': 'str', 'developer': 'str'})

In [None]:
# Show the games DataFrame as the cell output.
df_games

Unnamed: 0,title,id,developer,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,Early Access,...,RPG,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,year
0,Lost Summoner Kitty,761140,Kotoshiro,1,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,2018
1,Ironbound,643980,Secret Level SRL,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,2018
2,Real Pool 3D - Poolians,670290,Poolians.com,0,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,2017
3,弹炸人2222,767400,彼岸领域,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2017
4,Battle Royale Trainer,772540,Trickjump Games Ltd,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22525,Kebab it Up!,745400,Bidoniera Games,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2018
22526,Colony On Mars,773640,"Nikita ""Ghost_RUS""",0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,2018
22527,LOGistICAL: South Africa,733530,Sacada,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,2018
22528,Russian Roads,610660,Laush Dmitriy Sergeevich,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,2018


# Merge de datos

Renombro columnas para el merge


In [8]:
# Align the games table's key names with the other tables:
# 'title' -> 'item_name' and 'id' -> 'item_id'.
df_games = df_games.rename({'title': 'item_name', 'id': 'item_id'}, axis='columns')

Creación de dataframes

In [9]:
# Games joined with their reviews: inner join on the shared item_id key.
merg1 = df_games.merge(df_reviews, how='inner', on='item_id')

# Games joined with the per-user item records, matched by item name
# (pandas' default join type, i.e. inner).
# NOTE(review): df_items.item_name was run through str.capitalize() earlier in this
# notebook, while the df_games titles displayed above keep their original casing
# (e.g. 'Lost Summoner Kitty') -- verify this name-based join does not drop matches.
merg2 = df_games.merge(df_items, on='item_name')

## Guardar los datos en archivos csv

In [10]:
# Frames to persist and the file stem each one is saved under (same order in both lists).
dfs = [merg1, merg2]

names = ['df_games_reviews', 'df_games_items']

# Write each merged frame to ../Data/<name>.csv as UTF-8 without the index column.
# The loop variable is named `frame` rather than `df` so that running this cell
# does not overwrite the notebook-level `df` defined in the sentiment section.
for frame, name in zip(dfs, names):
    archivo = f'../Data/{name}.csv'
    frame.to_csv(archivo, index=False, encoding='utf-8')
    print(f"DataFrame '{name}' guardado como '{archivo}'")

DataFrame 'df_games_reviews' guardado como '../Data/df_games_reviews.csv'
DataFrame 'df_games_items' guardado como '../Data/df_games_items.csv'
