In [1]:
#Imports 

import requests
from bs4 import BeautifulSoup
from spellchecker import SpellChecker
import pandas as pd
import json
import time
import nltk
import spacy
import re
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException


In [None]:
nltk.download('stopwords')

In [13]:
stopwords = nltk.corpus.stopwords.words('english')

# Cargar el modelo de Spacy en inglés
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Cargar los datos
data = pd.read_csv('data/games_reviews_df.csv')

# Definir las etiquetas permitidas (sustantivos, adjetivos, verbos, adverbios)
allowed_tags = ['NOUN', 'ADJ', 'VERB', 'ADV']

# Crear lista para almacenar las palabras que no se permiten según su parte de la oración
drop_out = []

# Corrector ortográfico para inglés
speller = SpellChecker(language='en')

In [14]:
display(data)

Unnamed: 0,AppID,ReviewID,UserID,Language,Review,Positive,Bought,Free
0,578080,171886441,76561198150195741,english,don' recommend,False,True,False
1,578080,137852244,76561198871670200,english,"The past couple of months, PUBG has been impro...",False,True,False
2,578080,171878328,76561198105543998,english,"Great game, highly recommend",True,True,False
3,578080,171877513,76561198011887521,english,Classic,True,True,False
4,578080,79647082,76561198390827775,english,돈 주고 뽑은 애스턴마틴만 타면 대가리 다터지는 병신같은게임,False,True,False
...,...,...,...,...,...,...,...,...
108119,1972440,155655678,76561198039515791,english,This game just isn't fun after the first hour....,False,False,False
108120,1972440,155643442,76561197995272591,english,I didn't think I would enjoy this little gem a...,True,False,False
108121,1972440,155512889,76561198147486516,english,help I'm being held at gun point to write a go,True,False,False
108122,1972440,155483819,76561197971306794,english,5/10 - You and your opponent get a perk after ...,False,False,False


In [21]:
data['Review'] = data['Review'].fillna('').astype('string')

In [22]:
def is_informative(sentence):
    # Elimina emojis y caracteres especiales
    sentence_clean = re.sub(r'[^\w\s]', '', sentence)  # Elimina todo excepto letras y espacios
    # Tokenizar las palabras
    words = nltk.word_tokenize(sentence_clean)
    # Filtrar las palabras que no son stopwords
    meaningful_words = [word for word in words if word.lower() not in stopwords]
    # Determinar si el comentario tiene una cantidad mínima de palabras significativas
    return len(meaningful_words) >= 3  # Puedes ajustar el umbral

# Aplicar el filtro de comentarios no informativos
data['is_informative'] = data['Review'].apply(is_informative)

# Filtrar solo los comentarios que son informativos
data_filtered = data[data['is_informative'] == True]

In [24]:
data_filtered.count()

AppID             84896
ReviewID          84896
UserID            84896
Language          84896
Review            84896
Positive          84896
Bought            84896
Free              84896
is_informative    84896
dtype: int64

In [26]:
# Función de preprocesamiento
def preprocess_function(sentence):
    sentence_tokenize = nltk.word_tokenize(sentence)  # Tokenizar la oración
    sentenze_lemma = [word.lemma_ if word.pos_ in allowed_tags else drop_out.append((word, word.pos_)) for word in nlp(" ".join(sentence_tokenize))]
    token_part_of_speech = list(filter(None, sentenze_lemma))  # Filtrar valores None
    token_part_of_speech_remove_stopwords = [word for word in token_part_of_speech if word not in stopwords]  # Eliminar stopwords
    tokenizer_str = " ".join(token_part_of_speech_remove_stopwords)  # Unir las palabras en una oración preprocesada
    return tokenizer_str

# Función de corrección ortográfica
def sentence_corrector(sentence):
    sentence_list = sentence.split(" ")  # Dividir la oración en palabras
    list_correct_words = [speller.correction(word) if speller.correction(word) is not None else word for word in sentence_list]  # Corregir las palabras
    list_correct_words_str = " ".join(list_correct_words)  # Unir las palabras corregidas en una oración
    return list_correct_words_str

# Aplicar el preprocesamiento y corrección ortográfica a los datos
data_filtered['review_preprocesada_aux'] = data_filtered['Review'].apply(preprocess_function)
data_filtered['review_preprocesada'] = data_filtered['review_preprocesada_aux'].apply(sentence_corrector)

# Guardar los datos preprocesados en un archivo csv
data_filtered.to_csv('data/games_reviews_preprocesado.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['review_preprocesada_aux'] = data_filtered['Review'].apply(preprocess_function)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['review_preprocesada'] = data_filtered['review_preprocesada_aux'].apply(sentence_corrector)


SEGUNDO FILTRO (IDIOMAS NO COINCIDENTES)

In [2]:
data_filtered = pd.read_csv('data/games_reviews_preprocesado.csv')

In [3]:
display(data_filtered)
# Es facil ver que el idioma no es coincidente asi que se hace una segunda detección de idioma

Unnamed: 0,AppID,ReviewID,UserID,Language,Review,Positive,Bought,Free,is_informative,review_preprocesada_aux,review_preprocesada
0,578080,137852244,76561198871670200,english,"The past couple of months, PUBG has been impro...",False,True,False,True,past couple month improve ban hacker problem s...,past couple month improve ban hacker problem s...
1,578080,171878328,76561198105543998,english,"Great game, highly recommend",True,True,False,True,great game highly recommend,great game highly recommend
2,578080,79647082,76561198390827775,english,돈 주고 뽑은 애스턴마틴만 타면 대가리 다터지는 병신같은게임,False,True,False,True,주고 대가리 병신같은게임,i 대가리 병신같은게임
3,578080,171873488,76561198856413389,english,lately every single day our squad looses a mem...,True,True,False,True,lately single day squad loose member loading p...,lately single day squad loose member loading p...
4,578080,171871227,76561198058890733,english,Chinese goyslop that limits how many games you...,False,True,False,True,chinese goyslop limit many game play day,chines toyshop limit many game play day
...,...,...,...,...,...,...,...,...,...,...,...
84891,1972440,155655678,76561198039515791,english,This game just isn't fun after the first hour....,False,False,False,True,game fun first hour mechanic inconsistent game...,game fun first hour mechanic inconsistent game...
84892,1972440,155643442,76561197995272591,english,I didn't think I would enjoy this little gem a...,True,False,False,True,think enjoy little gem much like least know pi...,think enjoy little gem much like least know pi...
84893,1972440,155512889,76561198147486516,english,help I'm being held at gun point to write a go,True,False,False,True,help ' hold gun point write go,help ' hold gun point write go
84894,1972440,155483819,76561197971306794,english,5/10 - You and your opponent get a perk after ...,False,False,False,True,opponent get perk board win fun game depend he...,opponent get perk board win fun game depend he...


In [4]:
# Asegurar resultados consistentes
DetectorFactory.seed = 0

# Función para detectar el idioma de cada comentario
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"  # Si no puede detectar el idioma, asignamos 'unknown'

# Aplicar la detección de idioma a la columna 'Review'
data_filtered['detected_language'] = data_filtered['Review'].apply(detect_language)

# Filtrar solo las reseñas detectadas como inglés ('en')
filtered_data = data_filtered[data_filtered['detected_language'] == 'en']

# Guardar el dataset filtrado
filtered_data.to_csv('data//games_reviews_filtered_english.csv', index=False)



Se han eliminado 0 comentarios no en inglés.


In [5]:
print(f"Se han eliminado {len(data_filtered) - len(filtered_data)} comentarios no en inglés.")

Se han eliminado 5851 comentarios no en inglés.
