# Carga de Datos

In [2]:
import pandas as pd

import os
import numpy as np
import pandas as pd
from sklearn import tree
import sklearn as sklearn

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # for 3D plots
import seaborn as sns; sns.set()

df = pd.read_csv("video_game_reviews.csv")
print(df.head())

           Game Title  User Rating Age Group Targeted  Price     Platform  \
0  Grand Theft Auto V         36.4           All Ages  41.41           PC   
1          The Sims 4         38.3             Adults  57.56           PC   
2           Minecraft         26.8              Teens  44.93           PC   
3   Bioshock Infinite         38.4           All Ages  48.29       Mobile   
4     Half-Life: Alyx         30.1             Adults  55.49  PlayStation   

  Requires Special Device   Developer        Publisher  Release Year  \
0                      No  Game Freak       Innersloth          2015   
1                      No    Nintendo  Electronic Arts          2015   
2                     Yes      Bungie           Capcom          2012   
3                     Yes  Game Freak         Nintendo          2015   
4                     Yes  Game Freak       Epic Games          2022   

       Genre Multiplayer  Game Length (Hours) Graphics Quality  \
0  Adventure          No              

# Exploración de Datos

In [3]:
#Completitud

df.isnull().sum()

print ( df.isnull().sum()/len(df)*100)


Game Title                 0.0
User Rating                0.0
Age Group Targeted         0.0
Price                      0.0
Platform                   0.0
Requires Special Device    0.0
Developer                  0.0
Publisher                  0.0
Release Year               0.0
Genre                      0.0
Multiplayer                0.0
Game Length (Hours)        0.0
Graphics Quality           0.0
Soundtrack Quality         0.0
Story Quality              0.0
User Review Text           0.0
Game Mode                  0.0
Min Number of Players      0.0
dtype: float64


In [4]:
df.shape

(47774, 18)

No hay datos vacios o nulos

## Unicidad

In [6]:
df.duplicated(keep=False).sum()


0

No hay ninguno duplicado

## Consistencia

In [8]:
import pandas as pd
import numpy as np
import re, unicodedata
from datetime import datetime

# ========= Config =========
PATH = "video_game_reviews.csv"   # cambia si es otra ruta

# Posibles nombres de columnas (ajusta si tu CSV usa otros)
COL_TITLE    = next((c for c in ["Game Title","title","game","name"] if c in pd.read_csv(PATH, nrows=0).columns), None)
COL_REVIEW   = next((c for c in ["Review Text","review_text","review","text"] if c in pd.read_csv(PATH, nrows=0).columns), None)
COL_RATING   = next((c for c in ["User Rating","rating","score","user_score"] if c in pd.read_csv(PATH, nrows=0).columns), None)
COL_PLATFORM = next((c for c in ["Platform","platform"] if c in pd.read_csv(PATH, nrows=0).columns), None)
COL_GENRE    = next((c for c in ["Genre","genre"] if c in pd.read_csv(PATH, nrows=0).columns), None)
COL_YEAR     = next((c for c in ["Release Year","year","released_year"] if c in pd.read_csv(PATH, nrows=0).columns), None)
COL_PUB      = next((c for c in ["Publisher","publisher"] if c in pd.read_csv(PATH, nrows=0).columns), None)
COL_DEV      = next((c for c in ["Developer","developer"] if c in pd.read_csv(PATH, nrows=0).columns), None)

df = pd.read_csv(PATH)

# ========= Helpers =========
def fix_mojibake(s):
    if not isinstance(s, str): return s
    if any(ch in s for ch in ["Ã","â","Â","ð"]):
        try:
            return s.encode("latin1").decode("utf-8")
        except Exception:
            return s
    return s

def strip_accents(s):
    if not isinstance(s, str): return s
    return unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("utf-8")

def norm_title(s):
    if not isinstance(s, str): return None
    s = fix_mojibake(s)
    s = strip_accents(s).lower()
    s = re.sub(r"\s*\(.*?\)", "", s)
    s = re.sub(r"[^a-z0-9]+", " ", s)
    s = re.sub(r"\b(edition|remastered|hd|ultimate|complete|definitive|remake|collection)\b", "", s)
    return re.sub(r"\s+", " ", s).strip()

today_year = datetime.now().year

KNOWN_PLATFORMS = {
    # Consolas y familias comunes (no exhaustivo)
    "pc","mac","linux","ios","android",
    "nes","snes","n64","gamecube","wii","wii u","nintendo switch","switch","ds","3ds","game boy","gba",
    "playstation","ps","ps1","ps2","ps3","ps4","ps5","psp","ps vita","vita",
    "xbox","xbox 360","x360","xbox one","xbox series x","xbox series s","xbox series s/x","xsx","xss",
    "atari 2600","2600","dreamcast","saturn","mega drive","genesis","amiga"
}

KAGGLE_GENRES = {
    "action","sports","role-playing","rpg","racing","shooter","adventure",
    "fighting","platform","platformer","misc","simulation","strategy","puzzle"
}

# ========= Títulos / Encoding =========
if COL_TITLE:
    df["_title_fixed"] = df[COL_TITLE].apply(fix_mojibake)
    df["_title_norm"]  = df["_title_fixed"].apply(norm_title)
    df["_title_mojibake"] = df[COL_TITLE].astype(str).str.contains("Ã|â|Â|ð", regex=True, na=False)

# ========= Ratings: escala y rangos =========
if COL_RATING:
    r = pd.to_numeric(df[COL_RATING], errors="coerce")
    df["_rating_raw"] = r

    # detectar escala probable
    r_min, r_max = r.min(skipna=True), r.max(skipna=True)
    if pd.notna(r_max):
        if r_max <= 5:      scale = 5
        elif r_max <= 10:   scale = 10
        elif r_max <= 100:  scale = 100
        else:               scale = None
    else:
        scale = None

    # flags de fuera de rango según escala detectada
    if scale == 5:
        df["_rating_out_of_range"] = (r < 0) | (r > 5)
    elif scale == 10:
        df["_rating_out_of_range"] = (r < 0) | (r > 10)
    elif scale == 100:
        df["_rating_out_of_range"] = (r < 0) | (r > 100)
    else:
        df["_rating_out_of_range"] = pd.NA  # escala desconocida

# ========= Plataformas =========
def platform_ok(val):
    if not isinstance(val, str): return pd.NA
    txt = strip_accents(val).lower()
    # permitir múltiples separadas por coma
    items = [x.strip() for x in re.split(r"[,/]| and ", txt) if x.strip()]
    if not items: return pd.NA
    # si cualquiera pertenece al dominio conocido, lo consideramos OK
    return any(x in KNOWN_PLATFORMS for x in items)

if COL_PLATFORM:
    df["_platform_valid"] = df[COL_PLATFORM].apply(platform_ok)

# ========= Género =========
def genre_ok(val):
    if not isinstance(val, str): return pd.NA
    g = strip_accents(val).lower().strip()
    # permitir que 'rpg' valide 'role-playing' y viceversa
    if g == "rpg": g = "role-playing"
    if g == "platform": g = "platformer"  # ajuste común Kaggle
    return g in KAGGLE_GENRES

if COL_GENRE:
    df["_genre_valid"] = df[COL_GENRE].apply(genre_ok)

# ========= Año de lanzamiento =========
if COL_YEAR:
    y = pd.to_numeric(df[COL_YEAR], errors="coerce")
    df["_year_invalid"]   = y.isna()
    df["_year_too_old"]   = y < 1970
    df["_year_in_future"] = y > today_year

# ========= Texto de reseña (longitud básica) =========
if COL_REVIEW:
    df["_review_empty"] = df[COL_REVIEW].astype(str).str.strip().eq("") | df[COL_REVIEW].isna()
    df["_review_short"] = df[COL_REVIEW].astype(str).str.len().fillna(0) < 20  # umbral simple



# ========= Resumen en consola =========
def count_true(s):
    return int(s.fillna(False).sum()) if s is not None else 0

print("=== RESUMEN DE INCONSISTENCIAS (sin corregir) ===")
if COL_TITLE:
    print("Mojibake en títulos:", count_true(df["_title_mojibake"]))
if COL_RATING:
    print("Escala detectada para rating:", scale, "| fuera de rango:", count_true(df["_rating_out_of_range"]))
if COL_PLATFORM:
    print("Plataformas no válidas:", int((df["_platform_valid"]==False).sum(skipna=True)))
if COL_GENRE:
    print("Géneros no válidos:", int((df["_genre_valid"]==False).sum(skipna=True)))
if COL_YEAR:
    print("Año inválido (NaN):", count_true(df["_year_invalid"]))
    print("Año < 1970       :", count_true(df["_year_too_old"]))
    print("Año en futuro    :", count_true(df["_year_in_future"]))
if COL_REVIEW:
    print("Reseña vacía:", count_true(df["_review_empty"]))
    print("Reseña corta:", count_true(df["_review_short"]))



=== RESUMEN DE INCONSISTENCIAS (sin corregir) ===
Mojibake en títulos: 0
Escala detectada para rating: 100 | fuera de rango: 0
Plataformas no válidas: 9589
Géneros no válidos: 4748
Año inválido (NaN): 0
Año < 1970       : 0
Año en futuro    : 0


Durante la evaluación del dataset de reseñas (“Video Game Reviews and Ratings”, Kaggle) se identificaron
inconsistencias relevantes en **plataformas**, **géneros** y **atribución de publishers/developers**, además de
diferencias de taxonomía respecto a nuestro dataset principal. Dado que estas discrepancias afectan la validez
en el contexto del problema (ventas históricas y metadatos confiables por título), **se decidió no integrar
ese dataset de reseñas en el análisis principal**.

En su lugar, trabajaremos con **Kaggle vgsales** (ventas históricas) —combinado con los metadatos de **RAWG**
que ya normalizamos—, por ser fuentes **más estables y coherentes** para:
- **Hechos de negocio:** `Global_Sales` (recalculado desde regionales) y ventas por región.
- **Dimensiones clave:** `Genre` (Kaggle) y `platforms_names` (RAWG) ya estandarizadas.

## Validez

In [10]:
# ============================================================
# Validez según diccionario – video_game_reviews (solo valida)
# No corrige datos: crea banderas por columna y un resumen.
# ============================================================
import pandas as pd
import numpy as np
import re, unicodedata
from datetime import datetime

PATH = "video_game_reviews.csv"   # ajusta si es otra ruta
df = pd.read_csv(PATH)

# -------- Helpers ----------
def strip_accents(s):
    if not isinstance(s, str): return s
    return unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("utf-8")

def is_mojibake(s: str) -> bool:
    if not isinstance(s, str): return False
    return bool(re.search(r"[ÃâÂð]", s))

def to_num(s):
    return pd.to_numeric(s, errors="coerce")

def norm_lower(s):
    return str(s).strip().lower() if isinstance(s, str) else s

def split_multi(val):
    # separa por coma, slash o 'and'
    if not isinstance(val, str): return []
    return [x.strip() for x in re.split(r"[,/]| and ", val) if x.strip()]

# -------- Mapeo flexible de columnas ----------
cols = df.columns.str.lower()
def pick(*cands):
    for c in cands:
        if c.lower() in cols: 
            return df.columns[cols.get_loc(c.lower())]
    return None

COL_TITLE    = pick("game title","title","game","name")
COL_RATING   = pick("user rating","rating","score","user_score")
COL_AGE      = pick("age group targeted","age group","age_group")
COL_PRICE    = pick("price","price_usd")
COL_PLATFORM = pick("platform","platforms","platform(s)")
COL_REQ_DEV  = pick("requires special device","requires device","special device","vr required","vr_required")
COL_DEV      = pick("developer")
COL_PUB      = pick("publisher")
COL_YEAR     = pick("release year","year","released_year")
COL_GENRE    = pick("genre")

# -------- 1) Game Title ----------
if COL_TITLE:
    df["_title_null"]     = df[COL_TITLE].isna() | (df[COL_TITLE].astype(str).str.strip()=="")
    df["_title_mojibake"] = df[COL_TITLE].astype(str).apply(is_mojibake)
    # Duplicados exactos de título (indicativo, no necesariamente inválido)
    df["_title_dup"]      = df.duplicated(subset=[COL_TITLE], keep=False)
else:
    df["_title_null"] = df["_title_mojibake"] = df["_title_dup"] = pd.NA

# -------- 2) User Rating (0–10) ----------
if COL_RATING:
    r = to_num(df[COL_RATING])
    df["_rating_null"] = r.isna()
    df["_rating_out"]  = (r < 0) | (r > 100)
else:
    df["_rating_null"] = df["_rating_out"] = pd.NA

# -------- 3) Age Group Targeted (Kids/Teens/Adults/All Ages) ----------
AGE_ALLOWED = {"kids","teens","adults","all ages"}
if COL_AGE:
    age_norm = df[COL_AGE].astype(str).str.strip().str.lower()
    # normaliza variantes comunes
    age_norm = (age_norm.replace({
        "kid":"kids", "teen":"teens", "adult":"adults", "all-age":"all ages",
        "allages":"all ages", "all":"all ages"
    }))
    df["_age_invalid"] = ~age_norm.isin(AGE_ALLOWED)
else:
    df["_age_invalid"] = pd.NA

# -------- 4) Price (USD >=0, razonable <= 500) ----------
if COL_PRICE:
    p = to_num(df[COL_PRICE])
    df["_price_null"]   = p.isna()
    df["_price_neg"]    = p < 0
    df["_price_high"]   = p > 500   # umbral conservador para videojuegos
else:
    df["_price_null"] = df["_price_neg"] = df["_price_high"] = pd.NA

# -------- 5) Platform (en catálogo conocido) ----------
# Catálogo base; amplía si lo necesitas
PLAT_ALLOWED = {
    "pc","windows","mac","macos","linux",
    "playstation","ps","ps1","ps2","ps3","ps4","ps5","psp","ps vita","vita",
    "xbox","xbox 360","x360","xbox one","xbox series x","xbox series s","xbox series s/x","series x","series s",
    "nintendo switch","switch","wii u","wii","nintendo 3ds","3ds","nintendo ds","ds",
    "gamecube","gc","n64","nintendo 64","snes","super nintendo","nes","famicom",
    "game boy","gb","game boy color","gbc","game boy advance","gba",
    "android","ios","mobile","smartphone",
    "dreamcast","saturn","genesis","mega drive","atari 2600","2600","amiga"
}
if COL_PLATFORM:
    plat_norm = df[COL_PLATFORM].astype(str).map(strip_accents).str.lower()
    def plat_ok(v):
        items = split_multi(v)
        if not items: return pd.NA
        return any(x in PLAT_ALLOWED for x in items)
    df["_platform_invalid"] = ~plat_norm.apply(plat_ok).fillna(True)  # sin dato -> inválido
else:
    df["_platform_invalid"] = pd.NA

# -------- 6) Requires Special Device (Yes/No) ----------
YES_NO_ALLOWED = {"yes","no"}
YES_NO_MAP = {
    "y":"yes","n":"no","true":"yes","false":"no","1":"yes","0":"no"
}
if COL_REQ_DEV:
    req_norm = df[COL_REQ_DEV].astype(str).str.strip().str.lower()
    req_norm = req_norm.replace(YES_NO_MAP)
    df["_requires_invalid"] = ~req_norm.isin(YES_NO_ALLOWED)
else:
    df["_requires_invalid"] = pd.NA

# -------- 7) Developer ----------
if COL_DEV:
    dev_norm = df[COL_DEV].astype(str).str.strip().str.lower()
    df["_developer_invalid"] = dev_norm.isin({"", "nan", "none", "unknown", "n/a"})
else:
    df["_developer_invalid"] = pd.NA

# -------- 8) Publisher ----------
if COL_PUB:
    pub_norm = df[COL_PUB].astype(str).str.strip().str.lower()
    df["_publisher_invalid"] = pub_norm.isin({"", "nan", "none", "unknown", "n/a"})
else:
    df["_publisher_invalid"] = pd.NA

# -------- 9) Release Year (1970..present) ----------
this_year = datetime.now().year
if COL_YEAR:
    y = to_num(df[COL_YEAR])
    df["_year_null"]    = y.isna()
    df["_year_before"]  = y < 1970
    df["_year_future"]  = y > this_year
else:
    df["_year_null"] = df["_year_before"] = df["_year_future"] = pd.NA

# -------- 10) Genre (en catálogo estándar) ----------
GEN_ALLOWED = {
    "action","adventure","sports","puzzle","rpg","role-playing","racing","shooter",
    "fighting","platform","platformer","simulation","strategy","misc"
}
GEN_EQUIV = {
    "role playing":"role-playing",
    "roleplaying":"role-playing",
    "platform":"platformer",
    "indie":"misc","casual":"misc","family":"misc","party":"misc","music":"misc","rhythm":"misc","arcade":"misc","board":"misc"
}
if COL_GENRE:
    g = df[COL_GENRE].astype(str).map(strip_accents).str.lower().str.strip()
    g = g.replace(GEN_EQUIV)
    df["_genre_invalid"] = ~g.isin(GEN_ALLOWED)
else:
    df["_genre_invalid"] = pd.NA

# -------- Resumen ----------
def cnt(s): 
    try: return int(s.fillna(False).sum())
    except: return 0

print("=== RESUMEN DE VALIDEZ (según diccionario) ===")
print(f"Título nulo/ vacío          : {cnt(df.get('_title_null'))}")
print(f"Título mojibake             : {cnt(df.get('_title_mojibake'))}")
print(f"Rating nulo                 : {cnt(df.get('_rating_null'))}")
print(f"Rating fuera de [0,10]      : {cnt(df.get('_rating_out'))}")
print(f"Age Group fuera de catálogo : {cnt(df.get('_age_invalid'))}")
print(f"Price nulo                  : {cnt(df.get('_price_null'))}")
print(f"Price negativo              : {cnt(df.get('_price_neg'))}")
print(f"Price > 500 USD             : {cnt(df.get('_price_high'))}")
print(f"Platform inválida           : {cnt(df.get('_platform_invalid'))}")
print(f"RequiresDevice inválido     : {cnt(df.get('_requires_invalid'))}")
print(f"Developer inválido          : {cnt(df.get('_developer_invalid'))}")
print(f"Publisher inválido          : {cnt(df.get('_publisher_invalid'))}")
print(f"Year nulo                   : {cnt(df.get('_year_null'))}")
print(f"Year < 1970                 : {cnt(df.get('_year_before'))}")
print(f"Year futuro                 : {cnt(df.get('_year_future'))}")
print(f"Genre fuera de catálogo     : {cnt(df.get('_genre_invalid'))}")

# -------- Guardar con banderas ----------
OUT = "video_game_reviews_validated.csv"
df.to_csv(OUT, index=False, encoding="utf-8")
print(f"\nArchivo con banderas de validez -> {OUT}")


=== RESUMEN DE VALIDEZ (según diccionario) ===
Título nulo/ vacío          : 0
Título mojibake             : 0
Rating nulo                 : 0
Rating fuera de [0,10]      : 0
Age Group fuera de catálogo : 0
Price nulo                  : 0
Price negativo              : 0
Price > 500 USD             : 0
Platform inválida           : 0
RequiresDevice inválido     : 0
Developer inválido          : 0
Publisher inválido          : 0
Year nulo                   : 0
Year < 1970                 : 0
Year futuro                 : 0
Genre fuera de catálogo     : 0

Archivo con banderas de validez -> video_game_reviews_validated.csv


## Evaluación de Validez – Dataset `video_game_reviews`

### Resultados de validación según el diccionario de datos

- **Game Title**  
  No hay valores nulos, vacíos ni problemas de codificación. Los títulos son legibles y consistentes.

- **User Rating**   
  Todos los valores se encuentran dentro del rango definido (0 a 10). No se detectaron ratings inválidos.

- **Age Group Targeted**   
  Todas las observaciones están en las categorías esperadas (`Kids`, `Teens`, `Adults`, `All Ages`).

- **Price**  
  Ningún precio es nulo, negativo o excesivamente alto (>500 USD). Los valores son razonables y válidos.

- **Platform**   
  Todas las plataformas coinciden con el catálogo permitido. No hay valores como `Unknown` o `N/A`.

- **Requires Special Device**   
  Solo aparecen los valores válidos (`Yes` / `No`), sin abreviaciones ni variaciones.

- **Developer**   
  No hay valores vacíos ni genéricos. Los nombres son legibles y válidos.

- **Publisher**   
  No existen casos `Unknown` o `N/A`. Las compañías están correctamente representadas.

- **Release Year**   
  Ningún año es nulo, anterior a 1970 o en el futuro. Todos los valores son válidos.

- **Genre**   
  Todos los géneros corresponden al catálogo estándar (`Action`, `Adventure`, `Sports`, `Puzzle`, `RPG`, etc.).

---

### 📌 Conclusión
El dataset **`video_game_reviews`** cumple completamente con los criterios de **validez** definidos en su diccionario de datos.  
No se identificaron valores fuera de rango, categorías inválidas ni campos faltantes.  
Puede ser utilizado directamente para análisis sin necesidad de limpieza adicional en esta dimensión.
