## Carga de Datos 

In [2]:
import pandas as pd

import os
import numpy as np
import pandas as pd
from sklearn import tree
import sklearn as sklearn

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # for 3D plots
import seaborn as sns; sns.set()

df_kaggle = pd.read_csv("vgsales.csv")
print(df_kaggle.head())


   Rank                      Name Platform    Year         Genre Publisher  \
0     1                Wii Sports      Wii  2006.0        Sports  Nintendo   
1     2         Super Mario Bros.      NES  1985.0      Platform  Nintendo   
2     3            Mario Kart Wii      Wii  2008.0        Racing  Nintendo   
3     4         Wii Sports Resort      Wii  2009.0        Sports  Nintendo   
4     5  Pokemon Red/Pokemon Blue       GB  1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  
0     41.49     29.02      3.77         8.46         82.74  
1     29.08      3.58      6.81         0.77         40.24  
2     15.85     12.88      3.79         3.31         35.82  
3     15.75     11.01      3.28         2.96         33.00  
4     11.27      8.89     10.22         1.00         31.37  


In [62]:
import os, time, math, requests, pandas as pd
from pandas import json_normalize

API_KEY = "601b475a2c3342c1860c7f432da840ca"          # <-- pon tu key
TARGET_N = 17000                # apunta un poco arriba de 16k por pérdidas al limpiar
PAGE_SIZE = 40                  # máximo permitido por RAWG
SLEEP_BASE = 0.8                # espera entre llamadas (ajústalo si ves 429)
OUT_EVERY = 1000                # guarda en disco cada N filas
OUT_CSV = "rawg_games_raw.csv"  # salida incremental
FINAL_CSV = "rawg_games_clean.csv"

def fetch_page(page: int, ordering="-added", dates=None):
    """
    ordering: ver docs RAWG; '-added' o '-rating' suelen devolver populares primero.
    dates: '2000-01-01,2025-12-31' para acotar (opcional).
    """
    params = {
        "key": API_KEY,
        "page": page,
        "page_size": PAGE_SIZE,
        "ordering": ordering
    }
    if dates:
        params["dates"] = dates

    # reintentos simples con backoff si hay 429/5xx
    for attempt in range(6):
        r = requests.get("https://api.rawg.io/api/games", params=params, timeout=30)
        if r.status_code == 200:
            return r.json()
        if r.status_code in (429, 500, 502, 503, 504):
            wait = SLEEP_BASE * (2 ** attempt)
            time.sleep(wait)
        else:
            r.raise_for_status()
    raise RuntimeError(f"Fallo permanente en página {page}: {r.status_code} {r.text[:200]}")

# --- Descarga paginada con guardado incremental ---
all_rows = []
seen_ids = set()
total_fetched = 0
page = 1

# Si ya existe un CSV previo, retoma desde ahí
if os.path.exists(OUT_CSV):
    prev = pd.read_csv(OUT_CSV)
    seen_ids = set(prev["id"].tolist())
    total_fetched = len(prev)
    print(f"Retomando: {total_fetched} filas ya guardadas.")
    all_rows = prev.to_dict(orient="records")

while total_fetched < TARGET_N:
    data = fetch_page(page, ordering="-added")   # puedes usar también "-rating" o "-metacritic"
    results = data.get("results", [])
    if not results:
        print("No hay más resultados (next=None).")
        break

    # filtra duplicados por id
    new = [row for row in results if row["id"] not in seen_ids]
    for row in new:
        seen_ids.add(row["id"])
        all_rows.append(row)

    total_fetched = len(all_rows)
    print(f"Página {page} | nuevos={len(new)} | total={total_fetched}")

    # guardado incremental
    if total_fetched % OUT_EVERY < PAGE_SIZE:  # aprox cada OUT_EVERY
        pd.DataFrame(all_rows).to_csv(OUT_CSV, index=False)
        print(f"Guardado parcial -> {OUT_CSV}")

    page += 1
    time.sleep(SLEEP_BASE)  # cortesía para no saturar

# guardado final crudo
pd.DataFrame(all_rows).to_csv(OUT_CSV, index=False)
print(f"Descarga cruda completa: {len(all_rows)} filas -> {OUT_CSV}")

# --- Normalización y limpieza (aplanar columnas útiles) ---
df = pd.read_csv(OUT_CSV)

# json_normalize maneja listas/dict si cargamos desde objetos; si cargaste desde csv,
# vuelve a leer desde 'all_rows' o re-normaliza antes de guardar. Aquí rehacemos:
df_norm = json_normalize(all_rows, max_level=1)

# columnas derivadas legibles
def join_list(objs, path_names):
    if isinstance(objs, list):
        try:
            if path_names == "platforms":
                return ", ".join(o["platform"]["name"] for o in objs)
            if path_names == "genres":
                return ", ".join(o["name"] for o in objs)
            if path_names == "stores":
                return ", ".join(o["store"]["name"] for o in objs)
        except Exception:
            return None
    return None

df_norm["platforms_names"] = df_norm["platforms"].apply(lambda x: join_list(x, "platforms"))
df_norm["genres_names"]     = df_norm["genres"].apply(lambda x: join_list(x, "genres"))
df_norm["stores_names"]     = df_norm["stores"].apply(lambda x: join_list(x, "stores"))

# selecciona columnas clave
cols = [
    "id","slug","name","released","rating","ratings_count","metacritic","playtime",
    "added","suggestions_count","updated","platforms_names","genres_names","stores_names",
    "background_image","esrb_rating.name"
]
cols = [c for c in cols if c in df_norm.columns]
df_rawg = df_norm[cols].copy()

# elimina duplicados (por id y por nombre por si acaso)
df_rawg = df_rawg.drop_duplicates(subset=["id"])
df_rawg = df_rawg.drop_duplicates(subset=["name"])  # opcional

# guarda limpio
df_rawg.to_csv(FINAL_CSV, index=False)
print(f"Limpio listo: {len(df_rawg)} filas -> {FINAL_CSV}")





Página 1 | nuevos=40 | total=40
Página 2 | nuevos=40 | total=80
Página 3 | nuevos=40 | total=120
Página 4 | nuevos=40 | total=160
Página 5 | nuevos=40 | total=200
Página 6 | nuevos=40 | total=240
Página 7 | nuevos=40 | total=280
Página 8 | nuevos=40 | total=320
Página 9 | nuevos=40 | total=360
Página 10 | nuevos=40 | total=400
Página 11 | nuevos=40 | total=440
Página 12 | nuevos=40 | total=480
Página 13 | nuevos=40 | total=520
Página 14 | nuevos=40 | total=560
Página 15 | nuevos=40 | total=600
Página 16 | nuevos=40 | total=640
Página 17 | nuevos=40 | total=680
Página 18 | nuevos=40 | total=720
Página 19 | nuevos=40 | total=760
Página 20 | nuevos=40 | total=800
Página 21 | nuevos=40 | total=840
Página 22 | nuevos=40 | total=880
Página 23 | nuevos=40 | total=920
Página 24 | nuevos=40 | total=960
Página 25 | nuevos=40 | total=1000
Guardado parcial -> rawg_games_raw.csv
Página 26 | nuevos=40 | total=1040
Página 27 | nuevos=40 | total=1080
Página 28 | nuevos=40 | total=1120
Página 29 | nuev

In [63]:
print("RAWG cols:", df_rawg.columns.tolist())
print(df_rawg[["name","platforms_names","genres_names","stores_names"]].head(3))
print(df_rawg[["platforms_names","genres_names","stores_names"]].isna().mean() * 100)


RAWG cols: ['id', 'slug', 'name', 'released', 'rating', 'ratings_count', 'metacritic', 'playtime', 'added', 'suggestions_count', 'updated', 'platforms_names', 'genres_names', 'stores_names', 'background_image', 'esrb_rating.name']
                       name  \
0        Grand Theft Auto V   
1  The Witcher 3: Wild Hunt   
2                  Portal 2   

                                     platforms_names     genres_names  \
0  PC, PlayStation 5, Xbox Series S/X, PlayStatio...           Action   
1  PlayStation 5, Xbox Series S/X, macOS, PlaySta...      Action, RPG   
2  PlayStation 3, PC, Xbox 360, Linux, macOS, Xbo...  Shooter, Puzzle   

                                        stores_names  
0  Steam, PlayStation Store, Epic Games, Xbox 360...  
1  GOG, PlayStation Store, Steam, Xbox Store, Nin...  
2  Xbox Store, Steam, PlayStation Store, Xbox 360...  
platforms_names    0.0
genres_names       0.0
stores_names       0.0
dtype: float64


In [None]:
from rapidfuzz import process, fuzz
import pandas as pd
import re, unicodedata

# --- Función para normalizar nombres ---
def normalize_name(s: str) -> str | None:
    if pd.isna(s): 
        return None
    # quitar acentos/tildes
    s = unicodedata.normalize("NFKD", str(s)).encode("ascii", "ignore").decode("ascii")
    # minúsculas
    s = s.lower()
    # eliminar texto entre paréntesis
    s = re.sub(r"\s*\(.*?\)", "", s)
    # eliminar palabras comunes irrelevantes
    s = re.sub(r"\b(edition|remastered|hd|ultimate|complete|definitive|remake|collection)\b", "", s)
    # quitar caracteres no alfanuméricos
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

# --- Crear columnas normalizadas ---
df_kaggle["name_norm"] = df_kaggle["Name"].apply(normalize_name)
df_rawg["name_norm"]   = df_rawg["name"].apply(normalize_name)

# --- Matching usando la versión normalizada ---
rawg_names = df_rawg["name_norm"].tolist()
matches = []

for i, name in enumerate(df_kaggle["name_norm"]):
    match, score, idx = process.extractOne(name, rawg_names, scorer=fuzz.token_sort_ratio)

    kaggle_row = df_kaggle.iloc[i].to_dict()
    rawg_row   = df_rawg.iloc[idx].to_dict()

    combined = {**kaggle_row, **rawg_row, "match_score": score}
    matches.append(combined)

# Crear DataFrame final
df_matches = pd.DataFrame(matches)





In [83]:
# Guardar CSV
df_matches.to_csv("data.csv", index=False, encoding="utf-8")
print(df_matches.columns.tolist())

['Rank', 'Name', 'Platform', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'id', 'slug', 'released', 'rating', 'ratings_count', 'metacritic', 'playtime', 'added', 'suggestions_count', 'updated', 'platforms_names', 'genres_names', 'stores_names', 'background_image', 'esrb_rating.name', 'match_score']


In [3]:
df = pd.read_csv("data.csv")

# Exploración de Datos

## Completitud

In [4]:
#Completitud

df.isnull().sum()

print ( df.isnull().sum()/len(df)*100)


Rank                  0.000000
Name                  0.000000
Platform              0.000000
Genre                 0.000000
Publisher             0.349440
NA_Sales              0.000000
EU_Sales              0.000000
JP_Sales              0.000000
Other_Sales           0.000000
Global_Sales          0.000000
id                    0.000000
slug                  0.000000
released              1.608628
rating                0.000000
ratings_count         0.000000
metacritic           54.289673
playtime              0.000000
added                 0.000000
suggestions_count     0.000000
updated               0.000000
platforms_names       0.000000
genres_names          1.331486
stores_names          9.230028
background_image      0.234968
esrb_rating.name     45.372936
match_score           0.000000
dtype: float64


Eliminar las columnas metacritic ya que más del 50% de los datos estan faltando por ende si se llegará a imputar el promedio o la media se generaría un sesgo sobre estos. Asimismo, se decide eliminar esrb_rating.name. Para el resto de faltantes se dicide eliminar las filas que no tengan algun dato, ya que la cantidad es mínima.

In [5]:
df.shape

(16598, 26)

## Unicidad

In [7]:
#Unicidad
# 1) Duplicados exactos de filas completas
dup_mask_all = df.duplicated(keep=False)
df.loc[dup_mask_all].head()




Unnamed: 0,Rank,Name,Platform,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,...,playtime,added,suggestions_count,updated,platforms_names,genres_names,stores_names,background_image,esrb_rating.name,match_score


No hay duplicados por ende no es necesario eliminar ninguna fila.

In [8]:

print(df.dtypes)

Rank                   int64
Name                  object
Platform              object
Genre                 object
Publisher             object
NA_Sales             float64
EU_Sales             float64
JP_Sales             float64
Other_Sales          float64
Global_Sales         float64
id                     int64
slug                  object
released              object
rating               float64
ratings_count          int64
metacritic           float64
playtime               int64
added                  int64
suggestions_count      int64
updated               object
platforms_names       object
genres_names          object
stores_names          object
background_image      object
esrb_rating.name      object
match_score          float64
dtype: object


## Consistencia

In [11]:
# ================================
# Consistency checks (SIN corregir ni modificar el dataset)
# ================================
import pandas as pd
from datetime import datetime, timezone

# Cargar (ajusta ruta si ya tienes df en memoria)
df = pd.read_csv("data.csv")

# ----------------------------
# 1) FECHAS: released & updated
# ----------------------------
today = pd.Timestamp(datetime.now(tz=timezone.utc).date())

released_dt = pd.to_datetime(df.get("released"), errors="coerce", utc=True)
updated_dt  = pd.to_datetime(df.get("updated"),  errors="coerce", utc=True)

mask_released_invalid   = released_dt.isna() & df.get("released").notna()
mask_released_too_old   = released_dt.dt.year.lt(1970)    # < 1970
mask_released_in_future = released_dt.dt.date.gt(today.date())
mask_updated_before_rel = (updated_dt < released_dt) & updated_dt.notna() & released_dt.notna()

print("=== FECHAS ===")
print("released inválida  :", int(mask_released_invalid.sum()))
print("released < 1970    :", int(mask_released_too_old.sum()))
print("released en futuro :", int(mask_released_in_future.sum()))
print("updated < released :", int(mask_updated_before_rel.sum()))
print(df.loc[
    mask_released_invalid | mask_released_too_old | mask_released_in_future | mask_updated_before_rel,
    [c for c in ["Name","released","updated"] if c in df.columns]
].head(10), "\n")

# -----------------------------------------------
# 2) VENTAS: NA + EU + JP + Other ≈ Global_Sales
# -----------------------------------------------
sales_cols = ["NA_Sales","EU_Sales","JP_Sales","Other_Sales","Global_Sales"]
if all(c in df.columns for c in sales_cols):
    TOL = 0.01  # 0.01 millones ≈ 10k unidades
    reg_sum = (
        df["NA_Sales"].fillna(0) + df["EU_Sales"].fillna(0) +
        df["JP_Sales"].fillna(0) + df["Other_Sales"].fillna(0)
    )
    diff = (reg_sum - df["Global_Sales"].fillna(0)).abs()
    mask_incoh = diff > TOL

    print("=== VENTAS ===")
    print(f"Inconsistencias (> {TOL}): {int(mask_incoh.sum())} de {len(df)} ({mask_incoh.mean()*100:.2f}%)")
    print(df.loc[mask_incoh, ["Name","Platform"] + sales_cols].assign(_sales_diff=diff[mask_incoh]).head(10), "\n")

# ---------------------------------------------------------
# 3) PLATAFORMAS: Platform (Kaggle) vs platforms_names (RAWG)
# ---------------------------------------------------------
PLAT_MAP = {
    "PS":"PlayStation","PS2":"PlayStation 2","PS3":"PlayStation 3","PS4":"PlayStation 4","PS5":"PlayStation 5",
    "X360":"Xbox 360","XB":"Xbox","XOne":"Xbox One","XS":"Xbox Series S/X",
    "Wii":"Wii","WiiU":"Wii U","NS":"Nintendo Switch","3DS":"Nintendo 3DS","DS":"Nintendo DS",
    "GB":"Game Boy","GBA":"Game Boy Advance","GC":"GameCube","N64":"Nintendo 64",
    "PC":"PC","MAC":"macOS","SNES":"SNES","NES":"NES","PSP":"PSP","PSV":"PS Vita"
}

if "Platform" in df.columns and "platforms_names" in df.columns:
    plat_expected = df["Platform"].astype(str).map(lambda x: PLAT_MAP.get(x, x))
    platforms_txt = df["platforms_names"].astype(str)

    def contains_platform(expected, txt):
        if pd.isna(expected) or not isinstance(txt, str): 
            return pd.NA
        return expected in [p.strip() for p in txt.split(",")]

    platform_match = [contains_platform(e, t) for e, t in zip(plat_expected, platforms_txt)]
    platform_match = pd.Series(platform_match, index=df.index)

    pct_plat = platform_match.mean()*100 if platform_match.notna().any() else 0.0
    print("=== PLATAFORMAS ===")
    print(f"Match Platform ∈ platforms_names: {pct_plat:.2f}%")
    print(df.loc[platform_match==False, ["Name","Platform"]].assign(
        plat_expected=plat_expected[platform_match==False],
        platforms_names=platforms_txt[platform_match==False]
    ).head(10), "\n")

# -------------------------------------------------
# 4) GÉNERO: Genre (Kaggle) ∈ genres_names (RAWG)
# -------------------------------------------------
if "Genre" in df.columns and "genres_names" in df.columns:
    genre_k = df["Genre"].astype(str)
    genres_rawg = df["genres_names"].astype(str)

    def genre_in(gk, gr):
        if gk.lower() == "nan" or gr.lower() == "nan":
            return pd.NA
        return any(gk.strip().lower() == x.strip().lower() for x in gr.split(","))

    genre_match = [genre_in(gk, gr) for gk, gr in zip(genre_k, genres_rawg)]
    genre_match = pd.Series(genre_match, index=df.index)

    pct_g = genre_match.mean()*100 if genre_match.notna().any() else 0.0
    print("=== GÉNERO ===")
    print(f"Genre contenido en genres_names: {pct_g:.2f}%")
    print(df.loc[genre_match==False, ["Name","Genre","genres_names"]].head(10))


=== FECHAS ===
released inválida  : 0
released < 1970    : 0
released en futuro : 3
updated < released : 4
                Name    released              updated
6055       WWII Aces  2025-09-25  2025-09-20T01:00:52
11123    Crimson Sea  2026-03-31  2025-09-23T20:17:33
12019  Crimson Sea 2  2026-03-31  2025-09-23T20:17:33
14188  Crimson Tears  2026-03-31  2025-09-23T20:17:33 

=== VENTAS ===
Inconsistencias (> 0.01): 2625 de 16598 (15.82%)
                           Name Platform  NA_Sales  EU_Sales  JP_Sales  \
2                Mario Kart Wii      Wii     15.85     12.88      3.79   
4      Pokemon Red/Pokemon Blue       GB     11.27      8.89     10.22   
7                      Wii Play      Wii     14.03      9.20      2.93   
8     New Super Mario Bros. Wii      Wii     14.59      7.06      4.70   
10                   Nintendogs       DS      9.07     11.00      1.93   
12  Pokemon Gold/Pokemon Silver       GB      9.00      6.18      7.20   
18            Super Mario World     SNE



**Fechas**
- **Eliminar** registros con `released` en el futuro.  
- **Eliminar** registros donde `updated < released`.  
- Mantener únicamente juegos con fechas válidas y consistentes para análisis históricos.  

**Ventas**
- **Recalcular `Global_Sales`** como:  Global_Sales_clean = NA_Sales + EU_Sales + JP_Sales + Other_Sales
- Conservar la columna original (`Global_Sales`) solo como referencia.  
- Utilizar siempre `Global_Sales_clean` en los análisis para garantizar coherencia.  

**Plataformas**
- Usar **`platforms_names` de RAWG** como columna oficial de plataformas.  
- Conservar `Platform` de Kaggle únicamente como columna auxiliar para validaciones.  
- *(Opcional)* Crear un flag `platform_match` que indique coincidencias o discrepancias, pero usar solo RAWG en los análisis finales.  

**Género**
- Usar **`Genre` de Kaggle** como categoría principal (es más simple y representativa).  
- Mantener `genres_names` de RAWG como columna secundaria para enriquecer análisis avanzados (ej. juegos multigénero).  





## Validez



1. Ventas
- Los juegos más vendidos históricamente alcanzan entre **40 y 82 millones de copias** (*Wii Sports*, *Super Mario Bros.*).  
- **Verificaciones:**
  - `Global_Sales` ≤ 100M (umbral razonable).  
  - Ninguna venta regional (`NA_Sales`, `EU_Sales`, `JP_Sales`, `Other_Sales`) negativa.  
  - `Global_Sales` ≥ máximo de las ventas regionales.  

2. Fechas
- Primeros videojuegos comerciales desde **1970**.  
- RAWG incluye fechas futuras, pero deben corresponder a lanzamientos reales o próximos.  
- **Verificaciones:**
  - `released` ≥ 1970.  
  - `released` ≤ fecha actual (salvo *coming soon*).  
  - `updated` ≥ `released` (última actualización posterior al lanzamiento).  

3. Ratings
- `rating` en RAWG: escala **0–5**.  
- `metacritic`: escala **0–100**.  
- **Verificaciones:**
  - Ningún valor fuera de rango.  
  - Juegos con `ratings_count` alto pero `rating = 0` → sospechosos.  


4. Plataformas
- Plataformas deben corresponder a consolas reales: `NES`, `SNES`, `Wii`, `PS2`, `Xbox 360`, `PC`, etc.  
- **Verificaciones:**
  - `Platform` de Kaggle debe estar en un diccionario de abreviaturas válidas.  
  - `platforms_names` de RAWG no debe estar vacío (sobre todo en juegos modernos).  
  - Normalización de equivalencias (`PS` = `PlayStation`).  


5. Géneros
- Géneros comunes: *Action, Sports, RPG, Racing, Shooter, Adventure, Fighting, Platformer*.  
- **Verificaciones:**
  - `Genre` de Kaggle dentro del catálogo esperado.  
  - Evitar valores ambiguos como `"Misc"` o `"N/A"`.  
  - Validación cruzada: `Genre` de Kaggle ∈ `genres_names` de RAWG.  


6. Publisher / Developer
- Publishers legítimos: *Nintendo, EA, Ubisoft, Activision, Square Enix*.  
- **Verificaciones:**
  - Evitar valores `"Unknown"`, nulos o inconsistentes.  
  - Unificar duplicados (*EA* vs *Electronic Arts*).  
  - Revisar outliers de frecuencia (publishers que aparecen una sola vez).  


## 📌 Conclusión
La validez contextual asegura que los datos:
- **Respetan rangos realistas** (ventas, ratings, fechas).  
- **Mantienen coherencia histórica** (fechas plausibles).  
- **Usan catálogos válidos** (plataformas, géneros, publishers).  

Esto garantiza que los análisis y el modelo dimensional se basen en información confiable.


In [7]:
from IPython.display import display, HTML
import pandas as pd

# (Opcional) Ajustes de visualización
pd.set_option("display.max_rows", 200)      # aumenta si necesitas ver más filas
pd.set_option("display.max_columns", 200)   # aumenta si necesitas ver más columnas
pd.set_option("display.float_format", lambda x: f"{x:,.4f}")  # formato numérico

def mostrar_resumen(res):
    # 1) Info básica
    print("▶ Info básica")
    display(res["basic_info"])

    # 2) Perfil de columnas
    print("\n▶ Perfil de columnas")
    colprof = res["column_profile"]
    display(colprof if colprof.shape[0] <= 60 else colprof.head(60))

    # 3) Estadísticos numéricos
    print("\n▶ Estadísticos numéricos")
    numdesc = res["numeric_describe"]
    if not numdesc.empty:
        display(numdesc if numdesc.shape[0] <= 60 else numdesc.head(60))
    else:
        print("(No hay columnas numéricas)")

    # 4) Matriz de correlación (si aplica)
    print("\n▶ Matriz de correlación (numérica)")
    corr = res["correlation"]
    if isinstance(corr, pd.DataFrame) and not corr.empty:
        # (Opcional) Estilo de calor para leer mejor
        display(corr.style.background_gradient())
    else:
        print("(No hay suficientes columnas numéricas para correlación)")

    # 5) Top-10 categorías por columna no numérica
    print("\n▶ Top-10 categorías por columna no numérica")
    topk = res["categorical_top10"]
    if isinstance(topk, pd.DataFrame) and not topk.empty:
        display(topk if topk.shape[0] <= 100 else topk.head(100))
    else:
        print("(No hay columnas categóricas o están vacías)")

    # 6) Filas duplicadas
    print("\n▶ Filas duplicadas")
    display(res["duplicates"])

# Mostrar en la celda:
mostrar_resumen(resumen)




▶ Info básica


Unnamed: 0,dataset,rows,columns,memory_MB
0,VideoGames,16598,26,16.276



▶ Perfil de columnas


Unnamed: 0,column,dtype,nunique,missing,missing_%,examples
0,Rank,int64,,,,"1, 2, 3"
1,Name,object,,,,"Wii Sports, Super Mario Bros., Mario Kart Wii"
2,Platform,object,,,,"Wii, NES, Wii"
3,Genre,object,,,,"Sports, Platform, Racing"
4,Publisher,object,,,,"Nintendo, Nintendo, Nintendo"
5,NA_Sales,float64,,,,"41.49, 29.08, 15.85"
6,EU_Sales,float64,,,,"29.02, 3.58, 12.88"
7,JP_Sales,float64,,,,"3.77, 6.81, 3.79"
8,Other_Sales,float64,,,,"8.46, 0.77, 3.31"
9,Global_Sales,float64,,,,"82.74, 40.24, 35.82"



▶ Estadísticos numéricos


Unnamed: 0,column,count,mean,std,min,25%,50%,75%,max,IQR,outliers_IQR,skew,kurtosis,zero_%
0,Rank,16598.0,8300.6053,4791.8539,1.0,4151.25,8300.5,12449.75,16600.0,8298.5,0,0.0001,-1.1999,0.0
1,NA_Sales,16598.0,0.2647,0.8167,0.0,0.0,0.08,0.24,41.49,0.24,1681,18.7996,649.1303,27.11
2,EU_Sales,16598.0,0.1467,0.5054,0.0,0.0,0.02,0.11,29.02,0.11,2081,18.8755,756.0278,34.52
3,JP_Sales,16598.0,0.0778,0.3093,0.0,0.0,0.0,0.04,10.22,0.04,2425,11.2065,194.234,62.99
4,Other_Sales,16598.0,0.0481,0.1886,0.0,0.0,0.01,0.04,10.57,0.04,1665,24.2339,1025.3481,39.02
5,Global_Sales,16598.0,0.5374,1.555,0.01,0.06,0.17,0.47,82.74,0.41,1893,17.4006,603.9323,0.0
6,id,16598.0,83528.7697,173947.4414,1.0,10241.0,21054.0,49057.0,1003457.0,38816.0,2221,3.0305,9.0849,0.0
7,rating,16598.0,2.9712,1.3516,0.0,2.65,3.44,3.9,4.75,1.25,2296,-1.2895,0.4714,13.83
8,ratings_count,16598.0,175.0234,448.5983,0.0,12.0,40.0,125.0,7215.0,113.0,2156,6.1983,56.5672,2.28
9,metacritic,7587.0,74.339,11.4865,21.0,67.0,76.0,82.0,99.0,15.0,154,-0.8273,0.9491,0.0



▶ Matriz de correlación (numérica)


Unnamed: 0,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,id,rating,ratings_count,metacritic,playtime,added,suggestions_count,match_score
Rank,1.0,-0.401362,-0.379123,-0.267785,-0.332986,-0.427407,0.081352,-0.178507,-0.170122,-0.149891,-0.081338,-0.119348,-0.05919,-0.361432
NA_Sales,-0.401362,1.0,0.767727,0.449787,0.634737,0.941047,-0.04687,0.119099,0.177345,0.156074,0.120511,0.111016,-0.003957,0.212282
EU_Sales,-0.379123,0.767727,1.0,0.435584,0.726385,0.902836,-0.051621,0.121972,0.230776,0.145351,0.145096,0.166482,0.026004,0.208779
JP_Sales,-0.267785,0.449787,0.435584,1.0,0.290186,0.611816,-0.003251,0.072855,0.007079,0.094027,0.138146,-0.026102,-0.060616,0.054317
Other_Sales,-0.332986,0.634737,0.726385,0.290186,1.0,0.748331,-0.049278,0.106435,0.228039,0.128076,0.095709,0.162357,0.043807,0.189139
Global_Sales,-0.427407,0.941047,0.902836,0.611816,0.748331,1.0,-0.047996,0.129513,0.197164,0.159706,0.149484,0.126867,-0.000405,0.213026
id,0.081352,-0.04687,-0.051621,-0.003251,-0.049278,-0.047996,1.0,-0.040086,-0.1128,0.0006,-0.023331,-0.124325,-0.178431,-0.197936
rating,-0.178507,0.119099,0.121972,0.072855,0.106435,0.129513,-0.040086,1.0,0.268752,0.599184,0.093383,0.245733,0.264282,0.334437
ratings_count,-0.170122,0.177345,0.230776,0.007079,0.228039,0.197164,-0.1128,0.268752,1.0,0.337174,0.140002,0.928259,0.246187,0.250028
metacritic,-0.149891,0.156074,0.145351,0.094027,0.128076,0.159706,0.0006,0.599184,0.337174,1.0,0.1735,0.30908,-0.0789,0.079366



▶ Top-10 categorías por columna no numérica


Unnamed: 0,column,value,count
0,Name,Need for Speed: Most Wanted,12
1,Name,FIFA 14,9
2,Name,LEGO Marvel Super Heroes,9
3,Name,Madden NFL 07,9
4,Name,Ratatouille,9
5,Name,Terraria,8
6,Name,LEGO Star Wars II: The Original Trilogy,8
7,Name,The LEGO Movie Videogame,8
8,Name,Madden NFL 08,8
9,Name,FIFA 15,8



▶ Filas duplicadas


Unnamed: 0,dataset,duplicate_rows
0,VideoGames,0


En términos descriptivos, las variables de ventas presentan medias claramente superiores a las medianas, lo que evidencia asimetría positiva: la “tendencia central” real está mejor representada por la mediana y el IQR que por la media y la desviación estándar. Los máximos están muy alejados del rango intercuartílico (P75), confirmando la presencia de títulos con ventas extraordinarias que empujan hacia arriba los promedios; al mismo tiempo hay una alta proporción de ceros en varias regiones, lo que aumenta la dispersión y reduce la capacidad de la media para resumir el comportamiento típico. Entre numéricas se observan correlaciones fuertes entre ventas regionales y el total global (multicolinealidad esperable), mientras que en el detalle por región los outliers (según 1.5×IQR) concentran buena parte de la varianza. En conjunto: para resumir estos datos conviene reportar mediana, IQR, mínimos y máximos (además de media y desviación estándar), y acompañar con una breve nota sobre proporción de ceros y correlaciones entre ventas regionales y globales, evitando que unos pocos valores extremos distorsionen la lectura.