In [11]:
# ----------------------------
# Paso 1: Configurar Kaggle API
# ----------------------------
import os
import json
import zipfile
import nltk

In [2]:
# Resolve the per-user Kaggle credentials path once and make sure its
# parent directory (~/.kaggle) exists.
kaggle_config_path = os.path.expanduser('~/.kaggle/kaggle.json')
os.makedirs(os.path.dirname(kaggle_config_path), exist_ok=True)

# Load the API token from the kaggle.json stored one level above the notebook.
with open('../kaggle.json') as f:
    kaggle_token = json.load(f)

# Create the destination file with owner-only permissions from the start.
# Writing it with a plain open() and restricting it afterwards would leave the
# secret readable by other users for a short window.
fd = os.open(kaggle_config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, 'w') as f:
    json.dump(kaggle_token, f)

# The mode passed to os.open() only applies when the file is newly created, so
# also tighten a file that already existed (effective on POSIX systems; on
# Windows os.chmod can only toggle the read-only flag).
os.chmod(kaggle_config_path, 0o600)

In [3]:
# Instalar kaggle si no está
!pip install -q kaggle

In [4]:

# ----------------------------
# Step 2: Download the dataset from Kaggle
# ----------------------------
# Shell escape that calls the kaggle CLI for the IMDB 50k movie reviews
# dataset. No output-path option is given, so the ZIP archive is written to
# the current working directory.

!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews





Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to c:\Users\USER\Documents\Proyecto_IA\notebooks




  0%|          | 0.00/25.7M [00:00<?, ?B/s]
100%|██████████| 25.7M/25.7M [00:00<00:00, 1.09GB/s]


In [5]:
# ----------------------------
# Step 3: Unpack the ZIP archive with pure Python (works on Windows too)
# ----------------------------

# Archive to unpack and the folder that receives the raw data
zip_path = 'imdb-dataset-of-50k-movie-reviews.zip'
extract_path = '../data/raw/'

# Make sure the raw-data folder exists before unpacking into it
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into the raw-data folder
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(path=extract_path)

print("✅ Dataset descargado y extraído correctamente.")

✅ Dataset descargado y extraído correctamente.


In [6]:
# ----------------------------
# Step 4: Load and explore the data
# ----------------------------

import pandas as pd

# Read the extracted CSV into a DataFrame
df = pd.read_csv('../data/raw/IMDB Dataset.csv')

# Preview the first five rows
preview = df.head()
print(preview)

# Total number of rows, followed by the tally of each sentiment label
print(f"\nTotal de reseñas: {len(df)}")
label_counts = df['sentiment'].value_counts()
print(label_counts)

# Per-column count of missing values
print("\nDatos nulos:")
null_counts = df.isnull().sum()
print(null_counts)


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Total de reseñas: 50000
sentiment
positive    25000
negative    25000
Name: count, dtype: int64

Datos nulos:
review       0
sentiment    0
dtype: int64


In [14]:
import sys
import os

# Make the project root (parent of the notebook folder) importable so that the
# local `utils` package resolves. Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

# Download the NLTK punkt and punkt_tab tokenizer resources only when they are
# not already installed; nltk.data.find raises LookupError for a missing
# resource, which avoids contacting the download server on every run.
import nltk
for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        nltk.download(resource)

from utils.preprocessing import limpiar_dataset

# Apply the project's cleaning routine to the review text and sentiment label
# columns of the raw DataFrame.
df_limpio = limpiar_dataset(df, texto_col='review', etiqueta_col='sentiment')

# Show a sample of the cleaned result
print(df_limpio.head())

# Save the cleaned CSV in the processed-data folder (created if missing)
os.makedirs('../data/processed/', exist_ok=True)
df_limpio.to_csv('../data/processed/imdb_limpio.csv', index=False)

print("✅ Dataset limpiado y guardado en data/processed/imdb_limpio.csv")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


                                              review sentiment
0  one reviewers mentioned watching oz episode yo...  positive
1  wonderful little production filming technique ...  positive
2  thought wonderful way spend time hot summer we...  positive
3  basically theres family little boy jake thinks...  negative
4  petter matteis love time money visually stunni...  positive
✅ Dataset limpiado y guardado en data/processed/imdb_limpio.csv
