In [7]:
# Paso 1: Importar las bibliotecas necesarias
import hopsworks
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
import os

# Step 2: Load the dataset
# The path is relative to the directory the notebook/script is run from.
data_path = "data/IMDB Dataset.csv"
df = pd.read_csv(data_path)

# Preview the first rows of the dataset.
# A bare `df.head()` is only rendered by a notebook when it is the last
# expression of a cell; here more statements follow, so the preview has to
# be printed explicitly or it is silently discarded.
print(df.head())

# Step 3: Preprocess the text
# Lower-case every review so the vocabulary is case-insensitive.
df['review'] = df['review'].str.lower()

# Strip HTML tags (e.g. "<br />").
# Each tag is replaced by a space rather than an empty string: a tag is
# often the only separator between two words, and deleting it outright
# would fuse them into one bogus token ("great<br />movie" -> "greatmovie").
df['review'] = df['review'].str.replace(r'<.*?>', ' ', regex=True)

# Tokenization and integer sequences
# `num_words=5000` caps the vocabulary used when encoding to the most
# frequent words; words outside it are encoded as the "<OOV>" token.
# NOTE(review): the tokenizer is fitted on every review, including the ones
# that later end up in the test split — fit on the training split only if a
# leak-free evaluation is required.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['review'])

sequences = tokenizer.texts_to_sequences(df['review'])
# Pad/truncate every sequence to exactly 200 token ids so that the result
# is a rectangular 2-D array (one row per review).
padded = pad_sequences(sequences, maxlen=200)

# Step 4: Encode the labels
# Use `map` instead of `replace`: replacing strings with ints on an object
# column relies on implicit downcasting, which pandas >= 2.2 deprecates
# (FutureWarning), and `replace` would silently leave any unexpected label
# in place as a string.
label_map = {'positive': 1, 'negative': 0}
encoded_sentiment = df['sentiment'].map(label_map)

# `map` yields NaN for anything missing from `label_map`; fail loudly here
# instead of shipping corrupt labels to the feature store.
unmapped = encoded_sentiment.isna()
if unmapped.any():
    bad_labels = df.loc[unmapped, 'sentiment'].unique().tolist()
    raise ValueError(f"Unexpected sentiment labels: {bad_labels}")
df['sentiment'] = encoded_sentiment.astype('int64')

# Split the data into training (80%) and test (20%) sets; the fixed seed
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded, df['sentiment'], test_size=0.2, random_state=42
)

# Build the feature-store frame from the training sequences: one column per
# token position, named 'col_0', 'col_1', ..., 'col_n' (string names instead
# of the default integer column labels).
df_feature_store = pd.DataFrame(
    X_train,
    columns=[f"col_{i}" for i in range(X_train.shape[1])],
)

# Attach the labels. `.values` discards y_train's index (it still carries
# the row labels of `df`), so the labels are assigned by position and line
# up with the rows of X_train instead of being re-aligned by index.
df_feature_store['sentiment'] = y_train.values

# Print the resulting column names as a quick sanity check.
print(df_feature_store.columns)

# Step 5: Connect to Hopsworks and store the data in the Feature Store

# Pull variables from the .env file into the environment, then read the
# API key from there (os.getenv returns None when the variable is unset).
load_dotenv()
api_key = os.getenv("HOPSWORKS_API_KEY")

# Log in to Hopsworks and get a handle to the project's feature store.
project = hopsworks.login(api_key_value=api_key)
fs = project.get_feature_store()

# Add a unique integer id per row (0 .. n-1) to serve as the primary key.
df_feature_store['id'] = range(df_feature_store.shape[0])

# Settings of the feature group; 'id' is declared as its primary key.
feature_group_options = {
    "name": "imdb_reviews_dl",
    "version": 1,
    "description": "Preprocessed IMDB reviews for deep learning classification",
    "primary_key": ["id"],
    "online_enabled": True,
}

# Fetch the feature group, creating it when it does not exist yet.
imdb_fg = fs.get_or_create_feature_group(**feature_group_options)

# Write the DataFrame into the feature group.
# NOTE(review): the captured run log shows a materialization job being
# launched by this call — presumably the offline write finishes
# asynchronously after insert() returns; confirm before relying on it.
imdb_fg.insert(df_feature_store)

print("Datos guardados en el Feature Store de Hopsworks.")


Index(['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7',
       'col_8', 'col_9',
       ...
       'col_191', 'col_192', 'col_193', 'col_194', 'col_195', 'col_196',
       'col_197', 'col_198', 'col_199', 'sentiment'],
      dtype='object', length=201)
Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1047704
Connected. Call `.close()` to terminate connection gracefully.
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1047704/fs/1039431/fg/1207640


Uploading Dataframe: 0.00% |          | Rows 0/40000 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: imdb_reviews_dl_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1047704/jobs/named/imdb_reviews_dl_1_offline_fg_materialization/executions
Datos guardados en el Feature Store de Hopsworks.
