In [35]:
import sagemaker, boto3, os
import string

from sklearn.model_selection import train_test_split

In [25]:
# S3 client used for the download below
s3_client = boto3.client('s3')

# Source object: the bucket it lives in and its key inside that bucket
bucket_name = 'gen-ai-test-kenvue'
object_key = 'StephenKingBooks/Stephen_King_TheShining.txt'

# Destination path on the local filesystem
local_file_path = 'Stephen_King_TheShining.txt'

# Fetch the object; arguments are passed by keyword so their roles are explicit
s3_client.download_file(
    Bucket=bucket_name,
    Key=object_key,
    Filename=local_file_path,
)

print(f"Downloaded '{object_key}' from S3 bucket '{bucket_name}' to '{local_file_path}'")


Downloaded 'StephenKingBooks/Stephen_King_TheShining.txt' from S3 bucket 'gen-ai-test-kenvue' to 'Stephen_King_TheShining.txt'


In [27]:
file_path = "./Stephen_King_TheShining.txt"

# Load the whole book into memory as one UTF-8 decoded string
# (text read mode is the default for open()).
with open(file_path, encoding='utf-8') as book_file:
    book = book_file.read()

In [28]:
import re
import unicodedata

# Preprocessing Spanish text requires more work

# Regex reference:
# https://docs.python.org/3/library/re.html

def preprocess_clean_text(text):
    """Reduce ``text`` to unaccented ASCII letters and whitespace.

    Steps:
      1. NFKD-decompose the text and drop every non-ASCII code point. This
         strips diacritics (e.g. "canción" -> "cancion") and discards any
         character that has no ASCII decomposition.
      2. Remove every remaining character that is not an ASCII letter or
         whitespace, i.e. digits, punctuation and all other symbols.

    Letter case is left untouched.

    Args:
        text: Input string.

    Returns:
        The cleaned string. Whitespace (including newlines) is kept as-is,
        so removed tokens may leave consecutive spaces behind.
    """
    # Strip accents: decompose, then drop the non-ASCII combining marks.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # A single pass replaces the former three (special characters, digits,
    # punctuation). The previous character classes were written `A-z`, which
    # also spans [ \ ] ^ _ and the backtick; the output was only correct
    # because a later punctuation strip removed those characters again.
    return re.sub(r'[^a-zA-Z\s]', '', text)

In [34]:
# Lowercase the full text first, then run it through the cleaning function
book_lowercase = book.lower()
book_processed = preprocess_clean_text(book_lowercase)

In [36]:
# Persist the cleaned text to disk
# Output file path
archivo_txt = "book_processed.txt"

# Open the file for writing and dump the contents of the cleaned-text variable
with open(archivo_txt, mode="w", encoding="utf-8") as out_file:
    out_file.write(book_processed)

In [37]:
# Convert the cleaned text to JSON Lines (one JSON object per line)
import json

# Destination JSON Lines file, and the plain-text file it is built from
jsonl_file_path = 'book_processed.jsonl'
txt_file_path = 'book_processed.txt'

# Function to convert a line of text to a JSON object
def line_to_json(line):
    """Build the JSON-serialisable record for a single line of text.

    Leading and trailing whitespace (including the trailing newline) is
    stripped and the result is stored under the ``"text"`` key. Adapt this
    function if the input file has a different structure.
    """
    stripped = line.strip()
    return dict(text=stripped)

# Stream the text file line by line and emit one JSON record per input line.
# NOTE: blank input lines are not skipped; each one becomes a {"text": ""} record.
with open(txt_file_path, encoding='utf-8') as src:
    with open(jsonl_file_path, 'w', encoding='utf-8') as dst:
        for raw_line in src:
            record = line_to_json(raw_line)
            dst.write(f"{json.dumps(record)}\n")

print(f'Conversion from TXT to JSONL completed. Output file: {jsonl_file_path}')

Conversion from TXT to JSONL completed. Output file: book_processed.jsonl


In [38]:
# Upload the JSONL training set to S3.
bucket_name = 'gen-ai-test-kenvue'
# NOTE(review): object_key is not used by the upload below, which writes to
# train_object_key instead. It is kept in case other cells read it; confirm
# and remove if not.
object_key = 'trainset_summary_model/Stephen_King_TheShining.txt'
train_object_key = 'trainset_summary_model/train.jsonl'

# The original referenced an undefined name `bucket` (NameError on a fresh
# kernel); the bucket defined above is `bucket_name`. The S3 key is passed as
# a plain string: S3 keys always use '/', so os.path.join is not appropriate,
# and with a single argument it was a no-op anyway.
boto3.Session().resource('s3').Bucket(bucket_name).Object(
    train_object_key).upload_file('book_processed.jsonl')