In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, current_timestamp
from datasets import load_dataset, Dataset, DatasetDict
import huggingface_hub

# ETL of Large Source Dataset
Before uploading the dataset into a database and data-loading pipeline, the data has to be converted into I.I.E (independent and identifiable data) form by including a unique ID and timestamp for each row. The data will also be split into its train and validation subsets.

In [2]:
# Load the SageMaker Studio analytics magics, then connect this notebook to an
# EMR Serverless application using the given application ID and execution role.
# NOTE(review): the application ID and the role ARN (which embeds an AWS account
# ID) are hard-coded here — consider sourcing them from configuration.
%load_ext sagemaker_studio_analytics_extension.magics
%sm_analytics emr-serverless connect --application-id 00fq6j1a0fiulq09 --language python --emr-execution-role-arn arn:aws:iam::597161074694:role/service-role/AmazonEMR-ServiceRole-20250211T131858

Initiating EMR Serverless connection..
Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4,00fq7iq3kcnggh0a,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


In [3]:
# Build (or reuse) a Spark session named 'spark', configured for local mode on
# all available cores with a 50g driver-memory setting.
# NOTE(review): the preceding EMR Serverless cell already reports
# "SparkSession available as 'spark'"; this cell binds `spark` to a local-mode
# builder result instead — confirm which session the later cells should use.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "50g")
    .appName('spark')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/13 03:32:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Hugging Face Hub identifier of the source dataset.
tinystories = "skeskinen/TinyStories-GPT4"

# Row index at which the source "train" split is cut: rows before it become the
# train subset, rows from it onward become the validation subset. Keeping the
# boundary in one place guarantees the two slices neither overlap nor leave a gap.
TRAIN_ROWS = 2196080

train_data = load_dataset(tinystories, split=f"train[:{TRAIN_ROWS}]")
val_data = load_dataset(tinystories, split=f"train[{TRAIN_ROWS}:]")

In [7]:
# Display the train subset's summary (feature names and row count).
train_data

Dataset({
    features: ['story', 'summary', 'source', 'prompt', 'words', 'features'],
    num_rows: 2196080
})

In [8]:
# Display the validation subset's summary (feature names and row count).
val_data

Dataset({
    features: ['story', 'summary', 'source', 'prompt', 'words', 'features'],
    num_rows: 549020
})

In [9]:
# Convert the train subset into a Spark DataFrame via the session bound to `spark`.
# Note: this rebinds `train_data`, so re-running the cell would pass the Spark
# DataFrame back into createDataFrame rather than the loaded dataset.
train_data = spark.createDataFrame(train_data)

In [10]:
# Convert the validation subset into a Spark DataFrame via the session bound to `spark`.
# Note: this rebinds `val_data`, so re-running the cell would pass the Spark
# DataFrame back into createDataFrame rather than the loaded dataset.
val_data = spark.createDataFrame(val_data)

In [11]:
# Tag every train row with a "unique_id" and a "timestamp" column using Spark's
# built-in column functions (these are library functions, not UDFs).
# NOTE(review): the validation subset is tagged independently in its own cell,
# so IDs are presumably unique only within a subset, not across both — confirm
# this is acceptable for downstream consumers.
train_data = (
    train_data
    .withColumn("unique_id", monotonically_increasing_id())
    .withColumn("timestamp", current_timestamp())
)

In [12]:
# Tag every validation row with a "unique_id" and a "timestamp" column using
# Spark's built-in column functions (these are library functions, not UDFs).
# NOTE(review): the train subset is tagged independently in its own cell, so
# IDs are presumably unique only within a subset, not across both — confirm
# this is acceptable for downstream consumers.
val_data = (
    val_data
    .withColumn("unique_id", monotonically_increasing_id())
    .withColumn("timestamp", current_timestamp())
)

In [13]:
# Convert both Spark DataFrames back into Hugging Face Dataset objects.
# The names are rebound once more: after this cell `train_data` and `val_data`
# hold the results of Dataset.from_spark, not Spark DataFrames.
train_data = Dataset.from_spark(train_data)
val_data = Dataset.from_spark(val_data)

25/02/13 03:38:24 WARN TaskSetManager: Stage 0 contains a task of very large size (359110 KiB). The maximum recommended task size is 1000 KiB.
25/02/13 03:38:38 WARN TaskSetManager: Stage 3 contains a task of very large size (359110 KiB). The maximum recommended task size is 1000 KiB.
25/02/13 03:38:46 WARN PythonRunner: Detected deadlock while completing task 4.0 in stage 3 (TID 13): Attempting to kill Python Worker
25/02/13 03:38:46 WARN PythonRunner: Detected deadlock while completing task 7.0 in stage 3 (TID 16): Attempting to kill Python Worker
25/02/13 03:38:46 WARN PythonRunner: Detected deadlock while completing task 2.0 in stage 3 (TID 11): Attempting to kill Python Worker
25/02/13 03:38:46 WARN PythonRunner: Detected deadlock while completing task 3.0 in stage 3 (TID 12): Attempting to kill Python Worker
25/02/13 03:38:46 WARN PythonRunner: Detected deadlock while completing task 5.0 in stage 3 (TID 14): Attempting to kill Python Worker
25/02/13 03:38:46 WARN PythonRunner: De

In [14]:
# Map each split name to its converted dataset.
df_splits = dict(train=train_data, validation=val_data)

In [15]:
# Repository identifier passed to push_to_hub as the upload target.
repo_id = "Alexis-Az/TinyStories"

In [20]:
# Bundle the named splits from `df_splits` into a single DatasetDict.
full_data = DatasetDict(df_splits)

In [21]:
# Upload every split in `full_data` to the Hub repository named by `repo_id`.
# NOTE(review): no login/token call is visible in this notebook — presumably
# credentials come from the environment; confirm before re-running.
full_data.push_to_hub(repo_id=repo_id)

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/367 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/367 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/367 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/367 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/367 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/367 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/275 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/275 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Alexis-Az/TinyStories/commit/e82c94c97f93463692798284495ba7b41fa2973d', commit_message='Upload dataset', commit_description='', oid='e82c94c97f93463692798284495ba7b41fa2973d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Alexis-Az/TinyStories', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Alexis-Az/TinyStories'), pr_revision=None, pr_num=None)