In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Initialize Spark session
spark = SparkSession.builder.appName("FinancialTextClassification").getOrCreate()

25/04/20 20:09:29 WARN Utils: Your hostname, Sameers-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.29.34 instead (on interface en0)
25/04/20 20:09:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/20 20:09:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 57781)
Traceback (most recent call last):
  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/socketserver.py", line 747, in __init__
    self.handle()
  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/pyspark/accumul

In [3]:
# Load dataset from Hugging Face
dataset = load_dataset("nickmuchi/financial-text-combo-classification", split="train")

Generating train split: 100%|██████████| 17971/17971 [00:00<00:00, 319797.02 examples/s]
Generating validation split: 100%|██████████| 3863/3863 [00:00<00:00, 1365003.90 examples/s]


In [4]:
# Convert to pandas DataFrame first
df_pandas = pd.DataFrame(dataset)

In [5]:
# Convert pandas DataFrame to PySpark DataFrame
df_spark = spark.createDataFrame(df_pandas)

In [12]:
# Inspect the data
df_spark.printSchema()
df_spark.show(2, truncate=50)

root
 |-- text: string (nullable = true)
 |-- label: long (nullable = true)

+--------------------------------------------------+-----+
|                                              text|label|
+--------------------------------------------------+-----+
|$BYND - JPMorgan reels in expectations on Beyon...|    0|
|$CCL $RCL - Nomura points to bookings weakness ...|    0|
+--------------------------------------------------+-----+
only showing top 2 rows



In [15]:
df_spark.count()

17971

In [16]:
# Show distinct labels
distinct_labels = df_spark.select("label").distinct()
distinct_labels.show()

+-----+
|label|
+-----+
|    0|
|    1|
|    2|
+-----+



In [None]:
# df = df_spark

In [19]:
# Step 1: PySpark → pandas
df = df_spark.select("text", "label").toPandas()

# Step 2: pandas → NumPy arrays
texts = df["text"].astype(str).values
labels = df["label"].astype(int).values

# Step 3: NumPy → tf.data.Dataset
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices((texts, labels))
ds = ds.shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)

2025-04-20 20:50:01.240350: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2025-04-20 20:50:01.240530: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-04-20 20:50:01.240542: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-04-20 20:50:01.241062: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-04-20 20:50:01.241596: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [20]:
# 1. Create the vectorizer
vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    output_sequence_length=100
)
vectorizer.adapt(texts)  # Fit on training data only

# 2. Apply to dataset
ds = ds.map(lambda x, y: (vectorizer(x), y))

In [21]:
# 1. Imports
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model

# 2. Load the dataset
dataset = load_dataset("nickmuchi/financial-text-combo-classification", split="train")
df = pd.DataFrame(dataset)

# 3. Prepare inputs
texts = df["text"].astype(str).values
labels = df["label"].astype(int).values
num_classes = len(set(labels))

# 4. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# 5. Text Vectorization
max_vocab = 20000
max_len = 100
vectorizer = TextVectorization(
    max_tokens=max_vocab,
    output_mode='int',
    output_sequence_length=max_len
)
vectorizer.adapt(X_train)

# 6. Create tf.data.Dataset
batch_size = 32
AUTOTUNE = tf.data.AUTOTUNE

def to_dataset(x, y):
    ds = tf.data.Dataset.from_tensor_slices((x, y))
    ds = ds.shuffle(1000).batch(batch_size).prefetch(AUTOTUNE)
    return ds.map(lambda x, y: (vectorizer(x), y))

train_ds = to_dataset(X_train, y_train)
test_ds = to_dataset(X_test, y_test)

# 7. Build LSTM model
inputs = Input(shape=(max_len,), dtype=tf.int32)
x = Embedding(input_dim=max_vocab, output_dim=128, mask_zero=True)(inputs)
x = LSTM(64)(x)
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 8. Train the model
model.fit(train_ds, validation_data=test_ds, epochs=5)

# 9. Evaluate
loss, acc = model.evaluate(test_ds)
print(f"Test Accuracy: {acc:.4f}")

# 10. Predict new samples
def predict(texts):
    sequences = vectorizer(texts)
    probs = model.predict(sequences)
    return np.argmax(probs, axis=1)

# Example usage
samples = [
    "Credit card declined due to overlimit",
    "Loan approved for customer",
    "Salary credited to checking account"
]
predictions = predict(samples)
for txt, label in zip(samples, predictions):
    print(f"'{txt}' → Label: {label}")

Epoch 1/5


2025-04-20 20:58:51.842090: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m 59/450[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m13s[0m 34ms/step - accuracy: 0.5766 - loss: 0.9643

2025-04-20 20:58:55.608692: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: assertion failed: [You are passing a RNN mask that does not correspond to right-padded sequences, while using cuDNN, which is not supported. With cuDNN, RNN masks can only be used for right-padding, e.g. `[[True, True, False, False]]` would be a valid mask, but any mask that isn\'t just contiguous `True`\'s on the left and contiguous `False`\'s on the right would be invalid. You can pass `use_cudnn=False` to your RNN layer to stop using cuDNN (this may be slower).]
	 [[{{function_node __inference_one_step_on_data_2078}}{{node functional_1/lstm_1/Assert/Assert}}]]


InvalidArgumentError: Graph execution error:

Detected at node functional_1/lstm_1/Assert/Assert defined at (most recent call last):
  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/runpy.py", line 197, in _run_module_as_main

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/runpy.py", line 87, in _run_code

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/asyncio/base_events.py", line 601, in run_forever

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/asyncio/events.py", line 80, in _run

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3048, in run_cell

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3103, in _run_cell

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3308, in run_cell_async

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3490, in run_ast_nodes

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3550, in run_code

  File "/var/folders/dw/nbws69bn1czd2gbykbfjmnmc0000gn/T/ipykernel_57994/72904297.py", line 56, in <module>

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/backend/tensorflow/trainer.py", line 113, in one_step_on_data

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/backend/tensorflow/trainer.py", line 57, in train_step

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/layers/layer.py", line 909, in __call__

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/ops/operation.py", line 52, in __call__

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/models/functional.py", line 183, in call

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/ops/function.py", line 171, in _run_through_graph

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/models/functional.py", line 643, in call

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/layers/layer.py", line 909, in __call__

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/ops/operation.py", line 52, in __call__

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/layers/rnn/lstm.py", line 584, in call

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/layers/rnn/rnn.py", line 408, in call

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/layers/rnn/lstm.py", line 551, in inner_loop

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/backend/tensorflow/rnn.py", line 841, in lstm

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/backend/tensorflow/rnn.py", line 874, in _cudnn_lstm

  File "/Users/sameerkhan/anaconda3/envs/skhan3/lib/python3.9/site-packages/keras/src/backend/tensorflow/rnn.py", line 557, in _assert_valid_mask

assertion failed: [You are passing a RNN mask that does not correspond to right-padded sequences, while using cuDNN, which is not supported. With cuDNN, RNN masks can only be used for right-padding, e.g. `[[True, True, False, False]]` would be a valid mask, but any mask that isn\'t just contiguous `True`\'s on the left and contiguous `False`\'s on the right would be invalid. You can pass `use_cudnn=False` to your RNN layer to stop using cuDNN (this may be slower).]
	 [[{{node functional_1/lstm_1/Assert/Assert}}]] [Op:__inference_multi_step_on_iterator_2131]

25/04/20 22:41:16 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1977910 ms exceeds timeout 120000 ms
25/04/20 22:41:16 WARN SparkContext: Killing executors is not supported by current scheduler.
25/04/20 22:41:17 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$