# Downstream Model (LSTM) for v2

In [40]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.metrics import classification_report
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [41]:
# Load the data
data = pd.read_csv("labeled_data_non-overlapped_labeling_fn_v2.csv")

# Split the data into non-abstained and abstained datasets
non_abstained_data = data[data['label'] != 'ABSTAIN']
abstained_data = data[data['label'] == 'ABSTAIN']

# Extract text and labels from non-abstained data
train_data = non_abstained_data['concatenated_title_abstract']
y = non_abstained_data['label']

# Initialize the label encoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Fit the label encoder to the labels and transform
y_integer_encoded = label_encoder.fit_transform(y)

# Now, one-hot encode the integer encoded labels
y_encoded = to_categorical(y_integer_encoded)

In [42]:
# Tokenize the data
MAX_NB_WORDS = 5000
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_data.values)
X = tokenizer.texts_to_sequences(train_data.values)
X = pad_sequences(X)

In [43]:
num_classes = y.nunique()

# Adjust the LSTM model's output layer to match the number of classes
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [44]:
# Convert labels from non-abstained data to one-hot encoding
y_encoded = to_categorical(y)

In [45]:
# Split the non-abstained data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [46]:
# Train the LSTM model
batch_size = 64
epochs = 5
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)

Epoch 1/5


ValueError: in user code:

    File "/home/u22/harsh24/.local/lib/python3.8/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/home/u22/harsh24/.local/lib/python3.8/site-packages/keras/src/engine/training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/u22/harsh24/.local/lib/python3.8/site-packages/keras/src/engine/training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "/home/u22/harsh24/.local/lib/python3.8/site-packages/keras/src/engine/training.py", line 1081, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/u22/harsh24/.local/lib/python3.8/site-packages/keras/src/engine/training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "/home/u22/harsh24/.local/lib/python3.8/site-packages/keras/src/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/u22/harsh24/.local/lib/python3.8/site-packages/keras/src/losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/u22/harsh24/.local/lib/python3.8/site-packages/keras/src/losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/u22/harsh24/.local/lib/python3.8/site-packages/keras/src/losses.py", line 2122, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/home/u22/harsh24/.local/lib/python3.8/site-packages/keras/src/backend.py", line 5560, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 995) and (None, 198) are incompatible


In [None]:
# Evaluate on test set from non-abstained data
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)
print(classification_report(y_test_classes, y_pred_classes))

In [47]:
print(y.nunique())

198


In [48]:
print(y_encoded.shape[1])

995


In [1]:
pip install --upgrade jinja2 nbconvert

Defaulting to user installation because normal site-packages is not writeable
Collecting nbconvert
  Downloading nbconvert-7.9.2-py3-none-any.whl (256 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.4/256.4 KB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting tinycss2
  Downloading tinycss2-1.2.1-py3-none-any.whl (21 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 KB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mistune<4,>=2.0.3
  Downloading mistune-3.0.2-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.0/48.0 KB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nbformat>=5.7
  Downloading nbformat-5.9.2-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.6/77.6 KB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting fastjson

In [3]:
pip uninstall jinja2 nbconvert mistune

Found existing installation: Jinja2 3.1.2
Uninstalling Jinja2-3.1.2:
  Would remove:
    /home/u22/harsh24/.local/lib/python3.8/site-packages/Jinja2-3.1.2.dist-info/*
    /home/u22/harsh24/.local/lib/python3.8/site-packages/jinja2/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
y

NameError: name 'y' is not defined