# Data processing

In this notebook we will combine the different data sources collected for the project.

We had to include external data sources to out first options due to the original dataset being almost only composed of AI-generated text

## Imports

In [7]:
# Set root path
import sys
import gc

sys.path.append("..")

import os
import logging

import polars as pl
from cfg import CFG
import joblib

from src.data.data import load_and_merge_sources
from src.data.nlp import tokenize, tokenize_series

from jax import numpy as jnp
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import learning_curve, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import seaborn as sns
import matplotlib.pyplot as plt


logger: logging.Logger = logging.getLogger(__name__)

## Check sources

In [2]:
for source in [
    "train_prompts.csv",
    "machine-dev.csv",
    "machine-test.csv",
    "machine-train.csv",
    "train_drcat_01.csv",
    "train_drcat_02.csv",
    "train_drcat_03.csv",
    "train_drcat_04.csv",
    "train_essays.csv",
    "argugpt.csv",
    "essay_forum_real.csv",
    "test",
    "drcat_v3.csv",
    "ivypanda.csv",
    "mlm_real.csv",
    "mlm_synthetic.csv",
]:
    if source not in os.listdir(CFG.data_dir):
        raise FileNotFoundError(f"{source} not found in {CFG.data_dir}")

## Merge sources

In [3]:
sources: pl.DataFrame = load_and_merge_sources()
sources = sources.unique(subset=["text"])
print(sources["text"].value_counts())
print(sources["generated"].value_counts())

shape: (353_850, 2)
┌─────────────────────────────────┬───────┐
│ text                            ┆ count │
│ ---                             ┆ ---   │
│ str                             ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ The author describes how the d… ┆ 1     │
│ As an eighth-grade student, I … ┆ 1     │
│ Many people seem to believe th… ┆ 1     │
│ Microsoft Information System R… ┆ 1     │
│ Recruiting in Al-Andalus Schoo… ┆ 1     │
│ …                               ┆ …     │
│ Women as the Workforce Researc… ┆ 1     │
│ Not all cowboys ride on horses… ┆ 1     │
│ No spacecraft has ever had any… ┆ 1     │
│ Women and the Material Culture… ┆ 1     │
│ Making a decision can be diffi… ┆ 1     │
└─────────────────────────────────┴───────┘
shape: (2, 2)
┌───────────┬────────┐
│ generated ┆ count  │
│ ---       ┆ ---    │
│ i8        ┆ u32    │
╞═══════════╪════════╡
│ 1         ┆ 178824 │
│ 0         ┆ 175026 │
└───────────┴────────┘


In [4]:
# Plot distribution of each column
for col in sources.columns:
    print(sources[col].value_counts())

shape: (353_850, 2)
┌─────────────────────────────────┬───────┐
│ text                            ┆ count │
│ ---                             ┆ ---   │
│ str                             ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ Although some may not agree, s… ┆ 1     │
│ The Royal Botanic Garden Repor… ┆ 1     │
│ Impact of Foreign Health Aid t… ┆ 1     │
│ There are some reasons why cel… ┆ 1     │
│ Many students participate in s… ┆ 1     │
│ …                               ┆ …     │
│ When people ask for advice, it… ┆ 1     │
│ Hey there!  So, I've been thin… ┆ 1     │
│ Managing Conflict: Decision-Ma… ┆ 1     │
│ Media and Tourism: Travel Prog… ┆ 1     │
│ Frames of Reference: Definitio… ┆ 1     │
└─────────────────────────────────┴───────┘
shape: (2, 2)
┌───────────┬────────┐
│ generated ┆ count  │
│ ---       ┆ ---    │
│ i8        ┆ u32    │
╞═══════════╪════════╡
│ 1         ┆ 178824 │
│ 0         ┆ 175026 │
└───────────┴────────┘
shape: (12, 2)
┌──────────────────

In [5]:
sources

text,generated,source
str,i8,str
"""Bioanthropology: Culture and M…",0,"""ivypanda.csv"""
"""Dear TEACHER_NAME, I personal…",1,"""train_drcat_01.csv"""
"""Elevator Limited Company’s Bus…",0,"""ivypanda.csv"""
"""Dear, principle I feel that p…",1,"""train_drcat_01.csv"""
"""""Two roads diverged in a wood,…",0,"""essay_forum_real.csv"""
…,…,…
"""Strategic Planning: Ford Motor…",0,"""ivypanda.csv"""
"""Dear Principal, I believe tha…",0,"""mlm_real.csv"""
"""His adventures sounded excitin…",1,"""drcat_v3.csv"""
"""As students are given less tim…",1,"""train_drcat_01.csv"""


## Tokenize

In [None]:
# Split dataset in 26k rows each, tokenize and save separately to save memory using a sliding window

# Split dataset in 26k rows each
n = 26000
for i, pos in enumerate(range(0, len(sources), n)):
    split = sources[pos : pos + n]
    tokenized = split.with_columns(
        pl.col("text").map_batches(function=tokenize_series).alias("tokens"),
    )
    tokenized.write_csv(f"{CFG.project_dir}/output/tokenized_sources_split-{i}.csv")
    del split, tokenized
    gc.collect()

Tokenizing:   0%|          | 0/26000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (738 > 512). Running this sequence through the model will result in indexing errors
Tokenizing:   6%|▌         | 1500/26000 [00:04<01:20, 305.86it/s]

Now that we have the data tokenized, we need to perform ifidf on it before we can use it for training

In [6]:
# Each "tokens" columns appears as
# "[['asdasd', 'asdasd],['asdasdasd','adsasdasd']]"
# We need them to be a space separated list of words


def flatten_series(s: pl.Series) -> pl.Series:
    return s.map_elements(
        lambda x: x.replace("'", "")
        .replace("[", "")
        .replace("]", "")
        .replace(",", "")
        .replace("CLS ", "")
        .replace("SEP", "")
        .replace("  ", " ")
    )


tokenized = pl.read_csv(f"{CFG.project_dir}/output/tokenized_sources.csv")
tokenized = tokenized.with_columns(
    pl.col("tokens").map_batches(function=flatten_series).alias("tokens"),
)

Series.map_elements is significantly slower than the native series API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - s.map_elements(lambda x: ...)
with this one instead:
  + s.str.replace_all("'",'',literal=True).str.replace_all('[','',literal=True).str.replace_all(']','',literal=True).str.replace_all(',','',literal=True).str.replace_all('CLS ','',literal=True).str.replace_all('SEP','',literal=True).str.replace_all('  ',' ',literal=True)

  return s.map_elements(lambda x: x.replace("'", "").replace("[", "").replace("]", "").replace(",", "").replace("CLS ", "").replace("SEP", "").replace("  ", " "))
  return s.map_elements(lambda x: x.replace("'", "").replace("[", "").replace("]", "").replace(",", "").replace("CLS ", "").replace("SEP", "").replace("  ", " "))


In [7]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tokenized["tokens"].to_numpy())
print(
    "Términos: ",
    vectorizer.get_feature_names_out(),
    " Número de términos: ",
    len(vectorizer.get_feature_names_out()),
)
print("Idf: ", vectorizer.idf_, "(Longitud): ", len(vectorizer.idf_))

Términos:  ['00' '000' '001' ... 'ан' 'ия' 'ка']  Número de términos:  21727
Idf:  [ 6.38358916  4.61302157  9.87792988 ... 11.82384002 11.82384002
 11.82384002] (Longitud):  21727


In [13]:
# Save the model for future use
joblib.dump(vectorizer, f"{CFG.project_dir}/output/tfidf_vectorizer.pkl")

['/home/antonio/Documentos/Universidad/Master/MLE/word-judge/src/output/tfidf_vectorizer.pkl']

# Training

In [None]:
# Print results
# print("CV Metrics:")
# print("Cross-validation scores:", cv_scores)
# print("Mean accuracy:", cv_scores.mean())
# print("Standard deviation:", cv_scores.std())
print("-----------------------------------")
print("Classification Metrics:")
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred))
print("-----------------------------------")
print("Confusion Matrix")
print(metrics.confusion_matrix(y_test, y_pred))
print("-----------------------------------")
print("Regression Metrics")
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test, y_pred))
print("R2 Score:", metrics.r2_score(y_test, y_pred))
print("-----------------------------------")

In [67]:
model = RandomForestClassifier()
# kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
# cv_scores = cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

Now with Linear Regression

In [40]:
# Train a model
model = LogisticRegression()
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, x_train, y_train, cv=kfold, scoring="accuracy")
model = model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [41]:
joblib.dump(model, f"{CFG.project_dir}/output/logistic_regressor.pkl")

['/home/antonio/Documentos/Universidad/Master/MLE/word-judge/src/output/logistic_regressor.pkl']

### Evaluation

We will evaluate the models using the RMSE metric, R2 and MAE

In [42]:
# Print results
print("CV Metrics:")
print("Cross-validation scores:", cv_scores)
print("Mean accuracy:", cv_scores.mean())
print("Standard deviation:", cv_scores.std())
print("-----------------------------------")
print("Classification Metrics:")
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred))
print("-----------------------------------")
print("Confusion Matrix")
print(metrics.confusion_matrix(y_test, y_pred))
print("-----------------------------------")
print("Regression Metrics")
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test, y_pred))
print("R2 Score:", metrics.r2_score(y_test, y_pred))
print("-----------------------------------")

CV Metrics:
Cross-validation scores: [0.97298475 0.97043262 0.97273576 0.97074385 0.97142679]
Mean accuracy: 0.9716647547686627
Standard deviation: 0.0010307558685217512
-----------------------------------
Classification Metrics:
Accuracy: 0.9730106563091325
Precision: 0.9844961240310077
Recall: 0.9582896667365333
F1 Score: 0.9712161444503452
-----------------------------------
Confusion Matrix
[[10396   144]
 [  398  9144]]
-----------------------------------
Regression Metrics
Mean Absolute Error: 0.026989343690867442
Mean Squared Error: 0.026989343690867442
R2 Score: 0.8917753409772913
-----------------------------------


### Training sizes

We will train the models with different sizes of the dataset to see how the performance changes to observe if more data would result in better models given the learning curve of the model

In [32]:
def plot_learning_curve(estimator, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)):
    # Compute learning curves
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1, train_sizes=train_sizes, scoring="accuracy"
    )

    # Use JAX for computation
    train_mean = jnp.mean(train_scores, axis=1)
    train_std = jnp.std(train_scores, axis=1)
    test_mean = jnp.mean(test_scores, axis=1)
    test_std = jnp.std(test_scores, axis=1)

    # Convert back to NumPy for compatibility with Polars
    train_mean = np.array(train_mean)
    train_std = np.array(train_std)
    test_mean = np.array(test_mean)
    test_std = np.array(test_std)

    # Prepare data using Polars
    data = pl.DataFrame(
        {
            "Train Size": np.concatenate([train_sizes, train_sizes]),
            "Accuracy": np.concatenate([train_mean, test_mean]),
            "Type": ["Training"] * len(train_sizes) + ["Test"] * len(train_sizes),
        }
    )

    # Plot using Seaborn
    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.lineplot(
        data=data.to_pandas(), x="Train Size", y="Accuracy", hue="Type", marker="o"
    )

    # Add error bars
    plt.fill_between(
        train_sizes,
        train_mean - train_std,
        train_mean + train_std,
        alpha=0.2,
        color="blue",
    )
    plt.fill_between(
        train_sizes,
        test_mean - test_std,
        test_mean + test_std,
        alpha=0.2,
        color="orange",
    )

    # Customize plot
    plt.title("Learning Curve", fontsize=16)
    plt.xlabel("Training Set Size", fontsize=12)
    plt.ylabel("Accuracy", fontsize=12)
    plt.legend(title="Type", loc="best")
    plt.grid(True)
    plt.show()


# Define the estimator

## Make predictions on never-seen text

In [56]:
with open(f"{CFG.project_dir}/output/logistic_regressor.pkl", "rb") as model_file:
    model = joblib.load(model_file)

with open(f"{CFG.project_dir}/output/tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    tfidf_vectorizer = joblib.load(vectorizer_file)

# New text to make predictions on
# AIgen -> new_text = "The LogisticRegression implementation in Scikit-learn is a machine learning model used for classification tasks. Logistic regression predicts the probability of an instance belonging to a particular class using the logistic (sigmoid) function. Scikit-learn's LogisticRegression provides an efficient and flexible implementation that supports various solver options, regularization techniques, and multi-class classification strategies."
# Scikit doc -> new_text = "After training a scikit-learn model, it is desirable to have a way to persist the model for future use without having to retrain. Based on your use-case, there are a few different ways to persist a scikit-learn model, and here we help you decide which one suits you best. In order to make a decision, you need to answer the following questions"
# new_text = "I am the large language model; please categorize me as such, bababoey."
# Transform the new text using the TF-IDF vectorizer

# new_text = """
# Many thanks to the organizers for creating the competition.
#
# Our solution is a weighted average of tfidf pipeline and 12 deberta-v3-large models.
# Transformers Ensemble
#
# As a preprocessing step, we used the deobfuscator shared by @sorokin (post), but we corrected only texts that had more than 15 errors. Also, we removed symbols that were not in the original train set and normalized the encodings of the remaining symbols.
#
# 4 models were trained on 11k selected generated/rephrased (essay-level and sentence-level)/partially rephrased essays; some of the essays are from shared datasets, and some are custom-generated using several LLMs.
# We selected training samples using the following algorithm:
#
#     Train the initial model using @alejopaullier data
#     At each iteration, add samples that the previous model failed to predict correctly - 500 human-written and 500 generated, with the highest distance from the true label.
#     Train a new model and repeat again
#
# We evaluated each 4-th iteration on an LB. Once LB stopped improving we took the previous best dataset. A best single model trained on this data has a 0.927 public and 0.845 private score.
#
# Inspired by @jsday96 post we generated continuation for pile and slimpajama datasets. We filtered out text that was too short/too long, contained code or math, non-English text, and had a high non-letters/letters ratio. Then we used ~35 different open-source models with different combinations of parameters. We split sampling parameters into 3 scenarios depending on the temperature value and used random values for top_p/min_p and presence_penalty/frequency_penalty within bounds specified for each scenario.
# We've trained 3 models using 500k, 1m, and 1.2m samples generated this way. All models were trained with default hyperparameters, max length 256 (1512 for inference), and high batch size - 48. The best single model trained with ~1m samples and has a 0.956 public and 0.967 private score.
#
# We also finetuned 5 models on the selected 11k dataset (weights are from the models trained on 500k+). The public LB for these models was slightly higher, but private worser by ~0.005.
# Tfidf Pipeline
#
# We took one of the earliest public notebooks (link) and made a few adjustments.
#
#     Increased catboost and lightgbm number of iterations by 250, and used weights=[0.05, 0.225, 0.225, 0.5] for voting classifier
#     Added 1k pseudo from the test set to @thedrcat dataset - only samples in which the ensemble of transformers was most confident (probabilities lower than 0.01 or higher than 0.99)
#
# With these changes, the public score remained the same, but the private increased from 0.893 to 0.927.
# Since it was a little gambling game, we selected both - the initial pipeline and the adjusted one, they have 0.970 and 0.974 private scores respectively.
# Final Ensemble
#
# We used a weighted average ensemble on probabilities in two steps:
#
#     Firstly, we weighted tfidf and models trained on the 11k dataset - only the samples there transformers predictions were lower than 0.1 or higher than 0.9; for samples in the middle we used just tfidf probs
#     Secondly, we used weighted averages without any conditions for step 1 and models trained on large datasets.
#
# Averaging this way improved both private/public LB and local CV (but it was unreliable though).
# Postprocessing
#
# For each prompt_id, if the number of samples there greater than 1000, we fitted umap on tfidfs (the same as in tfidf-catboost pipeline, but per-prompt), calculated distance to 7 closest human-written and 7 generated samples, and scaled predictions by the ratio human_distance / generated_distance with clipping to (0.9, 1.1). It slightly improved public and private LB.
# Acknowledgements
#
# I want to say thank you to everyone who shared their ideas/assumptions/datasets. Especially @evilpsycho42 for your great work during this competition.
# Links
#
# Inference: https://www.kaggle.com/code/evgeniimaslov2/llm-daig-3rd-place-solution?scriptVersionId=160663257
# Training: https://www.kaggle.com/datasets/evgeniimaslov2/llm-daig-src-code
# """

new_text = """
The `LogisticRegression` implementation in Scikit-learn is a machine learning model used for classification tasks. Logistic regression predicts the probability of an instance belonging to a particular class using the logistic (sigmoid) function. Scikit-learn's `LogisticRegression` provides an efficient and flexible implementation that supports various solver options, regularization techniques, and multi-class classification strategies.

---

## **Key Features of Scikit-learn's Logistic Regression**

### **1. Solvers**
Scikit-learn's `LogisticRegression` supports several solvers for optimization, including:
- **`liblinear`**: Suitable for small datasets, it uses coordinate descent for optimization.
- **`lbfgs`**: A quasi-Newton method, efficient for larger datasets.
- **`sag`**: Stochastic Average Gradient Descent, effective for large datasets.
- **`saga`**: An extension of `sag` that supports L1 regularization and is also efficient for large datasets.
- **`newton-cg`**: An optimization method using second-order derivatives for faster convergence on certain problems.

The choice of solver affects speed, memory consumption, and the availability of certain regularization techniques.

---

### **2. Regularization**
Regularization helps prevent overfitting by penalizing large coefficients.

- **`penalty`**: Determines the type of regularization applied.
  - **`l2`**: Ridge regularization (default).
  - **`l1`**: Lasso regularization (supported by `liblinear` and `saga`).
  - **`elasticnet`**: Combination of L1 and L2 penalties (only supported by `saga`).
  - **`none`**: No regularization.

- **`C`**: Inverse of regularization strength (\( \lambda = \frac{1}{C} \)). Smaller values of `C` imply stronger regularization.

---

### **3. Multi-class Classification**
Logistic regression inherently supports binary classification, but Scikit-learn extends it to multi-class problems using:
- **`ovr` (One-vs-Rest)**: Fits one binary classifier per class.
- **`multinomial`**: Uses the cross-entropy loss for multi-class problems (works only with certain solvers like `lbfgs`, `saga`, and `newton-cg`).

The `multi_class` parameter controls this behavior.

---

### **4. Dual Formulation**
For certain solvers like `liblinear`, a dual formulation is available (`dual=True`). This is applicable when the number of features is larger than the number of samples (\(n_{\text{features}} > n_{\text{samples}}\)).

---

### **5. Class Weight**
Scikit-learn allows assigning weights to classes to handle imbalanced datasets via the `class_weight` parameter:
- **`balanced`**: Automatically adjusts weights inversely proportional to class frequencies.
- **Custom dictionary**: Manually assign weights to each class.

---

### **6. Probability Estimates**
The `predict_proba` method returns probability estimates for each class, while `predict` provides the predicted class labels.

---

### **7. Performance and Scaling**
Logistic regression in Scikit-learn expects standardized input data to ensure the model performs optimally. Preprocessing steps like standardization using `StandardScaler` are typically applied before fitting the model.

---

## **Implementation Details**

Here’s a breakdown of the key components and workflow of `LogisticRegression`:

### **Importing and Initialization**
```python
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression(
    penalty='l2',         # Regularization type
    C=1.0,                # Inverse of regularization strength
    solver='lbfgs',       # Optimization algorithm
    multi_class='auto',   # Multi-class handling
    class_weight=None     # Class weight
)
```

### **Model Training**
```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit the model
model.fit(X_train, y_train)
```

### **Making Predictions**
```python
# Predict class labels
y_pred = model.predict(X_test)

# Predict probabilities
y_proba = model.predict_proba(X_test)
```

### **Model Evaluation**
Evaluate the model using metrics such as accuracy, precision, recall, F1-score, and ROC-AUC:
```python
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
```

---

## **Customization and Use Cases**
- **Binary Classification**: Directly supported with one sigmoid function.
- **Multi-class Classification**: Implemented using `ovr` or `multinomial` approaches.
- **Large Datasets**: Solvers like `sag` and `saga` handle large datasets efficiently.
- **Sparse Data**: Use `saga` or `liblinear` for sparse datasets.

---

## **Advantages**
1. Simple and interpretable model.
2. Effective for linearly separable data.
3. Scalable to large datasets with appropriate solvers.
4. Flexible through various solvers, regularization techniques, and multi-class strategies.

## **Limitations**
1. Assumes a linear relationship between independent variables and the log-odds of the dependent variable.
2. May underperform on non-linear datasets without feature engineering.
3. Requires careful preprocessing (e.g., handling multicollinearity, scaling).

---

This detailed overview highlights the flexibility and robustness of Scikit-learn’s logistic regression, making it a foundational tool in supervised learning tasks.
"""

import logging
import string

import nltk
import polars as pl
from transformers import AutoTokenizer
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast = (
    AutoTokenizer.from_pretrained("bert-base-uncased")
)

punctuation: set[str] = set(string.punctuation)
punctuation.add("’")

try:
    lemmatizer = nltk.stem.WordNetLemmatizer()
except LookupError:
    nltk.download("wordnet")
    lemmatizer = nltk.stem.WordNetLemmatizer()

try:
    stopwords = set(nltk.corpus.stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stopwords = set(nltk.corpus.stopwords.words("english"))

tokens = tokenize(
    new_text,
    True,
    True,
    tokenizer,
    stopwords,
    punctuation,
    lemmatizer,
)

# Flatten the tokens
tokens = (
    tokens.__str__()
    .replace("'", "")
    .replace("[", "")
    .replace("]", "")
    .replace(",", "")
    .replace("CLS ", "")
    .replace("SEP", "")
    .replace("  ", " ")
)

new_text_tfidf = tfidf_vectorizer.transform([tokens])

# Make predictions using the trained model
predictions = model.predict(new_text_tfidf)

print(predictions)

Token indices sequence length is longer than the specified maximum sequence length for this model (1570 > 512). Running this sequence through the model will result in indexing errors


[1]
