In [None]:
#imports

import pandas as pd
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from numpy import interp
import matplotlib.pyplot as plt
import logging

In [None]:
# Load the Gutenberg corpus; later cells read its 'text', 'date' and 'title' columns.
DATASET_PATH = 'gutenberg-dataset-v2.json'
df = pd.read_json(DATASET_PATH)

In [None]:
# Preprocessing: truncate each text, derive regression labels from the
# publication date, and carve out a small train/test split.
MAX_WORDS = 512      # whitespace-separated words kept per document
SAMPLE_SIZE = 1000   # upper bound on the number of rows used in this experiment
SEED = 42            # shared seed for sampling and splitting

# Optional filter (disabled): keep only documents longer than 1024 characters.
#df = df[df['text'].apply(lambda text: len(text) > 1024)]

# The date range is also used later to map predictions back to dates.
earliest_date = df['date'].min()
latest_date = df['date'].max()

df = df.assign(
    # Keep only the first MAX_WORDS words of every document.
    text=df['text'].apply(lambda text: ' '.join(text.split()[:MAX_WORDS])),
    # Linearly rescale dates to [0, 1]; one vectorised call instead of a
    # Python-level call per row.
    labels=interp(df['date'].to_numpy(), [earliest_date, latest_date], [0, 1]),
)

# Split data into train and test subsets.
# Cap the sample size so datasets with fewer than SAMPLE_SIZE rows do not
# make DataFrame.sample raise ValueError.
df_small = df.sample(min(SAMPLE_SIZE, len(df)), random_state=SEED)
df_train, df_test = train_test_split(df_small, test_size=0.2, random_state=SEED, shuffle=True)
df_train.head()

In [None]:
# Simple Transformers expects a pandas DataFrame for training and evaluation.
# With a header, it must contain a 'text' column (str) and a 'labels' column;
# without a header, the first column is the text and the second is the label.
# This is a regression task (model_args.regression = True, num_labels=1), so
# the labels are floats (dates rescaled to [0, 1]), not integer class ids.

# Show INFO-level progress overall, but silence the chattier 'transformers' logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)

cuda_available = torch.cuda.is_available()

model_args = ClassificationArgs()
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.regression = True
model_args.num_train_epochs = 4
# Fix the seed so that Restart & Run All reproduces the same fine-tuning run.
model_args.manual_seed = 42
#model_args.learning_rate = 1e-4
# NOTE(review): max_seq_length is left at the library default while texts are
# pre-truncated to 512 words upstream -- confirm the default is long enough.

# Create a ClassificationModel with a single regression output.
model = ClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=1,
    args=model_args,
    use_cuda=cuda_available,
)

# Train the model
model.train_model(df_train)

# Evaluate the model on the held-out split
result, model_outputs, wrong_predictions = model.eval_model(df_test)

In [None]:
def get_date_from_prediction(prediction, earliest=None, latest=None):
    """Map a normalised model output back to a calendar date (as an int).

    This is the inverse of the label scaling: 0 maps to the earliest date,
    1 maps to the latest date, and values in between are interpolated
    linearly. Predictions outside [0, 1] are clamped to the date range
    (the behaviour of ``numpy.interp``).

    Args:
        prediction: Scalar model output, nominally in [0, 1].
        earliest: Date corresponding to 0. Defaults to the notebook-level
            ``earliest_date``.
        latest: Date corresponding to 1. Defaults to the notebook-level
            ``latest_date``.

    Returns:
        int: The interpolated date, rounded to the nearest integer.
    """
    if earliest is None:
        earliest = earliest_date
    if latest is None:
        latest = latest_date

    date_value = interp(prediction, [0, 1], [earliest, latest])
    # Round instead of truncating: int() alone turns 1802.9999... into 1802,
    # so exact labels would not round-trip and predictions would be biased low.
    return int(round(float(date_value)))

In [None]:
# Report the metrics returned by eval_model.
print(result)

# Predict on the held-out texts and convert every output back to a date.
test_texts = df_test['text'].tolist()
predictions, raw_outputs = model.predict(test_texts)
prediction_dates = list(map(get_date_from_prediction, predictions))

# Print one line per test document: title, true date, predicted date.
true_dates = df_test['date'].tolist()
for title, true_date, predicted_date in zip(df_test['title'].tolist(), true_dates, prediction_dates):
    print(f"Title: {title}, Date: {true_date}, Predicted Date: {predicted_date}")

# Scatter plot of true vs. predicted dates.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(true_dates, prediction_dates, color='blue', alpha=0.5)
ax.set_title('Real vs Predicted Publish Dates')
ax.set_xlabel('Real Publish Date')
ax.set_ylabel('Predicted Date')
ax.grid(True)
plt.show()